## Final Report 
### Imports

In [1]:
# Imports

# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patsy
import statsmodels.api as sm
import os 
from nltk.sentiment.vader import SentimentIntensityAnalyzer 


### Reading in the UC School csv files

In [2]:
# Read in Berekeley
df_ucb = pd.read_csv(os.path.join('Datasets', 'UC_Berkeley.csv'), 
                     dtype={'date': 'str', 'word_count':int, 'school':'str'})

# Read in Davis
df_ucd = pd.read_csv(os.path.join('Datasets', 'UC_Davis.csv'), 
                     dtype={'date': 'str', 'word_count':int, 'text':'str', 'school':'str'})

# Read in Irvine
#df_uci = pd.read_csv(os.path.join('Datasets', 'UC_Irvine.csv'), 
#                     dtype={'date': 'str', 'word_count':int, 'school':'str'})

# Read in Los Angeles
df_ucla = pd.read_csv(os.path.join('Datasets', 'UC_Los_Angeles.csv'), 
                      dtype={'date': 'str', 'word_count':int, 'school':'str'})

# Read in Merced
df_ucm = pd.read_csv(os.path.join('Datasets', 'UC_Merced.csv'), 
                     dtype={'date': 'str', 'word_count':int, 'content':'str', 'school':'str'})

# Read in Riverside
#df_ucr = pd.read_csv(os.path.join('Datasets', 'UC_Riverside.csv'), 
#                     dtype={'date': 'str', 'word_count':'str', 'school':'str'})

# Read in San Diego
df_ucsd = pd.read_csv(os.path.join('Datasets', 'UC_San_Diego.csv'), 
                      dtype={'date': 'str', 'word_count':'str', 'school':'str'})

# Read in Santa Barbra
df_ucsb = pd.read_csv(os.path.join('Datasets', 'UC_Santa_Barbara.csv'),
                      dtype={'date': 'str', 'word_count':int, 'school':'str'})

# Read in Santa Cruz
df_ucsc = pd.read_csv(os.path.join('Datasets', 'UC_Santa_Cruz.csv'), 
                      dtype={'date': 'str', 'word_count':int, 'content':'str', 'school':'str'})


### Take a look at UC Berkeley 

In [3]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucb['state'] = 'CA'

# Dropping the 'link' column associated with each update
df_ucb = df_ucb.drop(['link'], axis =1)

# Reordering the columns into a more readable format
df_ucb = df_ucb[['date','school','state','content','word_count']]

# Printing dataframe so we can view all entries
print(df_ucb.to_string())

           date    school state                                            content  word_count
0     18-May-20  Berkeley    CA  We are writing today to share news about budge...         397
1     16-May-20  Berkeley    CA  Members of the remarkable and resilient gradua...        1368
2     14-May-20  Berkeley    CA  A simulated Commencement 2020 will take place ...        2291
3     12-May-20  Berkeley    CA  Last Tuesday, our campus participated in #Givi...         887
4     12-May-20  Berkeley    CA  There are two major budget concerns Chancellor...         354
5      6-May-20  Berkeley    CA  We are writing today to provide an update on o...         680
6      6-May-20  Berkeley    CA  We understand that many faculty, staff, and st...         605
7      4-May-20  Berkeley    CA  Tomorrow is #GivingTuesdayNow, a new global da...         394
8   29-April-20  Berkeley    CA  The Bay Area’s shelter-in-place order has been...         247
9   23-April-20  Berkeley    CA  In response to co

### Take a look at UC Davis 

In [4]:
# Renaming text column to content
df_ucd = df_ucd.rename(columns = {'text':'content'})

# Adding a 'state' column and setting its default value to 'CA'
df_ucd['state'] = 'CA'

# Reordering the columns into a more readable format
df_ucd = df_ucd[['date','school','state','content']]

# Printing dataframe so we can view all entries
print(df_ucd.to_string())

         date school state                                            content
0   31-Jan-20  Davis    CA  Fight it like the flu\nUpdated 5 p.m. Jan. 31 ...
1    1-Feb-20  Davis    CA  Yolo County info page\nYolo County, in which t...
2    3-Feb-20  Davis    CA  Screening at airports\nUpdated 9 p.m. Feb. 3  ...
3    7-Feb-20  Davis    CA  No reports of coronavirus\nUpdated 11:30 a.m. ...
4   10-Feb-20  Davis    CA  UC Davis Live\nUpdated Feb. 10 Watch UC Davis ...
5   13-Feb-20  Davis    CA  Travel FAQ\nUpdated 11:30 p.m. Feb. 13  Global...
6   25-Feb-20  Davis    CA  New travel notices\nUpdated 6:55 p.m. Feb. 25 ...
7   27-Feb-20  Davis    CA  Level 3 countries off-limits\nUpdated 7 a.m. F...
8   27-Feb-20  Davis    CA  COVID-19 patient\nUpdated 7:50 a.m. Feb. 27  I...
9   27-Feb-20  Davis    CA  3 in isolation\nUpdated 3 p.m. Feb. 27  Chance...
10  27-Feb-20  Davis    CA  Media availability\nUpdated 4 p.m. Feb. 27  UC...
11  28-Feb-20  Davis    CA  Reject xenophobia\nUpdated 10 a.m. F

### Take a look at UC Irvine 

In [5]:
# Take a look at Irvine
df_uci['state'] = 'CA'
df_uci.head()

NameError: name 'df_uci' is not defined

### Take a look at UC Los Angeles 

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucla['state'] = 'CA'

# Reordering the columns into a more readable format
df_ucla = df_ucla[['date','school','state','word_count']]

# Printing dataframe so we can view all entries
print(df_ucla.to_string())

### Take a look at UC Merced 

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucm['state'] = 'CA'

# Reordering the columns into a more readable format
df_ucm = df_ucm[['date','school','state','content', 'word_count']]

# Printing dataframe so we can view all entries
print(df_ucm.to_string())

### Take a look at UC Riverside 

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucr['state'] = 'CA'
df_ucr.head()

### Take a look at UC San Diego 

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucsd['state'] = 'CA'

# Reordering the columns into a more readable format
df_ucsd = df_ucsd[['date','school','state','word_count']]

# Printing dataframe so we can view all entries
print(df_ucsd.to_string()) # You will notice NaNs at the bottom of the data frame

In [None]:
# Dropping the empty rows at the bottom of df_ucsd
df_ucsd = df_ucsd.dropna()

# Now that the 4 rows with missing data on the bottom with the NaN values have been removed 
# We can convert word_count column back into an integer
convert_dict = {'word_count': int}
df_ucsd = df_ucsd.astype(convert_dict)

# Check out our updated df_ucsd
print(df_ucsd.to_string()) 

### Take a look at UC Santa Barbara 

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucsb['state'] = 'CA'
df_ucsb = df_ucsb.drop(['link'], axis =1)

# Reordering the columns into a more readable format
df_ucsb = df_ucsb[['date','school','state','content', 'word_count']]

# Printing dataframe so we can view all entries
print(df_ucsb.to_string())

### Take a look at Santa Cruz

In [None]:
# Adding a 'state' column and setting its default value to 'CA'
df_ucsc['state'] = 'CA'

# Reordering the columns into a more readable format
df_ucsc = df_ucsc[['date','school','state','content', 'word_count']]

# Printing dataframe so we can view all entries
print(df_ucsc.to_string())

### Combining all of our UC data frames

In [None]:
# concatenating the dataframes together
df = pd.concat([df_ucsc, df_ucm, df_ucd, df_ucb, df_ucsd, df_ucsb, df_ucla], sort= True)
df = df[['date','school','state','content', 'word_count']]

# looking at the entire dataframe
print(df.to_string())

### Reading in Texas school csv files

In [None]:
# Read in Texas State University (UNICODEDECODE ERRROR!!!) GGEZ
#df_tsu = pd.read_csv(os.path.join('Datasets', 'Texas_State_University.csv'), 
#                     dtype={'date': 'str', 'word_count':int, 'school':'str'})