## Final Report 
### Imports

In [2]:
# Imports

# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import patsy
import statsmodels.api as sm
import os 
from nltk.sentiment.vader import SentimentIntensityAnalyzer 


### Reading in the UC School csv files

In [3]:
# Load every UC campus CSV from the 'Datasets' folder into its own dataframe.
def _read_campus_csv(filename, dtypes):
    """Read `filename` from the 'Datasets' folder using the given column dtypes."""
    return pd.read_csv(os.path.join('Datasets', filename), dtype=dtypes)


# Column types declared for most of the files
_base_dtypes = {'date': 'str', 'word_count': int, 'school': 'str'}

# Berkeley
df_ucb = _read_campus_csv('UC_Berkeley.csv', _base_dtypes)

# Davis (additionally declares a 'text' column)
df_ucd = _read_campus_csv('UC_Davis.csv', dict(_base_dtypes, text='str'))

# Irvine -- read is disabled, so df_uci is NOT created by this cell
#df_uci = _read_campus_csv('UC_Irvine.csv', _base_dtypes)

# Los Angeles
df_ucla = _read_campus_csv('UC_Los_Angeles.csv', _base_dtypes)

# Merced (additionally declares a 'content' column)
df_ucm = _read_campus_csv('UC_Merced.csv', dict(_base_dtypes, content='str'))

# Riverside -- read is disabled, so df_ucr is NOT created by this cell
#df_ucr = _read_campus_csv('UC_Riverside.csv', dict(_base_dtypes, word_count='str'))

# San Diego: word_count is read as a string here; a later cell drops the
# rows with missing values and converts the column to int
df_ucsd = _read_campus_csv('UC_San_Diego.csv', dict(_base_dtypes, word_count='str'))

# Santa Barbara
df_ucsb = _read_campus_csv('UC_Santa_Barbara.csv', _base_dtypes)

# Santa Cruz (additionally declares a 'content' column)
df_ucsc = _read_campus_csv('UC_Santa_Cruz.csv', dict(_base_dtypes, content='str'))


### Take a look at UC Berkeley 

In [4]:
# Tidy the Berkeley frame in one pass:
#   * add a 'state' column with the default value 'CA'
#   * drop the 'link' column associated with each update
#   * reorder the remaining columns into a more readable format
# errors='ignore' keeps this cell re-runnable: after the first run 'link' is
# already gone and a plain drop() would raise KeyError.
df_ucb = (
    df_ucb
    .assign(state='CA')
    .drop(columns=['link'], errors='ignore')
    .loc[:, ['date', 'school', 'state', 'content', 'word_count']]
)

# Printing dataframe so we can view all entries
print(df_ucb.to_string())

           date    school state                                            content  word_count
0     18-May-20  Berkeley    CA  We are writing today to share news about budge...         397
1     16-May-20  Berkeley    CA  Members of the remarkable and resilient gradua...        1368
2     14-May-20  Berkeley    CA  A simulated Commencement 2020 will take place ...        2291
3     12-May-20  Berkeley    CA  Last Tuesday, our campus participated in #Givi...         887
4     12-May-20  Berkeley    CA  There are two major budget concerns Chancellor...         354
5      6-May-20  Berkeley    CA  We are writing today to provide an update on o...         680
6      6-May-20  Berkeley    CA  We understand that many faculty, staff, and st...         605
7      4-May-20  Berkeley    CA  Tomorrow is #GivingTuesdayNow, a new global da...         394
8   29-April-20  Berkeley    CA  The Bay Area’s shelter-in-place order has been...         247
9   23-April-20  Berkeley    CA  In response to co

### Take a look at UC Davis 

In [5]:
# Rename the 'text' column to 'content' (the name the other campuses use) and
# add a 'state' column with the default value 'CA'
df_ucd = df_ucd.rename(columns={'text': 'content'}).assign(state='CA')

# Reorder the columns into a more readable format.
# 'word_count' is kept whenever the CSV provides it: the loading cell declares a
# dtype for it, and leaving it out gives the Davis rows NaN word counts once the
# campuses are concatenated (which also turns the combined column into floats).
# NOTE(review): if UC_Davis.csv has no word_count column this behaves exactly
# as before -- confirm against the file.
ucd_columns = ['date', 'school', 'state', 'content']
if 'word_count' in df_ucd.columns:
    ucd_columns.append('word_count')
df_ucd = df_ucd[ucd_columns]

# Printing dataframe so we can view all entries
print(df_ucd.to_string())

         date school state                                            content
0   31-Jan-20  Davis    CA  Fight it like the flu\nUpdated 5 p.m. Jan. 31 ...
1    1-Feb-20  Davis    CA  Yolo County info page\nYolo County, in which t...
2    3-Feb-20  Davis    CA  Screening at airports\nUpdated 9 p.m. Feb. 3  ...
3    7-Feb-20  Davis    CA  No reports of coronavirus\nUpdated 11:30 a.m. ...
4   10-Feb-20  Davis    CA  UC Davis Live\nUpdated Feb. 10 Watch UC Davis ...
5   13-Feb-20  Davis    CA  Travel FAQ\nUpdated 11:30 p.m. Feb. 13  Global...
6   25-Feb-20  Davis    CA  New travel notices\nUpdated 6:55 p.m. Feb. 25 ...
7   27-Feb-20  Davis    CA  Level 3 countries off-limits\nUpdated 7 a.m. F...
8   27-Feb-20  Davis    CA  COVID-19 patient\nUpdated 7:50 a.m. Feb. 27  I...
9   27-Feb-20  Davis    CA  3 in isolation\nUpdated 3 p.m. Feb. 27  Chance...
10  27-Feb-20  Davis    CA  Media availability\nUpdated 4 p.m. Feb. 27  UC...
11  28-Feb-20  Davis    CA  Reject xenophobia\nUpdated 10 a.m. F

### Take a look at UC Irvine 

In [None]:
# Take a look at Irvine
# The read of UC_Irvine.csv is commented out in the loading cell, so df_uci may
# not exist. Guard on that so "Restart & Run All" does not stop here with a
# NameError; once the read is re-enabled this cell works unchanged.
if 'df_uci' in globals():
    # Adding a 'state' column and setting its default value to 'CA'
    df_uci['state'] = 'CA'
    # display() is needed because an expression inside an if-block is not auto-rendered
    display(df_uci.head())
else:
    print("df_uci is not loaded (the UC_Irvine.csv read is disabled) -- skipping.")

### Take a look at UC Los Angeles 

In [6]:
# Tag every row with the default state 'CA', then keep only
# date / school / state / word_count, in that order
df_ucla = df_ucla.assign(state='CA')[['date', 'school', 'state', 'word_count']]

# Print the complete frame so all entries can be inspected
print(df_ucla.to_string())

         date       school state  word_count
0   23-Jan-20  Los Angeles    CA         318
1   31-Jan-20  Los Angeles    CA         434
2    4-Feb-20  Los Angeles    CA         402
3   20-Feb-20  Los Angeles    CA         370
4   27-Feb-20  Los Angeles    CA         220
5    2-Mar-20  Los Angeles    CA         402
6    4-Mar-20  Los Angeles    CA         635
7    5-Mar-20  Los Angeles    CA         223
8    6-Mar-20  Los Angeles    CA         495
9    6-Mar-20  Los Angeles    CA         241
10   6-Mar-20  Los Angeles    CA          81
11   7-Mar-20  Los Angeles    CA         468
12  10-Mar-20  Los Angeles    CA         759
13  12-Mar-20  Los Angeles    CA         401
14  12-Mar-20  Los Angeles    CA        1260
15  13-Mar-20  Los Angeles    CA         203
16  13-Mar-20  Los Angeles    CA        1727
17  14-Mar-20  Los Angeles    CA        2111
18  16-Mar-20  Los Angeles    CA         335
19  16-Mar-20  Los Angeles    CA         443
20  17-Mar-20  Los Angeles    CA         850
21  17-Mar

### Take a look at UC Merced 

In [7]:
# Add the default 'CA' state and put the columns in the common readable order
df_ucm = (
    df_ucm
    .assign(state='CA')
    .loc[:, ['date', 'school', 'state', 'content', 'word_count']]
)

# Print the complete frame so all entries can be inspected
print(df_ucm.to_string())

         date  school state                                            content  word_count
0    4-Feb-20  Merced    CA  Coronavirus Update\nFebruary 4, 2020\nInformat...         190
1   28-Feb-20  Merced    CA  UPDATE: What You Need to Know About COVID-19\n...         457
2    6-Mar-20  Merced    CA  COVID-19: Protect Yourself and Each Other\nMar...         468
3    8-Mar-20  Merced    CA  COVID-19: University Travel Guidance\nMarch 8,...         411
4    9-Mar-20  Merced    CA  COVID-19 Instructional Continuity\nMarch 9, 20...         489
5   10-Mar-20  Merced    CA  COVID-19: Update on Academic Operations\nMarch...         191
6   10-Mar-20  Merced    CA  COVID-19 and Research Continuity at UC Merced\...         650
7   10-Mar-20  Merced    CA  COVID-19 Live Webinar Tomorrow\nMarch 10, 2020...         309
8   10-Mar-20  Merced    CA  UPDATE: Moving Toward Remote Learning\nMarch 1...         374
9   11-Mar-20  Merced    CA  UPDATE: Moving Toward Remote Learning - Studen...         457

### Take a look at UC Riverside 

In [None]:
# Take a look at Riverside
# The read of UC_Riverside.csv is commented out in the loading cell, so df_ucr
# may not exist. Guard on that so "Restart & Run All" does not stop here with a
# NameError; once the read is re-enabled this cell works unchanged.
if 'df_ucr' in globals():
    # Adding a 'state' column and setting its default value to 'CA'
    df_ucr['state'] = 'CA'
    # display() is needed because an expression inside an if-block is not auto-rendered
    display(df_ucr.head())
else:
    print("df_ucr is not loaded (the UC_Riverside.csv read is disabled) -- skipping.")

### Take a look at UC San Diego 

In [8]:
# Tag every row with the default state 'CA', then keep only
# date / school / state / word_count, in that order
df_ucsd = df_ucsd.assign(state='CA').loc[:, ['date', 'school', 'state', 'word_count']]

# Print the complete frame -- note the NaNs at the bottom of the data frame
print(df_ucsd.to_string())

         date school state word_count
0   22-Jan-20  Diego    CA        285
1   23-Jan-20  Diego    CA        507
2   27-Jan-20  Diego    CA        807
3   30-Jan-20  Diego    CA        280
4   30-Jan-20  Diego    CA        873
5   31-Jan-20  Diego    CA       1393
6    3-Feb-20  Diego    CA        799
7    5-Feb-20  Diego    CA       1485
8   14-Feb-20  Diego    CA        355
9   26-Feb-20  Diego    CA        410
10  26-Feb-20  Diego    CA        490
11  28-Feb-20  Diego    CA        694
12   4-Mar-20  Diego    CA        612
13   5-Mar-20  Diego    CA        566
14   8-Mar-20  Diego    CA        572
15   9-Mar-20  Diego    CA         59
16   9-Mar-20  Diego    CA        902
17  11-Mar-20  Diego    CA        264
18  12-Mar-20  Diego    CA        335
19  13-Mar-20  Diego    CA       1379
20  13-Mar-20  Diego    CA        348
21  13-Mar-20  Diego    CA        592
22  16-Mar-20  Diego    CA        252
23  16-Mar-20  Diego    CA        482
24  16-Mar-20  Diego    CA        843
25  17-Mar-2

In [9]:
# Remove the rows that contain missing values (the empty rows at the bottom of
# df_ucsd); with the NaNs gone, word_count can be cast from string to integer.
df_ucsd = df_ucsd.dropna().astype({'word_count': int})

# Check out our updated df_ucsd
print(df_ucsd.to_string())

         date school state  word_count
0   22-Jan-20  Diego    CA         285
1   23-Jan-20  Diego    CA         507
2   27-Jan-20  Diego    CA         807
3   30-Jan-20  Diego    CA         280
4   30-Jan-20  Diego    CA         873
5   31-Jan-20  Diego    CA        1393
6    3-Feb-20  Diego    CA         799
7    5-Feb-20  Diego    CA        1485
8   14-Feb-20  Diego    CA         355
9   26-Feb-20  Diego    CA         410
10  26-Feb-20  Diego    CA         490
11  28-Feb-20  Diego    CA         694
12   4-Mar-20  Diego    CA         612
13   5-Mar-20  Diego    CA         566
14   8-Mar-20  Diego    CA         572
15   9-Mar-20  Diego    CA          59
16   9-Mar-20  Diego    CA         902
17  11-Mar-20  Diego    CA         264
18  12-Mar-20  Diego    CA         335
19  13-Mar-20  Diego    CA        1379
20  13-Mar-20  Diego    CA         348
21  13-Mar-20  Diego    CA         592
22  16-Mar-20  Diego    CA         252
23  16-Mar-20  Diego    CA         482
24  16-Mar-20  Diego    C

### Take a look at UC Santa Barbara 

In [10]:
# Tidy the Santa Barbara frame in one pass:
#   * add a 'state' column with the default value 'CA'
#   * drop the 'link' column
#   * reorder the remaining columns into a more readable format
# errors='ignore' keeps this cell re-runnable: after the first run 'link' is
# already gone and a plain drop() would raise KeyError.
df_ucsb = (
    df_ucsb
    .assign(state='CA')
    .drop(columns=['link'], errors='ignore')
    .loc[:, ['date', 'school', 'state', 'content', 'word_count']]
)

# Printing dataframe so we can view all entries
print(df_ucsb.to_string())

         date         school state                                            content  word_count
0   14-May-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nI...        1546
1   18-Apr-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nW...         498
2   13-Apr-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nW...         395
3   07-Apr-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nI...         484
4   02-Apr-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nI...          74
5   31-Mar-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nW...        1262
6   26-Mar-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nT...         603
7   19-Mar-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nT...         380
8   18-Mar-20  Santa Barbara    CA  Dear Members of Our Campus Community,\r\n\r\nI...         452
9   17-Mar-20  Santa

### Take a look at UC Santa Cruz

In [11]:
# Add the default 'CA' state and put the columns in the common readable order
df_ucsc = df_ucsc.assign(state='CA')[['date', 'school', 'state', 'content', 'word_count']]

# Print the complete frame so all entries can be inspected
print(df_ucsc.to_string())

         date      school state                                            content  word_count
0   24-Jan-20  Santa Cruz    CA  The UC Santa Cruz Student Health Center offers...         414
1   30-Jan-20  Santa Cruz    CA  UC Santa Cruz health officials are closely mon...         382
2   31-Jan-20  Santa Cruz    CA  Due to the dynamic situation with the coronavi...         338
3    4-Feb-20  Santa Cruz    CA  The UC Office of the President has directed th...        1276
4    4-Feb-20  Santa Cruz    CA  To: UC Santa Cruz Community\nFrom: Public Affa...         989
5    5-Feb-20  Santa Cruz    CA  UCSF is hosting a town hall today on the novel...         273
6    7-Feb-20  Santa Cruz    CA  The UC Santa Cruz Student Health Center shared...         700
7    7-Feb-20  Santa Cruz    CA  Students: How to get a flu vaccine\nFebruary 0...         359
8   10-Feb-20  Santa Cruz    CA  Campus community members can learn more about ...         178
9   11-Feb-20  Santa Cruz    CA  Seven confirmed c

### Combining all of our UC data frames

In [12]:
# Concatenating the per-campus dataframes together.
# The frames do not all carry the same columns (e.g. df_ucla and df_ucsd have no
# 'content'), so pandas aligns on the union of columns and fills the gaps with
# NaN. sort=False states the column-ordering behaviour explicitly, which removes
# the FutureWarning about the default changing; the final order is set by the
# selection on the next line either way.
# NOTE(review): ignore_index is not passed, so each campus keeps its own row
# labels and they repeat in the combined frame -- confirm that is intended.
df = pd.concat([df_ucsc, df_ucm, df_ucd, df_ucb, df_ucsd, df_ucsb, df_ucla], sort=False)
df = df[['date', 'school', 'state', 'content', 'word_count']]

# looking at the entire dataframe
print(df.to_string())

           date         school state                                            content  word_count
0     24-Jan-20     Santa Cruz    CA  The UC Santa Cruz Student Health Center offers...       414.0
1     30-Jan-20     Santa Cruz    CA  UC Santa Cruz health officials are closely mon...       382.0
2     31-Jan-20     Santa Cruz    CA  Due to the dynamic situation with the coronavi...       338.0
3      4-Feb-20     Santa Cruz    CA  The UC Office of the President has directed th...      1276.0
4      4-Feb-20     Santa Cruz    CA  To: UC Santa Cruz Community\nFrom: Public Affa...       989.0
5      5-Feb-20     Santa Cruz    CA  UCSF is hosting a town hall today on the novel...       273.0
6      7-Feb-20     Santa Cruz    CA  The UC Santa Cruz Student Health Center shared...       700.0
7      7-Feb-20     Santa Cruz    CA  Students: How to get a flu vaccine\nFebruary 0...       359.0
8     10-Feb-20     Santa Cruz    CA  Campus community members can learn more about ...       178.0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


### Reading in Texas school csv files

In [13]:
# Read in Texas State University
# FIXME: this read is disabled because decoding the file as UTF-8 (the
# pd.read_csv default) fails with:
#   UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 688
# The file is presumably saved in a non-UTF-8 encoding; passing the matching
# `encoding=` argument to pd.read_csv should allow it to load -- TODO confirm
# which encoding the file actually uses before re-enabling.
#df_tsu = pd.read_csv(os.path.join('Datasets', 'Texas_State_University.csv'), 
#                     dtype={'date': 'str', 'word_count':int, 'school':'str'})

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 688: invalid continuation byte