In [1]:
import os
import pandas as pd
import re

In [2]:
# Load every school's CSV from ../Datasets and stack the three columns we need
# into a single frame, `allSchools`.
dataset_dir = os.path.join('..', 'Datasets')
wanted_columns = ['school', 'date', 'content']

# Sorted so the row order of the combined frame does not depend on the
# filesystem's directory listing order.
files = sorted(os.listdir(dataset_dir))

frames = []  # one frame per readable file; concatenated once after the loop
for filename in files:
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(dataset_dir, filename)
    try:
        frame = pd.read_csv(filepath)[wanted_columns]
    except UnicodeDecodeError:
        # The default decoding failed on this file; retry it as latin-1.
        print('Error with: '.ljust(30) + filename)
        try:
            frame = pd.read_csv(filepath, encoding='latin-1')[wanted_columns]
        except Exception as err:
            print(("Encoding DID **NOT** Work for: ".ljust(30) + filename))
            print('    reason: ' + repr(err))
            continue
        print(("Encoding to Latin **DID** Work for: ".ljust(30) + filename))
    except Exception as err:
        # Not an encoding problem (e.g. a missing column or malformed CSV):
        # report why and skip the file instead of retrying with latin-1.
        print('Error with: '.ljust(30) + filename)
        print('    reason: ' + repr(err))
        continue
    frames.append(frame)

# Concatenate once; fall back to an empty frame with the expected columns
# when no file could be read. Per-file row indexes are kept as-is here.
allSchools = pd.concat(frames) if frames else pd.DataFrame(columns=wanted_columns)

Error with:                   Florida_State_Univeristy.csv
Encoding to Latin **DID** Work for: Florida_State_Univeristy.csv
Error with:                   UC_Irvine.csv
Encoding to Latin **DID** Work for: UC_Irvine.csv
Error with:                   UC_Riverside.csv
Encoding to Latin **DID** Work for: UC_Riverside.csv


In [3]:
# Replace the current row index with a fresh 0..n-1 range (old index discarded).
allSchools = allSchools.reset_index(drop=True)

### Drop rows with NA content *(only where we had videos)*

In [4]:
# Keep only the rows whose `content` value is present (not NA).
allSchools = allSchools.dropna(subset=['content'])

## Adding Quantitative Data
- Word count
    - Replaced \n with space, 
    - then replaced any spaces in groups of 2 or more with just 1 space
    - then split the content on space, and find its length
- Month number (for binning)
    - Had to clean the year, since some dates didn't include a year at all, some used '20', and others used '2020'
    - Had to make sure all the month names were abbreviated (converted the only case where one wasn't: 'April' to 'Apr')

## Adding Categorical Data
- State
    - Made lists of the schools from each state
    - Added column for each entry determining its state from these lists

In [5]:
def word_count(string):
    """Return the number of whitespace-separated words in ``string``.

    Newline runs are first replaced by a single space and runs of two or
    more spaces are collapsed to one; the result is then split on
    whitespace and the pieces are counted.

    Parameters
    ----------
    string : str
        Text to count words in.

    Returns
    -------
    int
        Number of words; 0 for an empty or whitespace-only string.
    """
    # Note: str.split() with no arguments already treats any run of
    # whitespace as one separator, so the substitutions do not alter the count.
    single_line = re.sub('\n{1,}', ' ', string)
    normalized = re.sub(' {2,}', ' ', single_line)
    words = normalized.split()
    return len(words)

In [6]:
# Add a `word_count` column computed from each row's `content` text.
allSchools = allSchools.assign(
    word_count=lambda df: df['content'].apply(word_count)
)
allSchools.head(3)

Unnamed: 0,school,date,content,word_count
0,Florida State University,28-Feb-20,"Dear FSU Family,\n\nFlorida State University c...",615
1,Florida State University,28-Feb-20,This message to all faculty and staff has been...,175
2,Florida State University,2-Mar-20,The spring semester programs at Florida State ...,150
3,Florida State University,3-Mar-20,"Dear Faculty, Staff and Students,\n\nI want to...",256
4,Florida State University,3-Mar-20,Florida State University is taking aggressive ...,448


In [7]:
# School labels grouped by state; used to look up the state for a given
# value of the `school` column.
cal_schools = [
    "Berkeley",
    "Davis",
    "Irvine",
    "Los Angeles",
    "Merced",
    "Riverside",
    "Santa Barbara",
    "Santa Cruz",
    "Diego",
]
tex_schools = [
    "Texas State University",
    "Austin",
    "Houston",
    "University of North Texas",
    "T_A&M_U",
]
fl_schools = [
    "Florida State University",
    "University of Central Florida",
    "Florida",
    "Miami",
    "USF",
]

In [8]:
def give_state(string):
    """Map a school label to its state code.

    Looks ``string`` up in ``cal_schools``, ``tex_schools`` and
    ``fl_schools`` (in that order) and returns ``'cal'``, ``'tex'`` or
    ``'fl'`` for the first list that contains it. The match is an exact
    list-membership test.

    If no list contains the label, ``'ERROR'`` is printed and the string
    ``'ERROR'`` is returned.
    """
    state_lists = (
        ('cal', cal_schools),
        ('tex', tex_schools),
        ('fl', fl_schools),
    )
    for state_code, schools in state_lists:
        if string in schools:
            return state_code

    # Unknown school label: flag it in the output and in the returned value.
    print('ERROR')
    return 'ERROR'

In [9]:
# Add a `state` column derived from each row's `school` label.
allSchools = allSchools.assign(
    state=lambda df: df['school'].apply(give_state)
)
allSchools.head(3)

Unnamed: 0,school,date,content,word_count,state
0,Florida State University,28-Feb-20,"Dear FSU Family,\n\nFlorida State University c...",615,fl
1,Florida State University,28-Feb-20,This message to all faculty and staff has been...,175,fl
2,Florida State University,2-Mar-20,The spring semester programs at Florida State ...,150,fl
3,Florida State University,3-Mar-20,"Dear Faculty, Staff and Students,\n\nI want to...",256,fl
4,Florida State University,3-Mar-20,Florida State University is taking aggressive ...,448,fl


In [10]:
def fix_year(string):
    """Normalize a date string so that it ends in the four-digit year 2020.

    Steps:
      * surrounding whitespace is stripped, so a padded value such as
        ``'28-Feb-20 '`` is still recognised as ending in ``'20'``;
      * ``'April'`` is shortened to ``'Apr'``;
      * a value already ending in ``'2020'`` is returned as is;
      * a value ending in ``'20'`` (two-digit year) gets ``'20'`` appended;
      * anything else is treated as having no year and gets ``'-2020'``
        appended.

    Parameters
    ----------
    string : str
        Raw date text, e.g. ``'28-Feb-20'``, ``'3-Mar'`` or ``'2-Mar-2020'``.

    Returns
    -------
    str
        The date text with a trailing ``2020``.
    """
    # Per the original author's note, 'April' is the only month name that
    # shows up unabbreviated in the data.
    cleaned = string.strip().replace('April', 'Apr')

    if cleaned.endswith('2020'):
        # Already carries the full year.
        return cleaned
    if cleaned.endswith('20'):
        # Two-digit year: only the last two digits are missing.
        return cleaned + '20'
    # No year information at all.
    return cleaned + '-2020'

In [11]:
# fixing the date format
allSchools = allSchools.assign(date = allSchools['date'].apply(fix_year))
# extracting the month from each
allSchools = allSchools.assign(month = pd.to_datetime(allSchools['date']).apply(lambda x: x.month))

In [16]:
# Sanity check: list the distinct month numbers that were extracted.
pd.unique(allSchools['month'])

array([2, 3, 4, 5, 1], dtype=int64)

In [18]:
# Replace the row index with a fresh 0..n-1 range; the old index is discarded.
allSchools = allSchools.reset_index(drop=True)

In [20]:
# Write the combined frame to 'allSchools.csv' (path is relative to the
# working directory); index=False leaves the row index out of the file.
allSchools.to_csv('allSchools.csv', index = False)