In [1]:
import os
import glob
import re
import pandas as pd
import datetime as dt
import random
import numpy as np
from sklearn.datasets import make_regression
from sklearn.isotonic import IsotonicRegression
import copy

# Note: I put lmafit inside src/ to make things easier for right now
from lmafit import lmafit_mc_adp

In [2]:
# import all the data
cwd = os.getcwd()
par = os.path.join(cwd, os.pardir)
par = os.path.abspath(par)
parpar = os.path.join(par, os.pardir)
parpar = os.path.abspath(parpar)
nyt_datapath = os.path.join(par, 'UniversityCases', '')
big10_datapath = os.path.join(parpar, 'college-covid19-dataset', 'data', '')

fnames = sorted(glob.glob(nyt_datapath+'*.csv'))
frames = []
for f in fnames:
    #m = re.search(r'[^0-9]*([0-9][0-9_]+)[^0-9]*', f)
    df = pd.read_csv(f)
    
    #df['Date'] = pd.to_datetime(m.group(1), format='%m_%d_%y')
    df['Date'] = pd.to_datetime(df['Date'], format='%m-%d-%Y')
    
    df.drop(['Unnamed: 0'], axis = 1, inplace=True)
    df['Cases'] = df['Cases'].apply(lambda x: x.replace(',', '')).astype('int')
    frames.append(df)
    
nyt_df = pd.concat(frames)

big10_df = pd.read_csv(os.path.join(big10_datapath, 'daily.csv'))
old_cols = big10_df.columns.values.copy()
old_cols[0] = 'School'
big10_df.columns = old_cols
big10_df['Date'] = pd.to_datetime(big10_df['Date'],format='%Y-%m-%d')

In [3]:
nyt_df = nyt_df.sort_values('Date')
big10_df = big10_df.sort_values('Date')

In [4]:
nyt_df = nyt_df.drop_duplicates(subset = ['School','Cases'])

### Combine NYT and Big 10

In [5]:
# find list of schools
schools = list(pd.unique(nyt_df.School))
# NOTE: something I noticed is that some of the schools have less data points than the others. Not sure why.

# Create an index with dates between september 8 (start of NYT collection) and today
start_day = dt.datetime(2020, 9, 8)
indexer = {}
c = 0
while start_day < dt.datetime.today():
    indexer[c] = start_day
    start_day = start_day + dt.timedelta(days = 1)
    c += 1

name_translator = {'University of Illinois Urbana-Champaign':'Illinois',
                    'Indiana University Bloomington':'Indiana', 
                    'University of Iowa':'Iowa',
                    'University of Maryland, College Park':'Maryland',
                    'Michigan State University':'Michigan State',
                    'University of Minnesota Twin Cities':'Minnesota',
                    'Northwestern University':'Northwestern',
                    'Ohio State University':'Ohio State',
                    'Penn State University':'Penn State',
                    'University of Wisconsin-Madison':'UW-Madison',
                    'University of Michigan':'Michigan',
                    'University of Nebraska-Lincoln':'Nebraska', 
                    'Purdue University':'Purdue',
                    'Rutgers University':'Rutgers'}

for i in name_translator.keys():
    schools.remove(i)

In [6]:
strindex = [dt.datetime.strftime(indexer[i], '%Y-%m-%d') for i in indexer]
date_index = {indexer[j]:j for j in range(len(indexer))}
data_dict = {}

# Add all the big 10 schools
for i in name_translator.values():
    cases = [0 for i in range(len(indexer))]
    school = big10_df.loc[big10_df.School == i]
    for j in school.Date:
        if j >= dt.datetime(2020,9,8):
            to_index = date_index[j]
            cases[to_index] = school.loc[school.Date == j].Confirmed.iloc[0]
        
    data_dict[i] = cases


# Add all the NYT
for i in schools:
    cases = [0 for i in range(len(indexer))]
    school = nyt_df.loc[nyt_df.School == i]
    for j in school.Date:
        to_index = date_index[j]
        cases[to_index] = school.loc[school.Date == j].Cases.iloc[0]
        
    data_dict[i] = cases
    

### Drop all Zero columns

In [7]:
incomplete_matr = pd.DataFrame.from_dict(data_dict)
no_zero = incomplete_matr.loc[(incomplete_matr!=0).any(axis=1)]
dates_used = no_zero.index
schools_used = no_zero.columns

# Transpose to make things easier
arr = no_zero.T.values.tolist()

### Randomly hide some of the dates/schools

In [8]:
def hide_date(matrix, percent):
    # returns a copy of the original matrix with x percent of the dates hidden
    mat = copy.deepcopy(matrix)
    num_hide = int(len(mat[0])*percent)

    date_ind = range(len(mat[0]))
    to_hide = random.sample(date_ind, num_hide)
    to_hide.sort(reverse = True)
    
    for s in mat:
        for t in to_hide:
            s.pop(t)
    
    return mat

def hide_school(matrix, percent):
    # returns a copy of the original matrix with x percent of the schools hidden
    mat = copy.deepcopy(matrix)
    num_hide = int(len(mat)*percent)
    
    school_ind = range(len(mat))
    to_hide = random.sample(school_ind, num_hide)
    to_hide.sort(reverse = True)
    
    for t in to_hide:
        mat.pop(t)
    
    return mat
    

In [9]:
#arr = hide_date(arr, 0.5)
#arr = hide_school(arr, 0.2)

### Test if the matrix is row increasing 

In [10]:
def is_inc(matrix, printy=True):
    # matrix is a 2d array
    # matrix[0] is the first school
    # NOTE: That is the transpose of what I'm using elsewhere
    #       just to make things easier
    
    non_inc = {}
    for i in range(len(matrix)):
        last = matrix[i][0]
        spots = []
        for j in range(len(matrix[0])):
            if matrix[i][j] != 0 and not np.isnan(matrix[i][j]):
                if matrix[i][j] < last:
                    spots.append(j)
                last = matrix[i][j]
        
        if len(spots) != 0:
            non_inc[i] = spots
        
    if printy == True:
        print(str(len(non_inc)) + " schools are non increasing in at least one spot")
    return non_inc

In [11]:
is_inc(arr)
print('rank ' + str(np.linalg.matrix_rank(arr)))


111 schools are non increasing in at least one spot
rank 27


In [23]:
incr = iso(arr)

### Run through lmafit

In [24]:
# make the arrays needed for lmafit

known_seq = [[],[]]
known_values = []
for i in range(len(incr)):
    for j in range(len(incr[0])):
        if arr[i][j] != 0:
            known_seq[0].append(i)
            known_seq[1].append(j)
            known_values.append(incr[i][j])

known_indices = [tuple(known_seq[0]), tuple(known_seq[1])]
known_values = [tuple(known_values)]

In [28]:
X,Y,out = lmafit_mc_adp(len(incr),len(incr[0]),14,known_indices,known_values)

complete = np.dot(X,Y)
print(pd.DataFrame(complete).T)

  Z[Known] = data


            0            1            2           3            4     \
0    1846.074430  2019.545951  1747.557995  144.017057  1354.028678   
1    1888.048803  2018.692030  1758.848369  143.609443  1345.812773   
2    1921.879175  2017.738451  1834.312103  147.530537  1349.654074   
3    1952.904719  2025.076127  1822.455897  135.537587  1348.492512   
4    1967.378427  2018.735761  1869.967342  169.068657  1348.439185   
..           ...          ...          ...         ...          ...   
99   4500.834080   -37.833856  3126.228853  842.798885  3710.219712   
100  4511.777899   293.584568  2564.334689  856.586596  3708.881526   
101  4517.952251   -31.164256    32.919671  870.604583  3710.051710   
102  4525.881192   -63.125524  -424.816009  872.045317  3711.002260   
103  -909.337599   248.882101   519.210197  879.796749  3710.557629   

           5            6            7            8            9     ...  \
0    134.071163    66.923902  1889.308483  1054.426882   641.733219  ..

### Test for row increasing and do isotonic regression if not

In [29]:
non_inc = is_inc(complete)

1872 schools are non increasing in at least one spot


In [27]:
k = is_inc(incr)

0 schools are non increasing in at least one spot


In [15]:
def iso(matrix):
    # performs isotonic regression for every school
    tonic = copy.deepcopy(matrix) # returns a new isotonic matrix
    
    # dat dict tells me where things arent increasing
    dat_dict = is_inc(matrix, False)
    
    for i in dat_dict.keys():
        to_predict = dat_dict[i]
        leng = len(tonic[0]) - len(to_predict)
        initial_vals = list(tonic[i].copy())
        X = list(range(len(tonic[i])))

        # Use the increasing values to fit the model and then predict what the decreasing ones should be 
        iso = IsotonicRegression().fit(X,initial_vals)
        predictions = iso.predict(range(len(tonic[i])))
        
        # put everything back:
        tonic[i] = predictions
    
    return(tonic)

In [16]:
iso_result = iso(complete)

In [17]:
is_inc(iso_result)

0 schools are non increasing in at least one spot


{}

In [19]:
fin = pd.DataFrame(iso_result)
fin.index = incomplete_matr.columns
print(fin.T)

        Illinois      Indiana         Iowa  Maryland  Michigan State  \
0    1841.000000  2020.000000  1755.000000     140.0          1348.0   
1    1888.000000  2020.000000  1755.000000     142.0          1348.0   
2    1922.000000  2020.000000  1831.000000     170.0          1348.0   
3    1957.000000  2020.000000  1831.000000     175.0          1348.0   
4    1967.000000  2020.000000  1868.000000     175.0          1348.0   
..           ...          ...          ...       ...             ...   
99   4185.332496  3458.714294  2675.000598     844.0          3710.0   
100  4185.332496  3458.714294  2675.000598     847.0          3710.0   
101  4185.332496  3458.714294  2675.000598     873.0          3710.0   
102  4185.332496  3458.714294  2675.000598     878.0          3710.0   
103  4185.332496  3458.714294  2675.000598     886.0          3710.0   

      Minnesota  Northwestern   Ohio State   Penn State   UW-Madison  ...  \
0    128.000000     73.000000  1888.000000  1053.000000   