In [1]:
import os
import glob
import re
import pandas as pd
import datetime as dt
import random
import numpy as np


# Note: I put lmafit inside src/ to make things easier for right now
from lmafit import lmafit_mc_adp

### Generate testing matrix

In [2]:
# Positional index -> calendar day lookup, one entry per day starting on
# September 8, 2020 (start of NYT collection) and stopping once the current
# moment has been reached.
one_day = dt.timedelta(days=1)
indexer = {}
start_day = dt.datetime(2020, 9, 8)
c = 0
while dt.datetime.today() > start_day:
    indexer[c] = start_day
    c += 1
    # datetime objects are immutable, so this rebinds start_day to the next day
    start_day += one_day

In [3]:
# Create a matrix of 30 tester schools with random case numbers
# (strictly positive on day 0 and never decreasing from one day to the next).
# Rows are dates (one per entry of `indexer`), columns are schools.
# NOTE: I used the exponential random distribution for changes between days

# Dedicated, seeded generator: the synthetic values are reproducible on every
# Restart & Run All, and the global `random` state is left untouched.
rng = random.Random(2020)

# Here I'm creating different "spread rates" for each fake school.
# Each rate is the mean of that school's exponential day-to-day jump
# (expovariate takes 1/mean as its argument).
spread_rate = [rng.randrange(1, 30) for _ in range(30)]

matr = [[] for _ in range(len(indexer))]

# for each date
for i in range(len(indexer)):
    # for each school
    for j in range(30):
        if i == 0:
            # Day 0 gets a random starting number in [1, 50)
            matr[i].append(rng.randrange(1, 50))
        else:
            # All other dates must be >= the day before: draw uniformly from
            # [last_val, last_val + floor(daily_spread)]. The "+ 1" keeps the
            # randrange interval non-empty even when daily_spread < 1.
            daily_spread = rng.expovariate(1 / spread_rate[j])
            last_val = matr[i-1][j]
            matr[i].append(rng.randrange(last_val, int(last_val + daily_spread + 1)))

print(np.linalg.matrix_rank(matr))
print(pd.DataFrame(matr))

30
      0     1    2     3     4    5    6     7     8    9   ...    20    21  \
0     33    28   43    15    28   17   24    13    15   11  ...     3    24   
1     39    32   43    17    34   18   24    31    65   23  ...    39    24   
2     39    39   43    38    47   24   62    31    65   32  ...    41    36   
3     39    99   47    40    49   26   92    31    76   34  ...   143    36   
4     40   100   52    40    51   33   93    52    87   38  ...   153    45   
..   ...   ...  ...   ...   ...  ...  ...   ...   ...  ...  ...   ...   ...   
115  519  2044  272  1030  1875  610  904   980  1396  847  ...  1559  1232   
116  523  2055  273  1030  1875  613  905   980  1400  852  ...  1560  1242   
117  523  2080  274  1030  1941  621  940   999  1412  858  ...  1561  1249   
118  536  2112  279  1035  1944  641  957  1011  1420  865  ...  1562  1269   
119  544  2120  280  1066  1958  662  958  1014  1421  865  ...  1577  1269   

      22    23    24   25   26    27   28    29 

### Zero out the rows (dates) without an NYT update to simulate the NYT data

In [4]:
# dates NYT was updated
nyt_dates = ['2020-09-08', '2020-09-11', '2020-09-26', '2020-10-09', '2020-10-24', '2020-11-06', '2020-11-20', '2020-12-12']

# Same number of rows as matr: a row whose date is in nyt_dates is carried
# over as-is, every other row is replaced by a fresh list of 30 zeros.
# matr itself is not modified.
drop_matr = [
    day_row if indexer[day].strftime('%Y-%m-%d') in nyt_dates else [0]*30
    for day, day_row in enumerate(matr)
]

drop_rank = np.linalg.matrix_rank(drop_matr)
print('rank ' + str(drop_rank) + ' matrix')
print(pd.DataFrame(drop_matr))
len(drop_matr)

rank 8 matrix
     0   1   2   3   4   5   6   7   8   9   ...   20  21  22  23  24  25  26  \
0    33  28  43  15  28  17  24  13  15  11  ...    3  24   4  27  13  35  37   
1     0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
2     0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
3    39  99  47  40  49  26  92  31  76  34  ...  143  36  10  91  66  47  46   
4     0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
..   ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ..  ..  ..  ..  ..  ..   
115   0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
116   0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
117   0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
118   0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   
119   0   0   0   0   0   0   0   0   0   0  ...    0   0   0   0   0   0   0   

     27  28  

120

In [5]:
# make the arrays needed for lmafit

# Row positions whose calendar date is one of the NYT update dates.
nyt_rows = [
    i for i in range(len(drop_matr))
    if indexer[i].strftime('%Y-%m-%d') in nyt_dates
]

# known_seq[0] / known_seq[1] collect the row / column position of every entry
# in those rows; known_values collects the matching entries in the same
# row-by-row order.
known_seq = [[], []]
known_values = []
for i in nyt_rows:
    known_seq[0].extend([i] * 30)
    known_seq[1].extend(range(30))
    known_values.extend(drop_matr[i][j] for j in range(30))

# Pair of equal-length tuples: (row positions, column positions)
known_indices = [tuple(known_seq[0]), tuple(known_seq[1])]

In [6]:
# Run lmafit on the zeroed-out synthetic matrix.
# Arguments: row count, column count (30 schools), rank to fit (8, the rank
# printed for drop_matr), then the known positions and their values.
n_rows = len(drop_matr)
n_cols = 30
fit_rank = 8
X, Y, out = lmafit_mc_adp(n_rows, n_cols, fit_rank, known_indices, known_values)

# The completed matrix estimate is the product of the two returned factors.
complete = np.dot(X, Y)
print(pd.DataFrame(complete))

  Z[Known] = data


       0     1     2     3     4     5     6     7     8     9   ...     20  \
0    33.0  28.0  43.0  15.0  28.0  17.0  24.0  13.0  15.0  11.0  ...    3.0   
1     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
2     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
3    39.0  99.0  47.0  40.0  49.0  26.0  92.0  31.0  76.0  34.0  ...  143.0   
4     0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
..    ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...    ...   
115   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
116   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
117   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
118   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
119   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   

       21    22    23    24    25    26    27    28

### Try this with the actual data

In [7]:
# import all the data
# Paths are resolved relative to the notebook's working directory:
#   <parent>/UniversityCases/*.csv                        -> NYT files
#   <grandparent>/college-covid19-dataset/data/daily.csv  -> Big 10 file
cwd = os.getcwd()
par = os.path.abspath(os.path.join(cwd, os.pardir))
parpar = os.path.abspath(os.path.join(par, os.pardir))
# The trailing '' makes each path end with a separator
nyt_datapath = os.path.join(par, 'UniversityCases', '')
big10_datapath = os.path.join(parpar, 'college-covid19-dataset', 'data', '')

fnames = sorted(glob.glob(nyt_datapath+'*.csv'))
if not fnames:
    # Fail here with the offending path instead of letting pd.concat
    # complain about an empty list further down.
    raise FileNotFoundError('No NYT csv files found in ' + nyt_datapath)

frames = []
for f in fnames:
    df = pd.read_csv(f)
    df['Date'] = pd.to_datetime(df['Date'], format='%m-%d-%Y')

    # Drop the 'Unnamed: 0' column when the file has one; tolerate files without it.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # Cases may be text with thousands separators ("1,234"), or already numeric
    # when no value in the file contains a comma. Going through str handles both.
    df['Cases'] = df['Cases'].astype(str).str.replace(',', '', regex=False).astype('int')
    frames.append(df)

nyt_df = pd.concat(frames)

big10_df = pd.read_csv(os.path.join(big10_datapath, 'daily.csv'))
# Relabel the first column as 'School' so it can be filtered the same way as nyt_df.School
old_cols = big10_df.columns.values.copy()
old_cols[0] = 'School'
big10_df.columns = old_cols
big10_df['Date'] = pd.to_datetime(big10_df['Date'],format='%Y-%m-%d')

In [8]:
# Keep only the first row for each (School, Cases) pair; later rows that
# repeat the same case count for the same school are discarded.
nyt_df = nyt_df.drop_duplicates(subset=['School', 'Cases'], keep='first')

### Build the frame by removing duplicates and hiding 4 of the Big 10 schools for testing

In [9]:
# find list of schools (distinct values of the School column, in order of first appearance)
schools = nyt_df['School'].unique()
# NOTE: something I noticed is that some of the schools have fewer data points than the others. Not sure why.

# NYT school name -> school name used in big10_df
name_translator = {
    'University of Illinois Urbana-Champaign': 'Illinois',
    'Indiana University Bloomington': 'Indiana',
    'University of Iowa': 'Iowa',
    'University of Maryland, College Park': 'Maryland',
    'Michigan State University': 'Michigan State',
    'University of Minnesota Twin Cities': 'Minnesota',
    'Northwestern University': 'Northwestern',
    'Ohio State University': 'Ohio State',
    'Penn State University': 'Penn State',
    'University of Wisconsin-Madison': 'UW-Madison',
}

# Schools held out for testing (same name mapping as above)
hidden = {
    'University of Michigan': 'Michigan',
    'University of Nebraska-Lincoln': 'Nebraska',
    'Purdue University': 'Purdue',
    'Rutgers University': 'Rutgers',
}

In [10]:
# Build, for every school, its case count on each date of `indexer`
# (0 where no value is available for that date).
# Each school's rows are turned into a date -> value lookup once, instead of
# re-filtering the school's frame twice for every single date.

def _cases_by_date(rows, value_col, dates):
    """Return one value per entry of `dates` for a single school.

    rows      -- DataFrame slice for one school; needs a datetime 'Date' column
    value_col -- name of the column holding the case counts
    dates     -- iterable of 'YYYY-MM-DD' strings

    For a date that occurs in `rows` the value of its first occurrence is
    used; for a date that does not occur the result is 0.
    """
    first_per_date = rows.drop_duplicates(subset='Date', keep='first')
    lookup = dict(zip(first_per_date['Date'].dt.strftime('%Y-%m-%d'),
                      first_per_date[value_col].values))
    return [lookup.get(d, 0) for d in dates]


strindex = [dt.datetime.strftime(indexer[i], '%Y-%m-%d') for i in indexer]
data_dict = {}
for i in schools:
    if i in hidden:
        # Held-out school: every date is blanked
        cases = [0] * len(strindex)
    elif i in name_translator:
        # School covered by big10_df: use its 'Confirmed' column
        cases = _cases_by_date(big10_df.loc[big10_df.School == name_translator[i]],
                               'Confirmed', strindex)
    else:
        # Every other school: use the NYT 'Cases' column
        cases = _cases_by_date(nyt_df.loc[nyt_df.School == i], 'Cases', strindex)
    data_dict[i] = cases

In [15]:
# One column per school, one row per date
incomplete_matr = pd.DataFrame.from_dict(data_dict)
arr = incomplete_matr.values.tolist()
print('rank ' + str(np.linalg.matrix_rank(arr)))

# make the arrays needed for lmafit

# Every entry that differs from 0 is treated as known. np.nonzero walks the
# mask in row-major order, i.e. row by row, left to right within a row.
values_2d = incomplete_matr.values
hit_rows, hit_cols = np.nonzero(values_2d != 0)

known_seq = [hit_rows.tolist(), hit_cols.tolist()]
known_values = values_2d[hit_rows, hit_cols].tolist()

known_indices = [tuple(known_seq[0]), tuple(known_seq[1])]
# Wrapped as a one-element list holding a single tuple of all known values
known_values = [tuple(known_values)]

rank 22


In [16]:
# Run lmafit on the real-data matrix.
# Arguments: row count, column count, rank to fit (22, the rank printed for
# arr), then the known positions and their values.
n_rows = len(arr)
n_cols = len(arr[0])
fit_rank = 22
X, Y, out = lmafit_mc_adp(n_rows, n_cols, fit_rank, known_indices, known_values)

# The completed matrix estimate is the product of the two returned factors.
complete = np.dot(X, Y)
print(pd.DataFrame(complete))

  Z[Known] = data
  Res = data - Z[Known]
  Z[Known] = data + alf*Res


          0         1          2            3           4         5     \
0     8.000000  0.011449   2.000000  1074.000000    2.000000  0.022898   
1    -0.658441 -0.031028   0.796678    -9.427792   -0.527508 -0.062056   
2     5.663128  0.259158   0.434950   -38.160511    3.954616  0.518316   
3    17.000000 -0.081460   3.340867  1672.000000    5.851227 -0.162920   
4     0.443440  0.022420  -0.128539     2.363143   -0.392540  0.044840   
..         ...       ...        ...          ...         ...       ...   
115  41.000000  2.000000  35.000000  2134.000000  140.000000  4.000000   
116   0.000000  0.000000   0.000000     0.000000    0.000000  0.000000   
117   0.000000  0.000000   0.000000     0.000000    0.000000  0.000000   
118   0.000000  0.000000   0.000000     0.000000    0.000000  0.000000   
119   0.000000  0.000000   0.000000     0.000000    0.000000  0.000000   

           6          7          8          9     ...      1862      1863  \
0      1.020935  -0.046123   0.035