# Data Cleaning for Hubei Data

In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import os.path as osp
import os

In [30]:
cur_dir = osp.dirname(osp.abspath(osp.dirname("__file__")))
data_path = osp.join(cur_dir,'Data')
cov_path = osp.join(data_path,'covid.csv')

'/Users/joshuamorgan/Documents/CourseWork/Spring20/Regression/Regression_Final_Project'

In [33]:
def analyze(data):
    
    ## initialize predictor and predicted variable
    xcol = ['age', 'latitude', 'longitude', 'sex_female', 'country_Algeria', 'country_Australia', 'country_Brazil', 'country_France', 'country_Gambia', 'country_Germany', 'country_Ghana', 'country_Guyana', 'country_Italy', 'country_Japan', 'country_Malaysia', 'country_Nepal', 'country_Niger', 'country_Philippines', 'country_Romania', 'country_San Marino', 'country_Singapore', 'country_South Korea', 'country_Switzerland', 'country_Thailand', 'country_United States', 'country_Vietnam', 'country_Zimbabwe']
    x = data[xcol]
    y = data['outcome_D']
    
    ## split into testing and training data
    xtrain, xtest, ytrain, ytest  = train_test_split(x,y,test_size=.25)
    
    ## build logisitic regression from training data
    logit = LogisticRegression()
    logit.fit(xtrain,ytrain)
    
    ## evaluate on testing data
    ypred = logit.predict(xtest)
    print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logit.score(xtest, ytest)))
    
    ## confusion matrix
    cm = confusion_matrix(ytest, ypred)
    print(cm)
    
    ## get coefficients
    coeff = pd.DataFrame({"Feature":x.columns,"Coefficients":np.transpose(logit.coef_[0])})
    print(coeff)
    
    return 

In [37]:
def preprocess(data):
    
    ## restrict starting columns
    new_cols = ['age','sex','country','latitude','longitude','outcome']
    df = data[new_cols]
    df = df.dropna()

    ## clean age column
    counts = []
    for i,age in enumerate(df['age']):
        try:
            int(age)
            counts.append(age)
        except ValueError:
            continue
            
    df = df[(df['age'].isin(counts))]
    df['age'] = df['age'].astype(int)
    
    ## clean outcome column
    df['outcome'] = df['outcome'].apply(lambda x: adjust_outcome(x))
    
    ## encode categorical variables
    cat = ['sex','country','outcome']
    df = pd.get_dummies(df,columns=cat)
    
    ## drop reference cases
    ## sex reference case: male, 291 cases
    ## country refernce case: China, 136 cases
    ## outcome reference case: Survive episode, 289 cases
    rcs = ['sex_male','country_China']
    df = df.drop('sex_male',1)
    df = df.drop('country_China',1)
    df = df.drop('outcome_S',1)
   
    ## clean out in progress cases
    df = df[df['outcome_IP'] != 1]
    df = df.drop('outcome_IP',1)
    
    return df

In [38]:
def adjust_outcome(outcome):
    dead = ['death','died','Dead','Deceased','Died']
    IP = ['stable','Under treatment','Stable']
    lived = ['discharged','discharge','released from quarantine',
             'Discharged','recovered','Alive','Recovered']
    
    if outcome in dead:
        return 'D'
    if outcome in IP:
        return 'IP'
    if outcome in lived:
        return 'S'

In [39]:

def main(path):
    data = pd.read_csv(path)
    data = preprocess(data)
    data.to_csv('cleaned_covid.csv')
    analyze(data)

main(cov_path)

Accuracy of logistic regression classifier on test set: 0.90
[[62  9]
 [ 1 32]]
                  Feature  Coefficients
0                     age      0.071126
1                latitude      0.019066
2               longitude     -0.008999
3              sex_female     -0.567296
4         country_Algeria      0.311353
5       country_Australia     -0.433529
6          country_Brazil     -0.980979
7          country_France     -0.501097
8          country_Gambia      0.121299
9         country_Germany      0.000000
10          country_Ghana      0.000000
11         country_Guyana      0.356927
12          country_Italy     -0.119891
13          country_Japan     -0.459835
14       country_Malaysia     -0.579448
15          country_Nepal     -0.280341
16          country_Niger     -1.631355
17    country_Philippines      0.569287
18        country_Romania     -0.855956
19     country_San Marino      0.029325
20      country_Singapore     -2.403465
21    country_South Korea     -1.053825


