In [214]:
from scipy import stats
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score,precision_score,recall_score



class ModelSummary:
    """ This class extracts a summary of the model
    
    Methods
    -------
    get_se()
        computes standard error
    get_ci(SE_est)
        computes confidence intervals
    get_pvals()
        computes p-values
    get_summary(name=None)
        prints the summary of the model
    """
    
    def __init__(self, clf, X, y):
        """
        Parameters
        ----------
        clf: class
            the classifier object model
        X: pandas Dataframe
            matrix of predictors
        y: numpy array
            matrix of variable
        """
        self.clf = clf
        self.X = X
        self.y = y
        pass
    
    def get_se(self):
        # from here https://stats.stackexchange.com/questions/89484/how-to-compute-the-standard-errors-of-a-logistic-regressions-coefficients
        predProbs = self.clf.predict_proba(self.X)
        X_design = np.hstack([np.ones((self.X.shape[0], 1)), self.X])
        V = np.diagflat(np.product(predProbs, axis=1))
        covLogit = np.linalg.inv(np.dot(np.dot(X_design.T, V), X_design))
        return np.sqrt(np.diag(covLogit))

    def get_ci(self, SE_est):
        """
        Parameters
        ----------
        SE_est: numpy array
            matrix of standard error estimations
        """
        p = 0.975
        df = len(self.X) - 2
        crit_t_value = stats.t.ppf(p, df)
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        upper = coefs + (crit_t_value * SE_est)
        lower = coefs - (crit_t_value * SE_est)
        cis = np.zeros((len(coefs), 2))
        cis[:,0] = lower
        cis[:,1] = upper
        return cis
    
    def get_pvals(self):
        # from here https://stackoverflow.com/questions/25122999/scikit-learn-how-to-check-coefficients-significance
        p = self.clf.predict_proba(self.X)
        n = len(p)
        m = len(self.clf.coef_[0]) + 1
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        se = self.get_se()
        t =  coefs/se  
        p = (1 - stats.norm.cdf(abs(t))) * 2
        return p
    
    def get_summary(self, names=None):
        ses = self.get_se()
        cis = self.get_ci(ses)
        lower = cis[:, 0]
        upper = cis[:, 1]
        pvals = self.get_pvals()
        coefs = np.concatenate([self.clf.intercept_, self.clf.coef_[0]])
        data = []
        for i in range(len(coefs)):
            currlist = []
            currlist.append(np.round(coefs[i], 3))
            currlist.append(np.round(ses[i], 3))
            currlist.append(np.round(pvals[i], 3))
            currlist.append(np.round(lower[i], 3))
            currlist.append(np.round(upper[i], 3))
            data.append(currlist)
        cols = ['coefficient', 'std', 'p-value', '[0.025', '0.975]']
        sumdf = pd.DataFrame(columns=cols, data=data)
        if names is not None:
            new_names = ['intercept']*(len(names) + 1)
            new_names[1:] = [i for i in names]
            sumdf.index = new_names
        else:
            try:
                names = list(self.X.columns)
                new_names = ['intercept']*(len(names) + 1)
                new_names[1:] = [i for i in names]
                sumdf.index = new_names
            except:
                pass
        print(sumdf)
        acc = (accuracy_score(self.y, self.clf.predict(self.X)) + recall_score(self.y, self.clf.predict(self.X)))/2
        #acc = accuracy_score(self.y, self.clf.predict(self.X))
        confmat = confusion_matrix(self.y, self.clf.predict(self.X))
        print('-'*60)
        print('Confusion Matrix (total:{}) \t Accuracy: \t  {}'.format(len(self.X),np.round(acc, 3)))
        print('  TP: {} | FN: {}'.format(confmat[1][1],confmat[1][0]))
        print('  FP: {} | TN: {}'.format(confmat[0][1],confmat[0][0]))





In [198]:
#RANDOM SELECT




import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd



validation = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('TrainBalanced.csv')


validation = validation.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)
test = test.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)


 
X_train_unstandard = train.drop(columns='ADeath')
X_train=(X_train_unstandard-X_train_unstandard.mean())/X_train_unstandard.std()
y_train = train['ADeath']

X_val_unstandard = validation.drop(columns='ADeath')
X_val= (X_val_unstandard - X_val_unstandard.mean())/X_val_unstandard.std()
y_val = validation['ADeath']

X_test_unstandard = test.drop(columns='ADeath')
X_test= (X_test_unstandard - X_test_unstandard.mean())/X_test_unstandard.std()
y_test = test['ADeath']


def my_rwd_selector(X_train, y_train, X_val, y_val):
    cols = list(X_train.columns)
    total = cols.copy()
    best_val_acc = 0
    selected_vars = []
    best = []
    p = 0
    for i in range(len(total)):
        random_vars = []
        random_vars.append(total[i])
        #print(random_vars)
        while len(cols) > 0:
            candidate = None
            for i in range(len(cols)):
                current_vars = selected_vars.copy()
                if cols[i] == random_vars:
                    break
                else:
                    current_vars.append(cols[i])
                    if len(current_vars) == 1:
                        new_X_train = X_train[current_vars].values.reshape(-1, 1)
                        new_X_val = X_val[current_vars].values.reshape(-1, 1)
                    else:
                        new_X_train = X_train[current_vars]                
                        new_X_val = X_val[current_vars]





                    mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
                    val_rec = recall_score(y_val, mod.predict(new_X_val))
                    val_accu = accuracy_score(y_val, mod.predict(new_X_val))
                    val_acc = (val_rec + val_accu)/2
                    if val_acc - best_val_acc > 0.002:
                        candidate = cols[i]
                        best_val_acc = val_acc
            if candidate is not None:
                selected_vars.append(candidate)
                cols.remove(candidate)
            else:
                break
            #print('Columns in current model: {}'.format(', '.join(selected_vars)))
            #print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
        if p < best_val_acc:
            p = best_val_acc
            best = selected_vars
               
    return best
 
selected_columns = my_rwd_selector(X_train, y_train, X_val, y_val)
print(selected_columns)





model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)





y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])


training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))

















['Native', 'OtherTransp']
Training recall is:    0.8807439824945296
Validation recall is:  0.8950276243093923
Validation accuracy is:  0.6611295681063123
precision accuracy is:  0.6612244897959184
training weighted accuracy is:  0.7270240700218819
validation weighted accuracy is:  0.7780785962078522
test weighted accuracy is:    0.6093548387096774


In [211]:
#RANDOM SELECT UNBALANCED

import warnings
warnings.filterwarnings("ignore")

 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()


def my_rwd_selector(X_train, y_train, X_val, y_val):
    cols = list(X_train.columns)
    total = cols.copy()
    best_val_acc = 0
    selected_vars = []
    best = []
    p = 0
    for i in range(len(total)):
        random_vars = []
        random_vars.append(total[i])
        #print(random_vars)
        while len(cols) > 0:
            candidate = None
            for i in range(len(cols)):
                current_vars = selected_vars.copy()
                if cols[i] == random_vars:
                    break
                else:
                    current_vars.append(cols[i])
                    if len(current_vars) == 1:
                        new_X_train = X_train[current_vars].values.reshape(-1, 1)
                        new_X_val = X_val[current_vars].values.reshape(-1, 1)
                    else:
                        new_X_train = X_train[current_vars]                
                        new_X_val = X_val[current_vars]





                    mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
                    val_rec = recall_score(y_val, mod.predict(new_X_val))
                    val_accu = accuracy_score(y_val, mod.predict(new_X_val))
                    val_acc = (val_rec + val_accu)/2
                    if val_acc - best_val_acc > 0.002:
                        candidate = cols[i]
                        best_val_acc = val_acc
            if candidate is not None:
                selected_vars.append(candidate)
                cols.remove(candidate)
            else:
                break
            #print('Columns in current model: {}'.format(', '.join(selected_vars)))
            #print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
        if p < best_val_acc:
            p = best_val_acc
            best = selected_vars
               
    return best
 
selected_columns = my_rwd_selector(X_train, y_train, X_val, y_val)
print(selected_columns)


model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)


y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])


training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))






['FamilyWork', 'Asian', 'Unemployment', 'Income', 'MeanCommute']
Training recall is:    0.8858294157152451
Validation recall is:  0.9558011049723757
Validation accuracy is:  0.7475083056478405
precision accuracy is:  0.7178423236514523
training weighted accuracy is:  0.7916454610827578
validation weighted accuracy is:  0.8516547053101081
test weighted accuracy is:    0.7962365591397849


In [201]:
#BACKWARD SELECT BALANCED



import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd



validation = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('TrainBalanced.csv')

print('The sizes for train, test, and validation should be {}'.format((len(train), len(validation), len(test))))


validation = validation.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)
test = test.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)


 
X_train_unstandard = train.drop(columns='ADeath')
X_train=(X_train_unstandard-X_train_unstandard.mean())/X_train_unstandard.std()
y_train = train['ADeath']

X_val_unstandard = validation.drop(columns='ADeath')
X_val= (X_val_unstandard - X_val_unstandard.mean())/X_val_unstandard.std()
y_val = validation['ADeath']

X_test_unstandard = test.drop(columns='ADeath')
X_test= (X_test_unstandard - X_test_unstandard.mean())/X_test_unstandard.std()
y_test = test['ADeath']



def my_bwd_selector(X_train, y_train, X_val, y_val):
    print('=============== Begining backwards selection =================')
    cols = list(X_train.columns)
    best_val_acc = 0
    selected_vars = ['TotalPop','Men','Women','Hispanic','White','Black','Native','Asian',
                         'Pacific','VotingAgeCitizen','Income','IncomeErr','IncomePerCap','IncomePerCapErr',  
                         'Poverty','ChildPoverty','Professional','Service','Office','Construction',
                         'Production','Drive','Carpool','Transit','Walk','OtherTransp','WorkAtHome',
                         'MeanCommute','Employed','PrivateWork','PublicWork','SelfEmployed','FamilyWork',
                         'Unemployment','Density per square mile of land area']
    #print(selected_vars)
    while len(cols) > 0:
        print('Trying {} var models'.format(len(selected_vars) + 1 ))
        candidate = None
        for i in range(len(cols)):
            current_vars = selected_vars.copy()
            #print(cols[i])
            current_vars.remove(cols[i])
            #print(current_vars)
            if len(current_vars) == 1:
                new_X_train = X_train[current_vars].values.reshape(-1, 1)
                new_X_val = X_val[current_vars].values.reshape(-1, 1)
            else:
                new_X_train = X_train[current_vars]                
                new_X_val = X_val[current_vars]
            
            mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
            val_rec = recall_score(y_val, mod.predict(new_X_val))
            val_accu = accuracy_score(y_val, mod.predict(new_X_val))
            val_acc = (val_rec + val_accu)/2
            if val_acc - best_val_acc > 0.002:
                candidate = cols[i]
                best_val_acc = val_acc
        if candidate is not None:
            selected_vars.remove(candidate)
            cols.remove(candidate)
            print('------- Removeing {} to the model ---------'.format(candidate))
        else:
            break
        print('Columns in current model: {}'.format(', '.join(selected_vars)))
        print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
        print(len(selected_vars))
    return selected_vars

 

selected_columns = my_bwd_selector(X_train, y_train, X_val, y_val)
model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)

 

y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])


training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))

The sizes for train, test, and validation should be (1828, 301, 300)
Trying 36 var models
------- Removeing Income to the model ---------
Columns in current model: TotalPop, Men, Women, Hispanic, White, Black, Native, Asian, Pacific, VotingAgeCitizen, IncomeErr, IncomePerCap, IncomePerCapErr, Poverty, ChildPoverty, Professional, Service, Office, Construction, Production, Drive, Carpool, Transit, Walk, OtherTransp, WorkAtHome, MeanCommute, Employed, PrivateWork, PublicWork, SelfEmployed, FamilyWork, Unemployment, Density per square mile of land area
Best validation accuracy is 0.746
34
Trying 35 var models
Training recall is:    0.7724288840262582
Validation recall is:  0.7403314917127072
Validation accuracy is:  0.7508305647840532
precision accuracy is:  0.8271604938271605
training weighted accuracy is:  0.7907549234135667
validation weighted accuracy is:  0.7455810282483801
test weighted accuracy is:    0.5994623655913978


In [202]:
#BACKWARDS FUCKNIG SELECTTTTTTTTTTTTTTTTTTTTT UNBALANCED


import warnings
warnings.filterwarnings("ignore")

 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()



def my_bwd_selector(X_train, y_train, X_val, y_val):
    print('=============== Begining backwards selection =================')
    cols = list(X_train.columns)
    best_val_acc = 0
    selected_vars = ['TotalPop','Men','Women','Hispanic','White','Black','Native','Asian',
                         'Pacific','VotingAgeCitizen','Income','IncomeErr','IncomePerCap','IncomePerCapErr',  
                         'Poverty','ChildPoverty','Professional','Service','Office','Construction',
                         'Production','Drive','Carpool','Transit','Walk','OtherTransp','WorkAtHome',
                         'MeanCommute','Employed','PrivateWork','PublicWork','SelfEmployed','FamilyWork',
                         'Unemployment','Density per square mile of land area']
    #print(selected_vars)
    while len(cols) > 0:
        print('Trying {} var models'.format(len(selected_vars) + 1 ))
        candidate = None
        for i in range(len(cols)):
            current_vars = selected_vars.copy()
            #print(cols[i])
            current_vars.remove(cols[i])
            #print(current_vars)
            if len(current_vars) == 1:
                new_X_train = X_train[current_vars].values.reshape(-1, 1)
                new_X_val = X_val[current_vars].values.reshape(-1, 1)
            else:
                new_X_train = X_train[current_vars]                
                new_X_val = X_val[current_vars]
            
            mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
            val_rec = recall_score(y_val, mod.predict(new_X_val))
            val_accu = accuracy_score(y_val, mod.predict(new_X_val))
            val_acc = (val_rec + val_accu)/2
            if val_acc - best_val_acc > 0.002:
                candidate = cols[i]
                best_val_acc = val_acc
        if candidate is not None:
            selected_vars.remove(candidate)
            cols.remove(candidate)
            print('------- Removeing {} to the model ---------'.format(candidate))
        else:
            break
        print('Columns in current model: {}'.format(', '.join(selected_vars)))
        print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
        print(len(selected_vars))
    return selected_vars

 

selected_columns = my_bwd_selector(X_train, y_train, X_val, y_val)
model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)

 


y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])


training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))

Trying 36 var models
------- Removeing Carpool to the model ---------
Columns in current model: TotalPop, Men, Women, Hispanic, White, Black, Native, Asian, Pacific, VotingAgeCitizen, Income, IncomeErr, IncomePerCap, IncomePerCapErr, Poverty, ChildPoverty, Professional, Service, Office, Construction, Production, Drive, Transit, Walk, OtherTransp, WorkAtHome, MeanCommute, Employed, PrivateWork, PublicWork, SelfEmployed, FamilyWork, Unemployment, Density per square mile of land area
Best validation accuracy is 0.838
34
Trying 35 var models
------- Removeing IncomeErr to the model ---------
Columns in current model: TotalPop, Men, Women, Hispanic, White, Black, Native, Asian, Pacific, VotingAgeCitizen, Income, IncomePerCap, IncomePerCapErr, Poverty, ChildPoverty, Professional, Service, Office, Construction, Production, Drive, Transit, Walk, OtherTransp, WorkAtHome, MeanCommute, Employed, PrivateWork, PublicWork, SelfEmployed, FamilyWork, Unemployment, Density per square mile of land area


In [205]:
#FORWAR SELECT BALANCED

import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd



validation = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('TrainBalanced.csv')


validation = validation.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)
test = test.drop(['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian'],1)


 
X_train_unstandard = train.drop(columns='ADeath')
X_train=(X_train_unstandard-X_train_unstandard.mean())/X_train_unstandard.std()
y_train = train['ADeath']

X_val_unstandard = validation.drop(columns='ADeath')
X_val= (X_val_unstandard - X_val_unstandard.mean())/X_val_unstandard.std()
y_val = validation['ADeath']

X_test_unstandard = test.drop(columns='ADeath')
X_test= (X_test_unstandard - X_test_unstandard.mean())/X_test_unstandard.std()
y_test = test['ADeath']

def my_fwd_selector(stand_X_train, y_train, stand_X_val, y_val):
    print('=============== Begining forward selection =================')
    cols = list(stand_X_train.columns)
    best_val_acc = 0
    selected_vars = []
    while len(cols) > 0:
        print('Trying {} var models'.format(len(selected_vars) + 1))
        candidate = None
        for i in range(len(cols)):
            current_vars = selected_vars.copy()
            current_vars.append(cols[i])
            if len(current_vars) == 1:
                new_X_train = stand_X_train[current_vars].values.reshape(-1, 1)
                new_X_val = stand_X_val[current_vars].values.reshape(-1, 1)
            else:
                new_X_train = stand_X_train[current_vars]                
                new_X_val = stand_X_val[current_vars]
            
            mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
            val_rec = recall_score(y_val, mod.predict(new_X_val))
            val_accu = accuracy_score(y_val, mod.predict(new_X_val))
            val_acc = (val_rec + val_accu)/2
            if val_acc - best_val_acc > 0.002:
                candidate = cols[i]
                best_val_acc = val_acc
        if candidate is not None:
            selected_vars.append(candidate)
            cols.remove(candidate)
            print('------- Adding {} to the model ---------'.format(candidate))
        else:
            break
        print('Columns in current model: {}'.format(', '.join(selected_vars)))
        print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
    return selected_vars


selected_columns = my_fwd_selector(X_train, y_train, X_val, y_val)

model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)

y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])




training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))






Trying 1 var models
------- Adding Native to the model ---------
Columns in current model: Native
Best validation accuracy is 0.756
Trying 2 var models
------- Adding OtherTransp to the model ---------
Columns in current model: Native, OtherTransp
Best validation accuracy is 0.778
Trying 3 var models
Training recall is:    0.8807439824945296
Validation recall is:  0.8950276243093923
Validation accuracy is:  0.6611295681063123
precision accuracy is:  0.6612244897959184
training weighted accuracy is:  0.7270240700218819
validation weighted accuracy is:  0.7780785962078522
test weighted accuracy is:    0.6093548387096774


In [220]:
#lASSOO SELECT UNBALANCE


import warnings
warnings.filterwarnings("ignore")

 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()
def my_lwd_selector(stand_X_train, y_train, stand_X_val, y_val):
    print('=============== Begining forward selection =================')
    cols = list(stand_X_train.columns)
    best_val_acc = 0
    selected_vars = []
    while len(cols) > 0:
        print('Trying {} var models'.format(len(selected_vars) + 1))
        candidate = None
        for i in range(len(cols)):
            current_vars = selected_vars.copy()
            current_vars.append(cols[i])
            if len(current_vars) == 1:
                new_X_train = stand_X_train[current_vars].values.reshape(-1, 1)
                new_X_val = stand_X_val[current_vars].values.reshape(-1, 1)
            else:
                new_X_train = stand_X_train[current_vars]                
                new_X_val = stand_X_val[current_vars]
            
            mod = LogisticRegression(C=0.5, penalty='l1', solver='liblinear').fit(new_X_train, y_train)
            val_rec = recall_score(y_val, mod.predict(new_X_val))
            val_accu = accuracy_score(y_val, mod.predict(new_X_val))
            val_acc = (val_rec + val_accu)/2
            if val_acc - best_val_acc > 0.002:
                candidate = cols[i]
                best_val_acc = val_acc
        if candidate is not None:
            selected_vars.append(candidate)
            cols.remove(candidate)
            print('------- Adding {} to the model ---------'.format(candidate))
        else:
            break
        print('Columns in current model: {}'.format(', '.join(selected_vars)))
        print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
    return selected_vars


selected_columns = my_lwd_selector(X_train, y_train, X_val, y_val)

model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)

y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])

training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))





Trying 1 var models
------- Adding FamilyWork to the model ---------
Columns in current model: FamilyWork
Best validation accuracy is 0.823
Trying 2 var models
------- Adding Asian to the model ---------
Columns in current model: FamilyWork, Asian
Best validation accuracy is 0.825
Trying 3 var models
------- Adding MeanCommute to the model ---------
Columns in current model: FamilyWork, Asian, MeanCommute
Best validation accuracy is 0.833
Trying 4 var models
------- Adding Unemployment to the model ---------
Columns in current model: FamilyWork, Asian, MeanCommute, Unemployment
Best validation accuracy is 0.844
Trying 5 var models
------- Adding IncomePerCap to the model ---------
Columns in current model: FamilyWork, Asian, MeanCommute, Unemployment, IncomePerCap
Best validation accuracy is 0.859
Trying 6 var models
Training recall is:    0.8824714573539288
Validation recall is:  0.9613259668508287
Validation accuracy is:  0.7475083056478405
precision accuracy is:  0.7160493827160493


In [204]:
#FORWARDS SELECT UNBALANCE


import warnings
warnings.filterwarnings("ignore")

 

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()


def my_fwd_selector(stand_X_train, y_train, stand_X_val, y_val):
    print('=============== Begining forward selection =================')
    cols = list(stand_X_train.columns)
    best_val_acc = 0
    selected_vars = []
    while len(cols) > 0:
        print('Trying {} var models'.format(len(selected_vars) + 1))
        candidate = None
        for i in range(len(cols)):
            current_vars = selected_vars.copy()
            current_vars.append(cols[i])
            if len(current_vars) == 1:
                new_X_train = X_train[current_vars].values.reshape(-1, 1)
                new_X_val = X_val[current_vars].values.reshape(-1, 1)
            else:
                new_X_train = stand_X_train[current_vars]                
                new_X_val = stand_X_val[current_vars]
            
            mod = LogisticRegression(C=1e9).fit(new_X_train, y_train)
            val_rec = recall_score(y_val, mod.predict(new_X_val))
            val_accu = accuracy_score(y_val, mod.predict(new_X_val))
            val_acc = (val_rec + val_accu)/2
            if val_acc - best_val_acc > 0.002:
                candidate = cols[i]
                best_val_acc = val_acc
        if candidate is not None:
            selected_vars.append(candidate)
            cols.remove(candidate)
            print('------- Adding {} to the model ---------'.format(candidate))
        else:
            break
        print('Columns in current model: {}'.format(', '.join(selected_vars)))
        print('Best validation accuracy is {}'.format(np.round(best_val_acc, 3)))
    return selected_vars


selected_columns = my_fwd_selector(X_train, y_train, X_val, y_val)

model = LogisticRegression(C=1e9).fit(X_train[selected_columns], y_train)

y_train_predicted = model.predict(X_train[selected_columns])
y_val_predicted = model.predict(X_val[selected_columns])
y_test_predicted = model.predict(X_test[selected_columns])

training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))







Trying 1 var models
------- Adding FamilyWork to the model ---------
Columns in current model: FamilyWork
Best validation accuracy is 0.823
Trying 2 var models
------- Adding Asian to the model ---------
Columns in current model: FamilyWork, Asian
Best validation accuracy is 0.825
Trying 3 var models
------- Adding Unemployment to the model ---------
Columns in current model: FamilyWork, Asian, Unemployment
Best validation accuracy is 0.832
Trying 4 var models
------- Adding Income to the model ---------
Columns in current model: FamilyWork, Asian, Unemployment, Income
Best validation accuracy is 0.848
Trying 5 var models
------- Adding MeanCommute to the model ---------
Columns in current model: FamilyWork, Asian, Unemployment, Income, MeanCommute
Best validation accuracy is 0.852
Trying 6 var models
Training recall is:    0.8858294157152451
Validation recall is:  0.9558011049723757
Validation accuracy is:  0.7475083056478405
precision accuracy is:  0.7178423236514523
training weighte

In [215]:
selected_columns
X = X_test[selected_columns]
y = y_test
mod = LogisticRegression().fit(X, y)
ModelSummary(mod, X, y).get_summary()

              coefficient    std  p-value  [0.025  0.975]
intercept           0.719  0.155    0.000   0.414   1.023
FamilyWork         -0.212  0.130    0.103  -0.467   0.044
Asian               1.442  0.500    0.004   0.458   2.426
Unemployment        0.333  0.160    0.038   0.018   0.648
Income              0.262  0.189    0.165  -0.110   0.635
MeanCommute         0.303  0.134    0.024   0.039   0.567
------------------------------------------------------------
Confusion Matrix (total:300) 	 Accuracy: 	  0.775
  TP: 162 | FN: 24
  FP: 72 | TN: 42


In [None]:
#SINGLE VECTOR UNBALNCED single




import warnings
warnings.filterwarnings("ignore")

 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()



model = SVC(C=1, kernel = 'linear', gamma = 0.005) 
model = model.fit(X_train, y_train)

ypred = model.predict(X_val)



training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))




In [225]:
#SINGLE VECTOR UNBALNCED poly


import warnings
warnings.filterwarnings("ignore")

 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score
import numpy as np
import pandas as pd

 

df = pd.read_csv('finalcovid.csv')

 

train, other = train_test_split(df, test_size=0.2, random_state=0)
test, validation = train_test_split(other, test_size=0.5, random_state=0)

 

X_train = train.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_train = train['ADeath']
X_val = validation.drop(columns = ['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_val = validation['ADeath']
X_test = test.drop(columns=['cases','deaths','date','CountyId','State','County','DeathMean', 'DeathMedian', 'ADeath'])
y_test = test['ADeath']

 

X_train=(X_train-X_train.mean())/X_train.std()
X_val = (X_val-X_val.mean())/X_val.std()
X_test = (X_test-X_test.mean())/X_test.std()



model = SVC(C=1, kernel = 'poly', gamma = 0.005) 
model = model.fit(X_train, y_train)


y_train_predicted = model.predict(X_train)
y_val_predicted = model.predict(X_val)
y_test_predicted = model.predict(X_test)

training_acc = (accuracy_score(y_train, y_train_predicted) + recall_score(y_train, y_train_predicted))/2
weighted_acc = (accuracy_score(y_val, y_val_predicted) + recall_score(y_val, y_val_predicted))/2
test_acc = (accuracy_score(y_test, y_test_predicted) + recall_score(y_test, y_test_predicted))/2



print('======= Accuracy  table =======')
print('Training recall is:    {}'.format(recall_score(y_train, y_train_predicted)))
print('Validation recall is:  {}'.format(recall_score(y_val, y_val_predicted)))
print('Validation accuracy is:  {}'.format(accuracy_score(y_val, y_val_predicted)))
print('precision accuracy is:  {}'.format(precision_score(y_val, y_val_predicted)))
print('training weighted accuracy is:  {}'.format(training_acc))
print('validation weighted accuracy is:  {}'.format(weighted_acc))
print('test weighted accuracy is:    {}'.format(test_acc))






Training recall is:    0.9986568166554735
Validation recall is:  1.0
Validation accuracy is:  0.6179401993355482
precision accuracy is:  0.6114864864864865
training weighted accuracy is:  0.818928907703517
validation weighted accuracy is:  0.8089700996677741
test weighted accuracy is:    0.8216666666666667


SyntaxError: invalid syntax (<ipython-input-226-8cd7f752079a>, line 1)

In [None]:
confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)