In [None]:
import numpy as np
import pandas as pd
import requests
import io
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import seaborn as sns   
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
from tabulate import tabulate
from sklearn.metrics import precision_score, average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve, roc_auc_score
from scipy import stats
from sklearn import metrics
from utils import fairness_metrics
#from EXPERIMENTS_WITH_PROBS import plot_PR_curve
import os
import warnings
import shutil

In [None]:
###
# Show each distinct warning once instead of on every repetition.
warnings.filterwarnings(action='once')

# Download the BV dataset. Fail loudly on HTTP errors or a hung connection so that an
# error page is never silently parsed as CSV.
url = "https://cdn.jsdelivr.net/gh/ramenfeast/BV-ethnicity-report/BV%20Dataset%20copy.csv"
response = requests.get(url, timeout=60)
response.raise_for_status()
download = response.content
df = pd.read_csv(io.StringIO(download.decode('utf-8')))

#%%Clean data for all
# NOTE(review): rows 394-396 are dropped by label; presumably non-sample trailer rows - confirm.
df = df.drop([394,395,396], axis = 0)
# Binarise the Nugent score in place: below 7 becomes 0, 7 and above becomes 1.
# The "<7" assignment must run first, otherwise the freshly written 1s would be zeroed.
df.loc[df['Nugent score'] <7, 'Nugent score'] = 0
df.loc[df['Nugent score'] >=7, 'Nugent score'] = 1

# Rescale pH by the width of the pH scale.
df['pH']=df['pH']/14

#all ethnic
# Drop the community-group column, then divide every column except the first and the last
# by 100. X is everything but the last column (it still contains "Ethnic Groupa");
# y is the last column.
dfall = df.drop(labels= ['Community groupc '], axis=1)
dfall.iloc[:,1:-1]=dfall.iloc[:,1:-1]/100
Xall = dfall.iloc[:,:-1]
yall = dfall.iloc[:,-1]

#print(df.shape)
#X_trainall, X_testall, y_trainall, y_testall = train_test_split(Xall, yall,test_size=0.2)

def _ethnicity_subset(frame, group):
    """Build the (frame, X, y) triple for the rows whose "Ethnic Groupa" equals ``group``.

    The community-group column is dropped and every column except the first and the
    last is divided by 100. X is all columns but the last; y is the last column.
    """
    subset = frame[frame["Ethnic Groupa"] == group]
    subset = subset.drop(labels=['Community groupc '], axis=1)
    subset.iloc[:, 1:-1] = subset.iloc[:, 1:-1] / 100
    return subset, subset.iloc[:, :-1], subset.iloc[:, -1]


#white dataframe
dfwhite, Xwhite, ywhite = _ethnicity_subset(df, 'White')

#blackdataframe
dfblack, Xblack, yblack = _ethnicity_subset(df, 'Black')

#asian dataframe
dfasian, Xasian, yasian = _ethnicity_subset(df, 'Asian')

#hispanic dataframe
dfhispanic, Xhispanic, yhispanic = _ethnicity_subset(df, 'Hispanic')

#X_trainhispanic, X_testhispanic, y_trainhispanic, y_testhispanic = train_test_split(Xhispanic, yhispanic,test_size=0.2)

#creating dictionary of accuracy scores
# Keys follow the pattern "<feature test> <ethnicity subset> accuracy"; every key starts
# with its own empty list that a training run later replaces with per-run accuracies.
_accuracy_feature_tests = ("All features", "Ftest", "Sig features",
                           "Corr features", "Ttest features", "Gini features")
_accuracy_subsets = ("all ethnicities", "white only", "black only",
                     "asian only", "hispanic only")
feataccdict = {
    f"{feature_test} {subset} accuracy": []
    for feature_test in _accuracy_feature_tests
    for subset in _accuracy_subsets
}

# Best-run ROC data, keyed by "<model type>_<feature set> fpr|tpr|auc".
fprdict, tprdict, aucdict = {}, {}, {}

# Long-format record of which features each feature test selected.
dfalltandf = pd.DataFrame(columns=['Features', 'Feature Test', 'Value'])
bestauc = 0

def _remove_if_present(path, label=None):
    """Delete ``path`` and report the outcome, prefixing the message with ``label``.

    ``label`` defaults to the path itself. A missing file is reported as "not found";
    any other OS-level failure (permissions, path is a directory, ...) is reported as
    such instead of being mislabelled as "not found".
    """
    label = path if label is None else label
    try:
        os.remove(path)
        print(label, "deleted")
    except FileNotFoundError:
        print(label, "not found")
    except OSError as err:
        print(label, "could not be deleted:", err)


def _reset_directory(path):
    """Remove the directory tree at ``path`` if it exists, then create it empty."""
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        print(f"Directory {path} not found")
    try:
        os.mkdir(path)
    except FileExistsError:
        print(f"Directory {path} already exists")


#deletes fairness metric csv
# Per-feature-test probability dumps (appended to during SVM runs).
proballfeat = 'prob_allfeatures.csv'
probftest = 'prob_ftest.csv'
probttest = 'prob_ttest.csv'
probsigpb = 'prob_sigpb.csv'
probcorrpb = 'prob_corrpb.csv'
probgini = 'prob_gini.csv'

files_to_delete = [proballfeat, probftest, probttest, probsigpb, probcorrpb, probgini]

# Loop over the list of files and delete each one
for file in files_to_delete:
    _remove_if_present(file)

prcurvedirectory = "PRcurves"
_reset_directory(prcurvedirectory)

fairnessfile = 'fairness_metric_data.csv'
_remove_if_present(fairnessfile)

# These two keep their generic "file deleted" / "file not found" messages.
auccombinedfile = 'Feature_auc_combined.csv'
_remove_if_present(auccombinedfile, label="file")

fprtprcombinedfile = 'Feature_fpr_tpr_combined.csv'
_remove_if_present(fprtprcombinedfile, label="file")

#tpr and fpr directory for each model and set
tprfprdirectory = "Feature_tpr_fpr"
_reset_directory(tprfprdirectory)

#feature set directories
featsetdirectory = "Feature_Sets"
_reset_directory(featsetdirectory)

#aucdirectory
aucdirectory = "Feature_auc"
_reset_directory(aucdirectory)


In [None]:
# Maps each recognised ``featset`` label to the feataccdict key that stores its accuracies.
_FEATSET_ACCURACY_KEYS = {
    'All features': "All features all ethnicities accuracy",
    'All features White': "All features white only accuracy",
    'All features Black': "All features black only accuracy",
    'All features Asian': "All features asian only accuracy",
    'All features Hispanic': "All features hispanic only accuracy",

    'Ftest features with all ethnicities': "Ftest all ethnicities accuracy",
    'Ftest features with only white': "Ftest white only accuracy",
    'Ftest features with only black': "Ftest black only accuracy",
    'Ftest features with only asian': "Ftest asian only accuracy",
    'Ftest features with only hispanic': "Ftest hispanic only accuracy",

    'Correlated PBtest features with all ethnicities': "Corr features all ethnicities accuracy",
    'Correlated PBtest features with white only': "Corr features white only accuracy",
    'Correlated PBtest features with black only': "Corr features black only accuracy",
    'Correlated PBtest features with asian only': "Corr features asian only accuracy",
    'Correlated PBtest features with hispanic only': "Corr features hispanic only accuracy",

    'Significant PBtest features with all ethnicities': "Sig features all ethnicities accuracy",
    'Significant PBtest features with white only': "Sig features white only accuracy",
    'Significant PBtest features with black only': "Sig features black only accuracy",
    'Significant PBtest features with asian only': "Sig features asian only accuracy",
    'Significant PBtest features with hispanic only': "Sig features hispanic only accuracy",

    'Ttest features with all ethnicities': "Ttest features all ethnicities accuracy",
    'Ttest features with white only': "Ttest features white only accuracy",
    'Ttest features with black only': "Ttest features black only accuracy",
    'Ttest features with asian only': "Ttest features asian only accuracy",
    'Ttest features with hispanic only': "Ttest features hispanic only accuracy",

    'Gini features with all ethnicities': "Gini features all ethnicities accuracy",
    'Gini features with only white': "Gini features white only accuracy",
    'Gini features with only black': "Gini features black only accuracy",
    'Gini features with only asian': "Gini features asian only accuracy",
    'Gini features with only hispanic': "Gini features hispanic only accuracy",
}


def _append_csv(frame, path):
    """Append ``frame`` to the CSV at ``path``; the header is written only for a new file."""
    frame.to_csv(path, mode='a', index=False, header=not os.path.isfile(path))


def RFtrain_test_1(px,py,featset, dictionary, testname, efeatset, modeltype='Random Forest'):
    """Train and evaluate one classifier type on 10 seeded 80/20 splits of (px, py).

    Parameters
    ----------
    px : DataFrame of features. Must contain an "Ethnic Groupa" column; it is removed
        before fitting and only used to group the fairness metrics.
    py : Series of binary labels aligned with ``px``.
    featset : label of the feature set; used in printed output, output file names, the
        ROC dictionary keys and to choose the feataccdict entry.
    dictionary : accuracies are stored in ``feataccdict`` only when this is 'feataccdict'.
    testname : feature-test identifier ('allfeatures', 'ftest', 'ttest', 'pbsig',
        'pbcorr' or 'gini'); selects the probability CSV written for SVM runs.
    efeatset : value written to the 'Race' column of the SVM probability CSV.
    modeltype : 'Logistic Regression', 'SVM', or anything else for a random forest.

    Returns
    -------
    None. Results are reported through side effects: rows appended to the fairness,
    combined FPR/TPR and combined AUC CSVs, per-model CSVs in the TPR/FPR and AUC
    directories, the module-level fprdict/tprdict/aucdict/feataccdict/bestauc, printed
    run averages and a displayed ROC plot of the best run.

    Raises
    ------
    ValueError : if ``modeltype`` is 'SVM' and ``testname`` is not a known feature test.
    """
    global bestauc
    bestauc = 0
    run_label = modeltype+'_'+featset

    # Only SVM runs dump per-sample probabilities. Resolve the target file up front so an
    # unknown test name fails with a clear message instead of an empty file path.
    prcurvedata = ''
    if modeltype == 'SVM':
        prob_files = {
            'allfeatures': proballfeat,
            'ftest': probftest,
            'ttest': probttest,
            'pbsig': probsigpb,
            'pbcorr': probcorrpb,
            'gini': probgini,
        }
        if testname not in prob_files:
            raise ValueError(f"Unknown testname {testname!r}; expected one of {sorted(prob_files)}")
        prcurvedata = prob_files[testname]

    acclist = []
    preclist = []
    reclist = []
    f1list = []

    #repeat runs 10 times
    for x in range(10):
        # The run index doubles as the seed for both the split and the model, so each
        # run is reproducible on its own.
        pxtrain, pxtest, pytrain, pytest = train_test_split(px,py,test_size=0.2,random_state = x)

        # Ethnicity is set aside for the fairness metrics and never used as a feature.
        arrayethxtest = pxtest["Ethnic Groupa"].to_numpy()
        pxtrain = pxtrain.drop(labels= ["Ethnic Groupa"], axis=1)
        pxtest = pxtest.drop(labels= ["Ethnic Groupa"], axis=1)

        if modeltype == 'Logistic Regression':
            clf = LogisticRegression()
        elif modeltype == 'SVM':
            clf = SVC(probability=True, random_state=x)
        else:
            clf = RandomForestClassifier(n_estimators = 100, random_state=x)
        clf.fit(pxtrain, pytrain)
        y_pred = clf.predict(pxtest)
        pred_probs = clf.predict_proba(pxtest)[:,1]

        acclist.append(accuracy_score(pytest,y_pred))
        preclist.append(precision_score(pytest,y_pred))
        reclist.append(recall_score(pytest,y_pred))
        f1list.append(f1_score(pytest,y_pred))

        #do fairness metric and save to csv
        dffairreturn = fairness_metrics(pytest.to_numpy(), y_pred, arrayethxtest)
        dffairreturn.insert(loc=0, column= 'Model_Type_with_Feature_Set', value= run_label)

        # NOTE(review): y_pred already holds hard labels, so this threshold is a no-op;
        # confirm whether pred_probs >= 0.5 was intended.
        predictions = y_pred >= 0.5
        auc = roc_auc_score(pytest, pred_probs)
        dffairreturn["AUC"] = auc
        dffairreturn["f1"]= f1_score(pytest,predictions)
        dffairreturn["precision"] = precision_score(pytest,predictions)
        dffairreturn["recall"] = recall_score(pytest,predictions)
        dffairreturn["AP"] = average_precision_score(pytest, pred_probs)

        for race in ["Asian", "Black", "Hispanic", "White"]:
            race_mask = arrayethxtest == race
            # A group can be absent from the test split (e.g. single-ethnicity input);
            # record NaN rather than crash on an empty selection.
            if race_mask.any():
                race_ap = average_precision_score(pytest[race_mask], pred_probs[race_mask])
            else:
                race_ap = np.nan
            dffairreturn[f"AP {race}"] = race_ap

        _append_csv(dffairreturn, fairnessfile)

        if modeltype == 'SVM':
            # Per-sample probabilities for precision-recall curves.
            prdf = pd.DataFrame()
            prdf['y_true'] = pytest
            prdf['probs'] = pred_probs
            prdf['fold'] = x
            prdf['run'] = x
            prdf['Race'] = efeatset
            print('prcurvedata ', prcurvedata)
            _append_csv(prdf, prcurvedata)

        fpr, tpr, threshold = metrics.roc_curve(pytest, pred_probs)

        # Keep the ROC curve of the best-scoring run for this model / feature set.
        if auc > bestauc:
            fprdict[run_label+' fpr'] = fpr
            tprdict[run_label+' tpr'] = tpr
            aucdict[run_label+ ' auc'] = auc
            bestauc = auc

    #printing out results the mean of 10 runs for each metric
    print(modeltype, featset,' RF acc average =', np.mean(acclist))
    print(modeltype, featset,' Precision RF average =', np.mean(preclist))
    print(modeltype, featset,' Recall RF average =', np.mean(reclist))
    print(modeltype, featset,' F1 score RF average =', np.mean(f1list))
    print('Training dataset count:', pxtrain.shape[0])
    print('Test dataset count:', pxtest.shape[0])

    #create ROC curve
    bestfpr = fprdict[run_label+' fpr']
    besttpr = tprdict[run_label+' tpr']
    bestauc = aucdict[run_label+ ' auc']

    dftprfpr = pd.DataFrame({'Best FPR': bestfpr, 'Best TPR': besttpr})
    dftprfpr.to_csv(tprfprdirectory+'/'+run_label+'tprfpr.csv', index=False)

    #csv for combined fprtpr
    dftprfpr.insert(loc=0, column= 'Model_Type_with_Feature_Set', value= run_label)
    _append_csv(dftprfpr, fprtprcombinedfile)

    #individual csv for bestauc and combined auc
    pd.DataFrame([bestauc]).to_csv(aucdirectory+'/'+run_label+' auc.csv', index=False, header=['Best AUC'])
    dfauc = pd.DataFrame({'Best AUC': [bestauc]})
    dfauc.insert(loc=0, column= 'Model_Type_with_Feature_Set', value= run_label)
    _append_csv(dfauc, auccombinedfile)

    plt.title(str(modeltype)+'_'+'Receiver Operator Curve for:' +str(featset))
    plt.plot(bestfpr,besttpr,label="auc="+str(bestauc))
    plt.legend(loc = 'lower right')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    #attach accuracy list to the matching key in dictionary
    if dictionary == 'feataccdict':
        accuracy_key = _FEATSET_ACCURACY_KEYS.get(featset)
        if accuracy_key is not None:
            feataccdict[accuracy_key] = acclist
    
        

In [None]:
#Test all features all ethnicities (Random Forest is the default model type)
# RFtrain_test_1 has no return statement, so featset is bound to None here.
featset = RFtrain_test_1(Xall,yall, 'All features','feataccdict', 'allfeatures', 'All ethnicities')
print()
# Per-ethnicity "All features" runs are disabled. To re-enable one (untested), call e.g.
#   RFtrain_test_1(Xwhite, ywhite, 'All features White', 'feataccdict', 'allfeatures', 'White')
# and likewise for Xblack/yblack, Xasian/yasian and Xhispanic/yhispanic.

In [None]:
#Logistic Regression
#Test all features all ethnicities
# efeatset is 'All ethnicities', matching the Random Forest and SVM cells.
# RFtrain_test_1 has no return statement, so featset is bound to None here.
featset = RFtrain_test_1(Xall,yall, 'All features', 'feataccdict', 'allfeatures', 'All ethnicities', 'Logistic Regression')
print()
# Per-ethnicity "All features" runs are disabled. To re-enable one (untested), call e.g.
#   RFtrain_test_1(Xwhite, ywhite, 'All features White', 'feataccdict', 'allfeatures', 'White', 'Logistic Regression')
# and likewise for Xblack/yblack, Xasian/yasian and Xhispanic/yhispanic.

In [None]:
#SVM
#Test all features all ethnicities
# RFtrain_test_1 has no return statement, so featset is bound to None here.
featset = RFtrain_test_1(Xall,yall, 'All features', 'feataccdict', 'allfeatures','All ethnicities', 'SVM')
print()
# Per-ethnicity "All features" runs are disabled. To re-enable one (untested), call e.g.
#   RFtrain_test_1(Xwhite, ywhite, 'All features White', 'feataccdict', 'allfeatures', 'White', 'SVM')
# and likewise for Xblack/yblack, Xasian/yasian and Xhispanic/yhispanic.

In [None]:
#create ROC curve for RF all feat
# Plots the best-run ROC curve stored by RFtrain_test_1 for the Random Forest /
# 'All features' run. Per-ethnicity curves ('Random Forest_All features White' etc.)
# are not drawn because those runs are disabled.
plt.figure(0).clf()
plt.plot(fprdict['Random Forest_All features fpr'],
         tprdict['Random Forest_All features tpr'],
         label="All ethnicities auc="+str(aucdict['Random Forest_All features auc']),
         linestyle='solid')

plt.title('Receiver Operator Curve for:' +str('All Features'))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc = 'lower right')
plt.show()



In [None]:
#turning dictionary into dataframe and then making boxplot
# Lists in feataccdict have different lengths (some are empty), so each one is wrapped
# in a Series to let pandas pad the shorter columns with NaN.
alleth = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feataccdict.items() ]))
allfeatdf = alleth[["All features all ethnicities accuracy","All features white only accuracy", "All features black only accuracy", "All features asian only accuracy", "All features hispanic only accuracy"]]
display(allfeatdf)

# Apply figure size and font scale BEFORE drawing; set afterwards they only affect
# figures created later.
sns.set(rc={'figure.figsize':(100,50)})
sns.set(font_scale=4)
fig = sns.boxplot(data=allfeatdf)
plt.xlabel("All features")
plt.ylabel("Accuracy")
plt.title("Accuracy with all features")
plt.show()

In [None]:
#ftest feature test so compares population variances and finds significant differences#
def Bestftest_1(px,py,featset, xall, yall, efeatset, model = 'Random Forest'):
    """Select the top ANOVA F-test features on (px, py) and train on (xall, yall) with them.

    Parameters
    ----------
    px, py : features / labels the F-test is fitted on. ``px`` must contain
        "Ethnic Groupa", which is dropped before fitting.
    featset : label for this feature set; names the CSV written to the feature-set
        directory and the rows recorded in ``dfalltandf``.
    xall, yall : features / labels used for training. ``xall`` must contain
        "Ethnic Groupa" and otherwise have the same column layout as ``px``, because
        selected columns are taken from it by position.
    efeatset : passed through to RFtrain_test_1.
    model : 'Random Forest', 'Logistic Regression' or 'SVM'; any other value skips training.

    Returns
    -------
    (gname, gval) : Series of selected feature names and of their F-scores, sorted by
        descending F-score.
    """
    global dfalltandf

    # Ethnicity is not a candidate feature; keep it aside and re-attach it after selection.
    dfethx = xall["Ethnic Groupa"]
    xall = xall.drop(labels= ["Ethnic Groupa"], axis=1)
    px = px.drop(labels= ["Ethnic Groupa"], axis=1)

    # Cap k so inputs with fewer than 50 features are still accepted.
    fvalue_Best = SelectKBest(f_classif, k=min(50, px.shape[1]))
    fvalue_Best.fit(px, py)
    scores = fvalue_Best.scores_

    # Positional indices of the selected columns.
    cols = fvalue_Best.get_support(indices=True)
    # Copy so that adding the ethnicity column below does not write into a slice of xall.
    features_df_newx = xall.iloc[:,cols].copy()

    #table of features with fvalue
    # scores_ has one entry per ORIGINAL column, so it must be indexed with the selected
    # column positions, not with the running position inside the selection.
    dfimpfeatfval = pd.DataFrame({'Feature Name': features_df_newx.columns,
                                  'F-score': scores[cols]})
    dfimpfeatfval = dfimpfeatfval.sort_values(by = ['F-score'], ascending = False)
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(dfimpfeatfval)

    gname = dfimpfeatfval['Feature Name']
    gval = dfimpfeatfval['F-score']

    # Record the selection in long format, replacing any earlier rows for this featset
    # so that re-running the cell does not double count.
    dfftest = dfimpfeatfval[['Feature Name']].rename(columns={'Feature Name': 'Features'})
    dfftest['Feature Test'] = featset
    dfftest['Value'] = 1
    dfalltandf = dfalltandf[dfalltandf['Feature Test'] != featset]
    dfalltandf = pd.concat([dfalltandf, dfftest])

    dfimpfeatfval.to_csv(featsetdirectory+'/'+featset+".csv", index=False)

    # RFtrain_test_1 expects the ethnicity column to be present.
    features_df_newx['Ethnic Groupa'] = dfethx

    display(features_df_newx)
    if model in ('Logistic Regression', 'SVM', 'Random Forest'):
        RFtrain_test_1(features_df_newx, yall, featset,'feataccdict', 'ftest', efeatset, model)
    return gname,gval

In [None]:
#Test Ftest all ethnicities, RF
# Select features on all ethnicities, train the default Random Forest, then plot F-scores.
gname, gval = Bestftest_1(
    Xall, yall,
    featset='Ftest features with all ethnicities',
    xall=Xall, yall=yall,
    efeatset='All ethnicities',
)
sns.set(font_scale=4, rc={'figure.figsize': (100, 50)})
sns.barplot(x=gval, y=gname)
display(dfalltandf)

In [None]:
#Test Ftest white only RF
# Features are selected on the White subset; training still uses all ethnicities.
gname, gval = Bestftest_1(
    Xwhite, ywhite,
    featset='Ftest features with only white',
    xall=Xall, yall=yall,
    efeatset='White',
)
sns.set(font_scale=4, rc={'figure.figsize': (100, 50)})
sns.barplot(x=gval, y=gname)
display(dfalltandf)

In [None]:
#Test Ftest black only RF
# Features are selected on the Black subset; training still uses all ethnicities.
gname, gval = Bestftest_1(
    Xblack, yblack,
    featset='Ftest features with only black',
    xall=Xall, yall=yall,
    efeatset='Black',
)
sns.set(font_scale=4, rc={'figure.figsize': (100, 50)})
sns.barplot(x=gval, y=gname)

In [None]:
#Test Ftest asian only RF
# Features are selected on the Asian subset; training still uses all ethnicities.
gname, gval = Bestftest_1(
    Xasian, yasian,
    featset='Ftest features with only asian',
    xall=Xall, yall=yall,
    efeatset='Asian',
)
sns.set(font_scale=4, rc={'figure.figsize': (100, 50)})
sns.barplot(x=gval, y=gname)

In [None]:
#Test Ftest hispanic only RF
# Features are selected on the Hispanic subset; training still uses all ethnicities.
gname, gval = Bestftest_1(
    Xhispanic, yhispanic,
    featset='Ftest features with only hispanic',
    xall=Xall, yall=yall,
    efeatset='Hispanic',
)
sns.set(font_scale=4, rc={'figure.figsize': (100, 50)})
sns.barplot(x=gval, y=gname)

In [None]:
#create ROC curve for RF Ftest
# One curve per Ftest feature set, using the best-run ROC data stored by RFtrain_test_1.
# Each entry: (feature-set suffix used in the dictionary keys, legend name, line style).
# NOTE(review): White and Hispanic share the 'dotted' style, as in the original plot.
ftest_roc_series = [
    ('all ethnicities', 'All ethnicities', 'solid'),
    ('only white', 'White', 'dotted'),
    ('only black', 'Black', 'dashdot'),
    ('only asian', 'Asian', 'dashed'),
    ('only hispanic', 'Hispanic', 'dotted'),
]

plt.figure(0).clf()
for roc_suffix, roc_legend, roc_linestyle in ftest_roc_series:
    roc_key = 'Random Forest_Ftest features with ' + roc_suffix
    plt.plot(fprdict[roc_key + ' fpr'], tprdict[roc_key + ' tpr'],
             label=roc_legend + " features auc=" + str(aucdict[roc_key + ' auc']),
             linestyle=roc_linestyle)

# The title names the Ftest feature sets actually plotted here.
plt.title('Receiver Operator Curve for:' +str('Ftest Features'))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc = 'lower right')



In [None]:
#LR
# Logistic Regression on the Ftest feature sets (all ethnicities, then White-selected).
Bestftest_1(Xall, yall, featset='Ftest features with all ethnicities',
            xall=Xall, yall=yall, efeatset='All ethnicities', model='Logistic Regression')
Bestftest_1(Xwhite, ywhite, featset='Ftest features with only white',
            xall=Xall, yall=yall, efeatset='White', model='Logistic Regression')

In [None]:
# Logistic Regression on the Black-, Asian- and Hispanic-selected Ftest feature sets.
Bestftest_1(Xblack, yblack, featset='Ftest features with only black',
            xall=Xall, yall=yall, efeatset='Black', model='Logistic Regression')
Bestftest_1(Xasian, yasian, featset='Ftest features with only asian',
            xall=Xall, yall=yall, efeatset='Asian', model='Logistic Regression')
Bestftest_1(Xhispanic, yhispanic, featset='Ftest features with only hispanic',
            xall=Xall, yall=yall, efeatset='Hispanic', model='Logistic Regression')

In [None]:
#SVM
# SVM on the Ftest feature sets (all ethnicities, then White-selected).
Bestftest_1(Xall, yall, featset='Ftest features with all ethnicities',
            xall=Xall, yall=yall, efeatset='All ethnicities', model='SVM')
Bestftest_1(Xwhite, ywhite, featset='Ftest features with only white',
            xall=Xall, yall=yall, efeatset='White', model='SVM')


In [None]:
# SVM on the Black-, Asian- and Hispanic-selected Ftest feature sets.
Bestftest_1(Xblack, yblack, featset='Ftest features with only black',
            xall=Xall, yall=yall, efeatset='Black', model='SVM')
Bestftest_1(Xasian, yasian, featset='Ftest features with only asian',
            xall=Xall, yall=yall, efeatset='Asian', model='SVM')
Bestftest_1(Xhispanic, yhispanic, featset='Ftest features with only hispanic',
            xall=Xall, yall=yall, efeatset='Hispanic', model='SVM')

In [None]:
#pivot table
# One row per feature test, one column per feature; a cell holds the summed 'Value'
# for that pair and 0 where the feature was not selected.
table = dfalltandf.pivot_table(
    values='Value',
    index=['Feature Test'],
    columns=['Features'],
    aggfunc=np.sum,
    fill_value=0,
)
display(table)

In [None]:
#creating boxplot
# Lists in feataccdict have different lengths (some are empty), so each one is wrapped
# in a Series to let pandas pad the shorter columns with NaN.
alleth = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feataccdict.items() ]))
ftdf = alleth[["Ftest all ethnicities accuracy","Ftest white only accuracy", "Ftest black only accuracy", "Ftest asian only accuracy", "Ftest hispanic only accuracy"]]
display(ftdf)
display(alleth)

# Apply figure size and font scale BEFORE drawing; set afterwards they only affect
# figures created later.
sns.set(rc={'figure.figsize':(100,50)})
sns.set(font_scale=4)
fig = sns.boxplot(data=ftdf)
# The x axis shows the Ftest feature sets, so it is labelled accordingly.
plt.xlabel("Ftest features")
plt.ylabel("Accuracy")
plt.title("Ftest Accuracy")
plt.show()

In [None]:
#correlation test between a binary variable and a continuous variable
def BestPB_1(px,py,featset,sigfeatset,xall,yall, efeatset, model = 'Random Forest'):
    """Select features by point-biserial correlation with ``py`` and train on them.

    Two feature sets are derived from (px, py):
    * "significant": features whose point-biserial p-value is below 0.2;
    * "correlated": significant features whose correlation coefficient also exceeds 0.5.
    Each non-empty set is then used, together with "Ethnic Groupa", to train on
    (xall, yall) through RFtrain_test_1.

    Parameters
    ----------
    px, py : features / labels the correlation is computed on. ``px`` must contain
        "Ethnic Groupa", which is excluded from the test.
    featset : label for the correlated feature set (CSV name, ``dfalltandf`` rows).
    sigfeatset : label for the significant feature set (CSV name, ``dfalltandf`` rows).
    xall, yall : features / labels used for training; ``xall`` must contain the selected
        columns and "Ethnic Groupa".
    efeatset : passed through to RFtrain_test_1.
    model : 'Logistic Regression' or 'SVM'. For the significant set any other value
        trains a Random Forest; for the correlated set only 'Random Forest' does, and
        other values skip training.

    Returns
    -------
    (gvalc, gnamec, gvals, gnames) : correlation coefficients and names of the correlated
        features, then p-values and names of the significant features - in that order.
    """
    global dfalltandf
    #pointbiserial
    #used when one variable is interval and other variable has only 2 possible variables
    alpha = 0.2            # p-value cut-off for "significant"
    min_correlation = 0.5  # additional coefficient cut-off for "correlated"

    significantlist = []
    correlationlist = []
    sig_records = []
    corr_records = []

    # Ethnicity is not a candidate feature.
    px = px.drop(labels= ["Ethnic Groupa"], axis=1)

    for columnName, X_trainfeature in px.items():
        correlation_value, p_value = stats.pointbiserialr(X_trainfeature, py)
        #restricted p value by alpha
        if p_value < alpha:
            significantlist.append(columnName)
            sig_records.append({'Feature Name': columnName, 'P-value': p_value})
            #further restricted by correlation value (only positive correlations qualify)
            if correlation_value > min_correlation:
                correlationlist.append(columnName)
                corr_records.append({'Feature Name': columnName,
                                     'Correlation Coefficient': correlation_value})

    # Passing columns explicitly keeps both expected columns even when nothing qualified.
    dfimpfeatsig = pd.DataFrame(sig_records, columns=['Feature Name', 'P-value'])
    dfimpfeatsig = dfimpfeatsig.sort_values(by = ['P-value'], ascending = True)
    dfimpfeatcorr = pd.DataFrame(corr_records, columns=['Feature Name', 'Correlation Coefficient'])
    dfimpfeatcorr = dfimpfeatcorr.sort_values(by = ['Correlation Coefficient'], ascending = False)

    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(dfimpfeatsig)
        display(dfimpfeatcorr)

    gnames = dfimpfeatsig['Feature Name']
    gvals = dfimpfeatsig['P-value']
    gnamec = dfimpfeatcorr['Feature Name']
    gvalc = dfimpfeatcorr['Correlation Coefficient']

    # Record both selections in long format, replacing earlier rows with the same labels
    # so that re-running the cell does not double count.
    dfsig = dfimpfeatsig[['Feature Name']].rename(columns={'Feature Name': 'Features'})
    dfsig['Feature Test'] = sigfeatset
    dfsig['Value'] = 1

    dfcorr = dfimpfeatcorr[['Feature Name']].rename(columns={'Feature Name': 'Features'})
    dfcorr['Feature Test'] = featset
    dfcorr['Value'] = 1

    dfalltandf = dfalltandf[dfalltandf['Feature Test'] != featset]
    dfalltandf = dfalltandf[dfalltandf['Feature Test'] != sigfeatset]
    dfalltandf = pd.concat([dfalltandf, dfsig])
    dfalltandf = pd.concat([dfalltandf, dfcorr])

    dfimpfeatsig.to_csv(featsetdirectory+'/'+sigfeatset+".csv", index=False)
    dfimpfeatcorr.to_csv(featsetdirectory+'/'+featset+".csv", index=False)

    #significant (only using p value)
    if significantlist:
        # RFtrain_test_1 expects the ethnicity column to be present.
        px_sig = xall[significantlist + ["Ethnic Groupa"]]
        display(px_sig)
        sig_model = model if model in ('Logistic Regression', 'SVM') else 'Random Forest'
        RFtrain_test_1(px_sig, yall, sigfeatset,'feataccdict', 'pbsig', efeatset, sig_model)
    else:
        # Nothing to train on: a model cannot be fitted with zero features.
        print(sigfeatset, 'has no significant features')

    #pvalue and correlation value
    if correlationlist:
        px_corr = xall[correlationlist + ["Ethnic Groupa"]]
        if model in ('Logistic Regression', 'SVM', 'Random Forest'):
            RFtrain_test_1(px_corr, yall, featset,'feataccdict','pbcorr',efeatset, model)
    else:
        print(featset, 'has no correlated features')

    return gvalc, gnamec, gvals, gnames

In [None]:
# Point-biserial feature selection on the full cohort, followed by a bar chart.
# NOTE(review): the definition ending just above this cell appears to return
# (gvalc, gnamec, gvals, gnames); the unpacking below binds the results in a
# different order, so the local names are swapped relative to the function's
# own names -- confirm this is intended.
gvals, gnames, gvalc, gnamec = BestPB_1(
    Xall, yall,
    'Correlated PBtest features with all ethnicities',
    'Significant PBtest features with all ethnicities',
    Xall, yall, 'All ethnicities',
)
sns.set(rc={'figure.figsize': (100, 80)}, font_scale=4)
plt.title("Significant Features with All ethnicities")
sbp = sns.barplot(x=gvalc, y=gnamec)
display(dfalltandf)

In [None]:
# Companion chart for the all-ethnicities PB run, drawn from gvals / gnames.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
plt.title("Correlated Features with All ethnicities")
cbp = sns.barplot(x=gvals, y=gnames)

In [None]:
# Point-biserial feature selection on the White subset; the model is still
# evaluated on the full Xall / yall passed as the 5th and 6th arguments.
# NOTE(review): unpack order differs from the function's return order -- confirm.
gvals, gnames, gvalc, gnamec = BestPB_1(
    Xwhite, ywhite,
    'Correlated PBtest features with white only',
    'Significant PBtest features with white only',
    Xall, yall, 'White',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalc, y=gnamec)

In [None]:
# Companion chart for the White-only PB run, drawn from gvals / gnames.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
plt.title("Correlated Features with White only")
cbp = sns.barplot(x=gvals, y=gnames)

In [None]:
# Point-biserial feature selection on the Black subset.
# NOTE(review): unpack order differs from the function's return order -- confirm.
gvals, gnames, gvalc, gnamec = BestPB_1(
    Xblack, yblack,
    'Correlated PBtest features with black only',
    'Significant PBtest features with black only',
    Xall, yall, 'Black',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalc, y=gnamec)

In [None]:
# Companion chart for the Black-only PB run, drawn from gvals / gnames.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
plt.title("Correlated Features with Black only")
cbp = sns.barplot(x=gvals, y=gnames)

In [None]:
# Point-biserial feature selection on the Asian subset.
# NOTE(review): unpack order differs from the function's return order -- confirm.
gvals, gnames, gvalc, gnamec = BestPB_1(
    Xasian, yasian,
    'Correlated PBtest features with asian only',
    'Significant PBtest features with asian only',
    Xall, yall, 'Asian',
)
# Only draw the chart when at least one of the two series has rows.
if not (gnamec.shape[0] == 0 and gvalc.shape[0] == 0):
    sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
    cbp = sns.barplot(x=gvalc, y=gnamec)


In [None]:
# Companion chart for the Asian-only PB run; skipped when both series are empty.
if not (gnames.shape[0] == 0 and gvals.shape[0] == 0):
    sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
    plt.title("Correlated Features with Asian only")
    cbp = sns.barplot(x=gvals, y=gnames)

In [None]:
# Point-biserial feature selection on the Hispanic subset.
# NOTE(review): unpack order differs from the function's return order -- confirm.
gvals, gnames, gvalc, gnamec = BestPB_1(
    Xhispanic, yhispanic,
    'Correlated PBtest features with hispanic only',
    'Significant PBtest features with hispanic only',
    Xall, yall, 'Hispanic',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalc, y=gnamec)

In [None]:
# Companion chart for the Hispanic-only PB run, drawn from gvals / gnames.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
plt.title("Correlated Features with Hispanic only")
cbp = sns.barplot(x=gvals, y=gnames)

In [None]:
# Re-run the PB feature selection with the Logistic Regression model
# (all ethnicities, then White only).
BestPB_1(
    Xall, yall,
    'Correlated PBtest features with all ethnicities',
    'Significant PBtest features with all ethnicities',
    Xall, yall, 'All ethnicities', 'Logistic Regression',
)
BestPB_1(
    Xwhite, ywhite,
    'Correlated PBtest features with white only',
    'Significant PBtest features with white only',
    Xall, yall, 'White', 'Logistic Regression',
)

In [None]:
# PB feature selection with Logistic Regression for the Black, Asian and
# Hispanic subsets.
BestPB_1(
    Xblack, yblack,
    'Correlated PBtest features with black only',
    'Significant PBtest features with black only',
    Xall, yall, 'Black', 'Logistic Regression',
)
BestPB_1(
    Xasian, yasian,
    'Correlated PBtest features with asian only',
    'Significant PBtest features with asian only',
    Xall, yall, 'Asian', 'Logistic Regression',
)
BestPB_1(
    Xhispanic, yhispanic,
    'Correlated PBtest features with hispanic only',
    'Significant PBtest features with hispanic only',
    Xall, yall, 'Hispanic', 'Logistic Regression',
)

In [None]:
# Re-run the PB feature selection with the SVM model
# (all ethnicities, then White only).
BestPB_1(
    Xall, yall,
    'Correlated PBtest features with all ethnicities',
    'Significant PBtest features with all ethnicities',
    Xall, yall, 'All ethnicities', 'SVM',
)
BestPB_1(
    Xwhite, ywhite,
    'Correlated PBtest features with white only',
    'Significant PBtest features with white only',
    Xall, yall, 'White', 'SVM',
)

In [None]:
# PB feature selection with SVM for the Black, Asian and Hispanic subsets.
BestPB_1(
    Xblack, yblack,
    'Correlated PBtest features with black only',
    'Significant PBtest features with black only',
    Xall, yall, 'Black', 'SVM',
)
BestPB_1(
    Xasian, yasian,
    'Correlated PBtest features with asian only',
    'Significant PBtest features with asian only',
    Xall, yall, 'Asian', 'SVM',
)
BestPB_1(
    Xhispanic, yhispanic,
    'Correlated PBtest features with hispanic only',
    'Significant PBtest features with hispanic only',
    Xall, yall, 'Hispanic', 'SVM',
)

In [None]:
# Feature-test x feature matrix: one row per 'Feature Test', one column per
# feature, summing the 'Value' markers accumulated in dfalltandf (0 when absent).
table = dfalltandf.pivot_table(
    values='Value',
    index=['Feature Test'],
    columns=['Features'],
    aggfunc=np.sum,
    fill_value=0,
)
display(table)

In [None]:
# Turn the accuracy dictionary into a frame; wrapping each value in a Series
# lets entries of different lengths coexist (shorter ones are padded with NaN).
alleth = pd.DataFrame({k: pd.Series(v) for k, v in feataccdict.items()})
corrdf = alleth[[
    "Corr features all ethnicities accuracy",
    "Corr features white only accuracy",
    "Corr features black only accuracy",
    "Corr features asian only accuracy",
    "Corr features hispanic only accuracy",
]]
display(corrdf)
display(alleth)

# The theme has to be configured BEFORE the figure is created; previously it
# was set after sns.boxplot and so only affected later figures.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
fig = sns.boxplot(data=corrdf)  # NOTE: sns.boxplot returns an Axes; name kept for compatibility
plt.xlabel("All features")
plt.ylabel("Accuracy")
plt.title("Correlated Feature Accuracy")
# plt.show() takes no positional figure argument.
plt.show()

In [None]:
# Turn the accuracy dictionary into a frame; wrapping each value in a Series
# lets entries of different lengths coexist (shorter ones are padded with NaN).
alleth = pd.DataFrame({k: pd.Series(v) for k, v in feataccdict.items()})
sigdf = alleth[[
    "Sig features all ethnicities accuracy",
    "Sig features white only accuracy",
    "Sig features black only accuracy",
    "Sig features asian only accuracy",
    "Sig features hispanic only accuracy",
]]
display(sigdf)
display(alleth)

# The theme has to be configured BEFORE the figure is created; previously it
# was set after sns.boxplot and so only affected later figures.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
fig = sns.boxplot(data=sigdf)  # NOTE: sns.boxplot returns an Axes; name kept for compatibility
plt.xlabel("All features")
plt.ylabel("Accuracy")
plt.title("Significant Feature Accuracy")
# plt.show() takes no positional figure argument.
plt.show()

In [None]:
# Two-sample t-test feature selection: compares each feature's mean between
# the Nugent-score-0 rows and the Nugent-score-1 rows.
def BestTtest_1(dfethnic, px, py,featset, xall, yall, efeatset,model = 'Random Forest'):
    """Select features whose means differ between the two Nugent-score classes.

    For every column of ``dfethnic`` other than "Ethnic Groupa" and
    "Nugent score", a two-sided independent t-test is run between the rows with
    'Nugent score' == 0 and those with 'Nugent score' == 1. Columns with
    p < 0.05 are kept. The selection is displayed, recorded in the global
    ``dfalltandf``, written to ``<featsetdirectory>/<featset>.csv`` and then
    passed (together with "Ethnic Groupa") to ``RFtrain_test_1``.

    Parameters
    ----------
    dfethnic : DataFrame
        Must contain the columns "Ethnic Groupa" and "Nugent score".
    px, py :
        Unused; kept so existing calls keep working.
    featset : str
        Label stored in the 'Feature Test' column and used as the CSV file name.
    xall : DataFrame
        Frame from which the selected columns plus "Ethnic Groupa" are taken.
    yall :
        Target forwarded to ``RFtrain_test_1``.
    efeatset :
        Forwarded unchanged to ``RFtrain_test_1``.
    model : str
        'Random Forest' (default), 'Logistic Regression' or 'SVM'. Any other
        value skips the training step, as before.

    Returns
    -------
    (gvalt, gnamet) : tuple of Series
        P-values and feature names, sorted by ascending p-value.
    """
    global dfalltandf

    # The ethnicity label is dropped before the per-column tests.
    dfethnic = dfethnic.drop(labels=["Ethnic Groupa"], axis=1)

    Nugent0 = dfethnic[dfethnic['Nugent score'] == 0]
    Nugent1 = dfethnic[dfethnic['Nugent score'] == 1]

    alpha = 0.05
    records = []
    for column in dfethnic.columns:
        # The label is skipped explicitly. The old code tested it against
        # itself and removed it afterwards, which raised ValueError whenever
        # that test did not come out as significant (e.g. an empty class).
        if column == 'Nugent score':
            continue
        tstat, pval = stats.ttest_ind(a=Nugent0[column], b=Nugent1[column], alternative="two-sided")
        if pval < alpha:
            records.append({'Feature Name': column, 'P-value': pval})

    # Selected feature names in original column order (used for the model input).
    impfeat = [rec['Feature Name'] for rec in records]

    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0 and re-copied the frame on every call.
    dfimpfeat = pd.DataFrame(records, columns=['Feature Name', 'P-value'])
    dfimpfeat = dfimpfeat.sort_values(by=['P-value'], ascending=True)
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.precision', 3,
                           ):
        display(dfimpfeat)

    # Series returned to the caller for plotting.
    gnamet = dfimpfeat['Feature Name']
    gvalt = dfimpfeat['P-value']

    # Rows recording "this test selected this feature" for the shared table.
    dfttesttemp = dfimpfeat[['Feature Name']].rename(columns={'Feature Name': 'Features'})
    dfttesttemp['Feature Test'] = featset
    dfttesttemp['Value'] = 1

    # Replace earlier rows for the same feature set so re-running does not
    # double-count.
    dfalltandf = dfalltandf[dfalltandf['Feature Test'] != featset]
    dfalltandf = pd.concat([dfalltandf, dfttesttemp])

    dfimpfeat.to_csv(featsetdirectory+'/'+featset+".csv", index=False)

    # The ethnicity column is added back to the model input.
    impfeat.append("Ethnic Groupa")
    px_imp = xall[impfeat]

    if model in ('Logistic Regression', 'SVM'):
        RFtrain_test_1(px_imp, yall, featset,'feataccdict', 'ttest', efeatset,model)
    elif model == 'Random Forest':
        # Relies on RFtrain_test_1's own default model argument.
        RFtrain_test_1(px_imp, yall, featset,'feataccdict', 'ttest',efeatset)

    return gvalt, gnamet

In [None]:
# T-test feature selection on all ethnicities, then a bar chart of the
# returned p-values per feature.
gvalt, gnamet = BestTtest_1(
    dfall, Xall, yall,
    'Ttest features with all ethnicities',
    Xall, yall, 'All ethnicities',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalt, y=gnamet)

In [None]:
# T-test feature selection on the White subset, then a bar chart of the
# returned p-values per feature.
gvalt, gnamet = BestTtest_1(
    dfwhite, Xwhite, ywhite,
    'Ttest features with white only',
    Xall, yall, 'White',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalt, y=gnamet)

In [None]:
# T-test feature selection on the Black subset, then a bar chart of the
# returned p-values per feature.
gvalt, gnamet = BestTtest_1(
    dfblack, Xblack, yblack,
    'Ttest features with black only',
    Xall, yall, 'Black',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalt, y=gnamet)

In [None]:
# T-test feature selection on the Asian subset, then a bar chart of the
# returned p-values per feature.
gvalt, gnamet = BestTtest_1(
    dfasian, Xasian, yasian,
    'Ttest features with asian only',
    Xall, yall, 'Asian',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalt, y=gnamet)

In [None]:
# T-test feature selection on the Hispanic subset, then a bar chart of the
# returned p-values per feature.
gvalt, gnamet = BestTtest_1(
    dfhispanic, Xhispanic, yhispanic,
    'Ttest features with hispanic only',
    Xall, yall, 'Hispanic',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
cbp = sns.barplot(x=gvalt, y=gnamet)

In [None]:
# T-test feature selection evaluated with Logistic Regression
# (all ethnicities, White, Black).
BestTtest_1(
    dfall, Xall, yall,
    'Ttest features with all ethnicities',
    Xall, yall, 'All ethnicities', 'Logistic Regression',
)
BestTtest_1(
    dfwhite, Xwhite, ywhite,
    'Ttest features with white only',
    Xall, yall, 'White', 'Logistic Regression',
)
BestTtest_1(
    dfblack, Xblack, yblack,
    'Ttest features with black only',
    Xall, yall, 'Black', 'Logistic Regression',
)

In [None]:
# T-test feature selection evaluated with Logistic Regression (Asian, Hispanic).
BestTtest_1(
    dfasian, Xasian, yasian,
    'Ttest features with asian only',
    Xall, yall, 'Asian', 'Logistic Regression',
)
BestTtest_1(
    dfhispanic, Xhispanic, yhispanic,
    'Ttest features with hispanic only',
    Xall, yall, 'Hispanic', 'Logistic Regression',
)

In [None]:
# T-test feature selection evaluated with SVM (all ethnicities, White, Black).
BestTtest_1(
    dfall, Xall, yall,
    'Ttest features with all ethnicities',
    Xall, yall, 'All ethnicities', 'SVM',
)
BestTtest_1(
    dfwhite, Xwhite, ywhite,
    'Ttest features with white only',
    Xall, yall, 'White', 'SVM',
)
BestTtest_1(
    dfblack, Xblack, yblack,
    'Ttest features with black only',
    Xall, yall, 'Black', 'SVM',
)

In [None]:
# T-test feature selection evaluated with SVM (Asian, Hispanic).
BestTtest_1(
    dfasian, Xasian, yasian,
    'Ttest features with asian only',
    Xall, yall, 'Asian', 'SVM',
)
BestTtest_1(
    dfhispanic, Xhispanic, yhispanic,
    'Ttest features with hispanic only',
    Xall, yall, 'Hispanic', 'SVM',
)

In [None]:
# Feature-test x feature matrix: one row per 'Feature Test', one column per
# feature, summing the 'Value' markers accumulated in dfalltandf (0 when absent).
table = dfalltandf.pivot_table(
    values='Value',
    index=['Feature Test'],
    columns=['Features'],
    aggfunc=np.sum,
    fill_value=0,
)
display(table)

In [None]:
# Turn the accuracy dictionary into a frame; wrapping each value in a Series
# lets entries of different lengths coexist (shorter ones are padded with NaN).
alleth = pd.DataFrame({k: pd.Series(v) for k, v in feataccdict.items()})
ttestdf = alleth[[
    "Ttest features all ethnicities accuracy",
    "Ttest features white only accuracy",
    "Ttest features black only accuracy",
    "Ttest features asian only accuracy",
    "Ttest features hispanic only accuracy",
]]
display(ttestdf)
display(alleth)

# The theme has to be configured BEFORE the figure is created; previously it
# was set after sns.boxplot and so only affected later figures.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
fig = sns.boxplot(data=ttestdf)  # NOTE: sns.boxplot returns an Axes; name kept for compatibility
plt.xlabel("All features")
plt.ylabel("Accuracy")
plt.title("Ttest Feature Accuracy")
# plt.show() takes no positional figure argument.
plt.show()

In [None]:

def BestGini_1(px,py,featset,xall,yall,efeatset,model = 'Random Forest', random_state=None):
    """Select features by Gini importance from a single decision tree.

    A ``DecisionTreeClassifier(criterion='gini')`` is fitted on ``px`` (without
    the "Ethnic Groupa" column) against ``py``. Features with a strictly
    positive importance are kept, ordered from most to least important. The
    selection is recorded in the global ``dfalltandf``, written to
    ``<featsetdirectory>/<featset>.csv`` and then passed (together with
    "Ethnic Groupa") to ``RFtrain_test_1``.

    Parameters
    ----------
    px : DataFrame
        Feature frame; must contain an "Ethnic Groupa" column.
    py :
        Target used to fit the tree.
    featset : str
        Label stored in the 'Feature Test' column and used as the CSV file name.
    xall : DataFrame
        Frame from which the selected columns plus "Ethnic Groupa" are taken.
    yall :
        Target forwarded to ``RFtrain_test_1``.
    efeatset :
        Forwarded unchanged to ``RFtrain_test_1``.
    model : str
        'Random Forest' (default), 'Logistic Regression' or 'SVM'. Any other
        value skips the training step, as before.
    random_state : int or None
        Forwarded to the decision tree. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for a reproducible selection.

    Returns
    -------
    (new_si, sfn_list) : (ndarray, list of str)
        Non-zero importances in descending order and the matching feature names.
    """
    from sklearn.tree import DecisionTreeClassifier
    global dfalltandf

    # Gini gain: a higher importance means the feature reduced impurity more.
    clf = DecisionTreeClassifier(criterion='gini', random_state=random_state)

    # The ethnicity label is dropped before fitting.
    px = px.drop(labels=["Ethnic Groupa"], axis=1)
    clf = clf.fit(px, py)

    # Order features from most to least important.
    feature_importances = clf.feature_importances_
    sorted_indices = feature_importances.argsort()[::-1]
    sorted_feature_names = px.columns[sorted_indices]
    sorted_importances = feature_importances[sorted_indices]

    # Keep only the features the tree actually used. The old code derived this
    # list twice, once by deleting zeros and once by testing > 0.
    used = sorted_importances > 0
    new_si = sorted_importances[used]
    sfn_list = sorted_feature_names[used].tolist()

    # Print the full ranking, including unused features.
    for feat_name, importance in zip(sorted_feature_names, sorted_importances):
        print(feat_name.ljust(30)+":"+str(importance))

    # Rows recording "this test selected this feature" for the shared table.
    dfginitemp = pd.DataFrame(sfn_list, columns=['Features'])
    dfginitemp['Feature Test'] = featset
    dfginitemp['Value'] = 1

    # Replace earlier rows for the same feature set so re-running does not
    # double-count.
    dfalltandf = dfalltandf[dfalltandf['Feature Test'] != featset]
    dfalltandf = pd.concat([dfalltandf, dfginitemp])

    # Persist the names and scores for the heatmap cells.
    dfginicsv = pd.DataFrame(sfn_list, columns = ['Feature Name'])
    dfginicsv.insert(1,'Gini score', new_si.tolist())
    dfginicsv.to_csv(featsetdirectory+'/'+featset+".csv", index=False)

    # Model input: the selected features plus the ethnicity column. A new list
    # is built so the returned sfn_list stays free of "Ethnic Groupa".
    giniimplist = sfn_list + ["Ethnic Groupa"]
    px_impg = xall[giniimplist]

    print(giniimplist)

    if model in ('Logistic Regression', 'SVM'):
        RFtrain_test_1(px_impg, yall, featset,'feataccdict', 'gini', efeatset,model)
    elif model == 'Random Forest':
        # Relies on RFtrain_test_1's own default model argument.
        RFtrain_test_1(px_impg, yall, featset,'feataccdict', 'gini',efeatset)

    return new_si, sfn_list
    

In [None]:
# Gini feature selection on all ethnicities, then a bar chart of the non-zero
# importances.
new_si, sfn_list = BestGini_1(
    Xall, yall,
    'Gini features with all ethnicities',
    Xall, yall, 'All ethnicities',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
bp = sns.barplot(x=new_si, y=sfn_list)

In [None]:
# Gini feature selection on the White subset, then a bar chart of the non-zero
# importances.
new_si, sfn_list = BestGini_1(
    Xwhite, ywhite,
    'Gini features with only white',
    Xall, yall, 'White',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
bp = sns.barplot(x=new_si, y=sfn_list)

In [None]:
# Gini feature selection on the Black subset, then a bar chart of the non-zero
# importances.
new_si, sfn_list = BestGini_1(
    Xblack, yblack,
    'Gini features with only black',
    Xall, yall, 'Black',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
bp = sns.barplot(x=new_si, y=sfn_list)

In [None]:
# Gini feature selection on the Asian subset, then a bar chart of the non-zero
# importances.
new_si, sfn_list = BestGini_1(
    Xasian, yasian,
    'Gini features with only asian',
    Xall, yall, 'Asian',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
bp = sns.barplot(x=new_si, y=sfn_list)

In [None]:
# Gini feature selection on the Hispanic subset, then a bar chart of the
# non-zero importances.
new_si, sfn_list = BestGini_1(
    Xhispanic, yhispanic,
    'Gini features with only hispanic',
    Xall, yall, 'Hispanic',
)
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
bp = sns.barplot(x=new_si, y=sfn_list)

In [None]:
# Turn the accuracy dictionary into a frame; wrapping each value in a Series
# lets entries of different lengths coexist (shorter ones are padded with NaN).
alleth = pd.DataFrame({k: pd.Series(v) for k, v in feataccdict.items()})
ginidf = alleth[[
    "Gini features all ethnicities accuracy",
    "Gini features white only accuracy",
    "Gini features black only accuracy",
    "Gini features asian only accuracy",
    "Gini features hispanic only accuracy",
]]
display(ginidf)
display(alleth)

# The theme has to be configured BEFORE the figure is created; previously it
# was set after sns.boxplot and so only affected later figures.
sns.set(rc={'figure.figsize': (100, 50)}, font_scale=4)
fig = sns.boxplot(data=ginidf)  # NOTE: sns.boxplot returns an Axes; name kept for compatibility
plt.xlabel("All features")
plt.ylabel("Accuracy")
plt.title("Gini Feature Accuracy")
# plt.show() takes no positional figure argument.
plt.show()

In [None]:
# Gini feature selection evaluated with Logistic Regression
# (all ethnicities, White, Black).
BestGini_1(
    Xall, yall,
    'Gini features with all ethnicities',
    Xall, yall, 'All ethnicities', 'Logistic Regression',
)
BestGini_1(
    Xwhite, ywhite,
    'Gini features with only white',
    Xall, yall, 'White', 'Logistic Regression',
)
BestGini_1(
    Xblack, yblack,
    'Gini features with only black',
    Xall, yall, 'Black', 'Logistic Regression',
)

In [None]:
# Gini feature selection evaluated with Logistic Regression (Asian, Hispanic).
BestGini_1(
    Xasian, yasian,
    'Gini features with only asian',
    Xall, yall, 'Asian', 'Logistic Regression',
)
BestGini_1(
    Xhispanic, yhispanic,
    'Gini features with only hispanic',
    Xall, yall, 'Hispanic', 'Logistic Regression',
)

In [None]:
# Gini feature selection evaluated with SVM (all ethnicities, White, Black).
BestGini_1(
    Xall, yall,
    'Gini features with all ethnicities',
    Xall, yall, 'All ethnicities', 'SVM',
)
BestGini_1(
    Xwhite, ywhite,
    'Gini features with only white',
    Xall, yall, 'White', 'SVM',
)
BestGini_1(
    Xblack, yblack,
    'Gini features with only black',
    Xall, yall, 'Black', 'SVM',
)

In [None]:
# Gini feature selection evaluated with SVM (Asian, Hispanic).
BestGini_1(
    Xasian, yasian,
    'Gini features with only asian',
    Xall, yall, 'Asian', 'SVM',
)
BestGini_1(
    Xhispanic, yhispanic,
    'Gini features with only hispanic',
    Xall, yall, 'Hispanic', 'SVM',
)

In [None]:
# Final feature-test x feature matrix across every selection method run so
# far; displayed and also exported to CSV.
table = dfalltandf.pivot_table(
    values='Value',
    index=['Feature Test'],
    columns=['Features'],
    aggfunc=np.sum,
    fill_value=0,
)
display(table)
table.to_csv('Allfeatures_AllTests.csv')


In [None]:
# Colormap built from seaborn's "light:b" palette spec; passed as `cmap` to
# the feature-importance heatmaps in the cells below.
my_palette = sns.color_palette("light:b", as_cmap=True)

In [None]:
# Combine the top F-test features of every ethnicity subset into one heatmap.

filelist = ['Feature_Sets/Ftest features with all ethnicities.csv', 'Feature_Sets/Ftest features with only asian.csv', 
            'Feature_Sets/Ftest features with only black.csv', 'Feature_Sets/Ftest features with only white.csv', 
            'Feature_Sets/Ftest features with only hispanic.csv']
ethlist = ['All Ethnicities', 'Asian', 'Black', 'White', 'Hispanic']
topnum = 50

# Collect the per-ethnicity top-N frames and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each pass.
top_frames = []
for file, ethnicity in zip(filelist, ethlist):
    dfread = pd.read_csv(file)
    dfread['Ethnicity'] = ethnicity
    sorted_df = dfread.sort_values('F-score', ascending=False).head(topnum)
    top_frames.append(sorted_df)
combined_df = pd.concat(top_frames, ignore_index=True)

display(combined_df)

# Features x ethnicities matrix; a feature missing from a subset's top-N gets 0.
pivotftest = pd.pivot_table(combined_df, values='F-score', index=['Feature Name'],
                    columns=['Ethnicity'], aggfunc=np.sum, fill_value=0)
display(pivotftest)

plt.figure(figsize=(15, 15))
# NOTE(review): fmt / annot_kws only matter when annot=True is passed -- confirm
# whether cell annotations were intended.
ax = sns.heatmap(pivotftest,fmt='.0f', annot_kws={'fontsize': 8}, cmap = my_palette)
ax.set_yticklabels(labels=ax.get_yticklabels(), va='center')
plt.title('F-Test Feature Importance', fontsize=15)
plt.xlabel('Ethnicity', fontsize=15)
plt.ylabel('Feature Name', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=10)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=15)
# Number of distinct features that made any subset's top-N.
unique_values = combined_df['Feature Name'].unique()
print(len(unique_values))


In [None]:
# Combine the top correlated point-biserial features of every ethnicity subset
# into one heatmap.
filelist = ['Feature_Sets/Correlated PBtest features with all ethnicities.csv', 'Feature_Sets/Correlated PBtest features with asian only.csv', 
            'Feature_Sets/Correlated PBtest features with black only.csv', 'Feature_Sets/Correlated PBtest features with white only.csv', 
            'Feature_Sets/Correlated PBtest features with hispanic only.csv']
ethlist = ['All Ethnicities', 'Asian', 'Black', 'White', 'Hispanic']
topnum = 50

# Collect the per-ethnicity top-N frames and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each pass.
top_frames = []
for file, ethnicity in zip(filelist, ethlist):
    dfread = pd.read_csv(file)
    dfread['Ethnicity'] = ethnicity
    sorted_df = dfread.sort_values('Correlation Coefficient', ascending=False).head(topnum)
    top_frames.append(sorted_df)
combined_df = pd.concat(top_frames, ignore_index=True)

display(combined_df)

# Features x ethnicities matrix; a feature missing from a subset's top-N gets 0.
pivotpbcorr = pd.pivot_table(combined_df, values='Correlation Coefficient', index=['Feature Name'],
                    columns=['Ethnicity'], aggfunc=np.sum, fill_value=0)
display(pivotpbcorr)

plt.figure(figsize=(15, 15))
# NOTE(review): fmt / annot_kws only matter when annot=True is passed -- confirm
# whether cell annotations were intended.
ax = sns.heatmap(pivotpbcorr, fmt='.2f', annot_kws={'fontsize': 8}, cmap = my_palette)
# (The original repeated this call twice; once is sufficient.)
ax.set_yticklabels(labels=ax.get_yticklabels(), va='center')
plt.title('Correlated Point Biserial Feature Importance', fontsize=15)
plt.xlabel('Ethnicity', fontsize=15)
plt.ylabel('Feature Name', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=10)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=15)

In [None]:
# Combine the most significant point-biserial features of every ethnicity
# subset into one heatmap of -log2(p).
filelist = ['Feature_Sets/Significant PBtest features with all ethnicities.csv', 'Feature_Sets/Significant PBtest features with asian only.csv', 
            'Feature_Sets/Significant PBtest features with black only.csv', 'Feature_Sets/Significant PBtest features with white only.csv', 
            'Feature_Sets/Significant PBtest features with hispanic only.csv']
ethlist = ['All Ethnicities', 'Asian', 'Black', 'White', 'Hispanic']
topnum = 50

# Collect the per-ethnicity top-N frames and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each pass.
top_frames = []
for file, ethnicity in zip(filelist, ethlist):
    dfread = pd.read_csv(file)
    dfread['Ethnicity'] = ethnicity
    # Smallest p-values first.
    sorted_df = dfread.sort_values('P-value', ascending=True).head(topnum)
    top_frames.append(sorted_df)
combined_df = pd.concat(top_frames, ignore_index=True)

display(combined_df)

pivotpbsig = pd.pivot_table(combined_df, values='P-value', index=['Feature Name'],
                    columns=['Ethnicity'], aggfunc=np.sum, fill_value=0)
# -log2(p) so that smaller p-values plot as larger values. Cells filled with 0
# (feature absent from that subset's top-N) stay 0 instead of reaching log2.
# NOTE(review): a p-value stored as exactly 0 is also mapped to 0 here.
pivotpbsig = pivotpbsig.applymap(lambda x: -np.log2(x) if x > 0 else 0)
display(pivotpbsig)

plt.figure(figsize=(15, 15))
# NOTE(review): fmt / annot_kws only matter when annot=True is passed -- confirm
# whether cell annotations were intended.
ax = sns.heatmap(pivotpbsig, fmt='.2f', annot_kws={'fontsize': 8}, cmap = my_palette)
ax.set_yticklabels(labels=ax.get_yticklabels(), va='center')
plt.title('Significant Point Biserial Feature Importance', fontsize=15)
plt.xlabel('Ethnicity', fontsize=15)
plt.ylabel('Feature Name', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=10)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=15)

In [None]:
# Combine the most significant t-test features of every ethnicity subset into
# one heatmap of -log2(p).
filelist = ['Feature_Sets/Ttest features with all ethnicities.csv', 'Feature_Sets/Ttest features with asian only.csv', 
            'Feature_Sets/Ttest features with black only.csv', 'Feature_Sets/Ttest features with white only.csv', 
            'Feature_Sets/Ttest features with hispanic only.csv']
ethlist = ['All Ethnicities', 'Asian', 'Black', 'White', 'Hispanic']
topnum = 20

# Collect the per-ethnicity top-N frames and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each pass.
top_frames = []
for file, ethnicity in zip(filelist, ethlist):
    dfread = pd.read_csv(file)
    dfread['Ethnicity'] = ethnicity
    # Smallest p-values first.
    sorted_df = dfread.sort_values('P-value', ascending=True).head(topnum)
    top_frames.append(sorted_df)
combined_df = pd.concat(top_frames, ignore_index=True)

display(combined_df)

pivotttest = pd.pivot_table(combined_df, values='P-value', index=['Feature Name'],
                    columns=['Ethnicity'], aggfunc=np.sum, fill_value=0)
# -log2(p) so that smaller p-values plot as larger values. Cells filled with 0
# (feature absent from that subset's top-N) stay 0 instead of reaching log2.
# NOTE(review): a p-value stored as exactly 0 is also mapped to 0 here.
pivotttest = pivotttest.applymap(lambda x: -np.log2(x) if x > 0 else 0)
display(pivotttest)

plt.figure(figsize=(15, 15))
# NOTE(review): fmt / annot_kws only matter when annot=True is passed -- confirm
# whether cell annotations were intended.
ax = sns.heatmap(pivotttest, fmt='.2f', annot_kws={'fontsize': 8}, cmap = my_palette)
ax.set_yticklabels(labels=ax.get_yticklabels(), va='center')
plt.title('T-Test Feature Importance', fontsize=15)
plt.xlabel('Ethnicity', fontsize=15)
plt.ylabel('Feature Name', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=10)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=15)

In [None]:
# Combine the top Gini-importance features of every ethnicity subset into one
# heatmap.
filelist = ['Feature_Sets/Gini features with all ethnicities.csv', 'Feature_Sets/Gini features with only asian.csv', 
            'Feature_Sets/Gini features with only black.csv', 'Feature_Sets/Gini features with only white.csv', 
            'Feature_Sets/Gini features with only hispanic.csv']
ethlist = ['All Ethnicities', 'Asian', 'Black', 'White', 'Hispanic']
topnum = 50

# Collect the per-ethnicity top-N frames and concatenate once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each pass.
top_frames = []
for file, ethnicity in zip(filelist, ethlist):
    dfread = pd.read_csv(file)
    dfread['Ethnicity'] = ethnicity
    sorted_df = dfread.sort_values('Gini score', ascending=False).head(topnum)
    top_frames.append(sorted_df)
combined_df = pd.concat(top_frames, ignore_index=True)

display(combined_df)

# Features x ethnicities matrix; a feature missing from a subset's top-N gets 0.
pivotgini = pd.pivot_table(combined_df, values='Gini score', index=['Feature Name'],
                    columns=['Ethnicity'], aggfunc=np.sum, fill_value=0)
display(pivotgini)

plt.figure(figsize=(15, 15))
# NOTE(review): fmt / annot_kws only matter when annot=True is passed -- confirm
# whether cell annotations were intended.
ax = sns.heatmap(pivotgini, fmt='.2f', annot_kws={'fontsize': 8}, cmap = my_palette)
ax.set_yticklabels(labels=ax.get_yticklabels(), va='center')
plt.title('Gini Impurity Feature Importance', fontsize=15)
plt.xlabel('Ethnicity', fontsize=15)
plt.ylabel('Feature Name', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=10)
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=15)

In [None]:
def plot_PR_curve(outdf, save_to_filename, testname):
    """Plot one precision-recall curve per 'Race' group and save the figure.

    Parameters
    ----------
    outdf : DataFrame
        Must contain the columns 'Race' (grouping key), 'y_true' (labels) and
        'probs' (scores passed to the scikit-learn PR functions).
    save_to_filename : str
        Path handed to ``plt.savefig``.
    testname : str
        Used as the axes title.

    Side effects: resets the global matplotlib style to 'default', writes the
    figure to ``save_to_filename`` and shows it.
    """
    # Reset the style BEFORE creating the figure so every PR figure is drawn
    # with the same settings; previously the reset ran after plotting, so it
    # only took effect from the next call onwards.
    plt.style.use('default')
    fig, ax = plt.subplots()

    colors = ['blue' ,'green', 'orange', 'red','yellow']
    # Common recall grid on which each group's precision is interpolated.
    mean_recall = np.linspace(0,1,100)

    for i, (name, grp) in enumerate(outdf.groupby('Race')):
        y_true = grp['y_true']
        y_proba = grp['probs']

        ap_score = average_precision_score(y_true, y_proba)

        precision, recall, _ = precision_recall_curve(y_true, y_proba)
        # np.interp(x, xp, fp) needs increasing x-coordinates. Recall comes
        # back in decreasing order, so both arrays are reversed. The old code
        # passed precision as xp and recall as fp, so it did not plot
        # precision as a function of recall.
        interp_precision = np.interp(mean_recall, recall[::-1], precision[::-1])

        label = '%s AP=%.4f' % (str(name), ap_score)

        ax.plot(
                mean_recall,
                interp_precision,
                label=label,
                lw=1,
                # Cycle the palette so more than five groups do not raise IndexError.
                color=colors[i % len(colors)])

    ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel='Recall',
    ylabel='Precision',
    title=testname)

    ax.legend(loc="lower right", fontsize=10)
    plt.savefig(save_to_filename)
    plt.show()

In [None]:
# Precision-recall curves: one figure per feature-selection method, each read
# from that method's probabilities CSV.
filelist = [proballfeat, probftest, probttest, probsigpb, probcorrpb, probgini]
testname = ['All Features Precision-recall curve', 'F-Test Precision-recall curve',
            'T-Test Precision-recall curve', 'Significant Point Biserial Precision-recall curve', 
            'Correlated Point Biserial Precision-recall curve', 'Gini Impurity Precision-recall curve']

for file, title in zip(filelist, testname):
    dfread = pd.read_csv(file)
    # Build the PNG name from the file's base name with only the extension
    # swapped. str.replace("csv", "png") rewrote every "csv" substring and
    # carried any directory part of `file` into the output name.
    stem = os.path.splitext(os.path.basename(file))[0]
    newfile = stem + ".png"
    plot_PR_curve(dfread, prcurvedirectory+'/PRcurve_'+newfile, title)