In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from math import log2, log
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, fbeta_score

In [2]:
og_df = pd.read_csv("Standardized_Cleaned_Data.csv")
og_df.drop(columns=["Unnamed: 0"], inplace=True)

In [77]:
def listToString(s):
    '''
    s: any list
    
    output: str built from the elements of the given list
    '''
    str1 = ""
    for ele in s:
        str1 += (str(ele) + " ")
    return str1[:-1]

def gradient_descent_logistic_with_penalty(X, y, learning_rate_val, threshold, L):
    '''
    X: np.array of our training data
    y: np.array of our testing data
    learning_rate_val: learning rate
    threshold: highest value our gradient can be before we take the betas as they are
    L: penalty
    
    returns np.matrix of our betas
    '''
    betas = np.matrix(np.zeros(X.shape[1]))
    learning_rate = np.array([-1 * learning_rate_val] * betas.shape[1])
    stop = False
    iterations = 0
    while not stop:
        y_hat = np.array(X * betas.transpose()).flatten()
        exp = np.exp(y_hat)
        pi = (exp / (1 + exp))
        errors = np.matrix(y-pi)
        gradient = np.array((errors * X) / X.shape[0])[0]
        beta_vals = np.array(betas)[0]
        for x in range(len(gradient)):
            if x < 0:
                gradient[x] -= beta_vals[x] * L
            elif x > 0:
                gradient[x] += beta_vals[x] * L
        gradient_total = np.sum(abs(gradient))
        if (gradient_total < threshold) or (iterations > 6000):
            stop = True
        betas = -1*(gradient * learning_rate) + betas
        iterations += 1
#         if iterations % 100 == 0:
#             print(gradient_total)
    return betas

def new_betas_to_preds(X, new_betas, threshold):
    '''
    X: data minus the target variable
    new_betas: betas found in gradient_descent_logistic_with_penalty
    threshold: what probability value we want to use to differentiate between ones and zeroes in our predictions
    
    output: predictions and probabilities
    '''
    log_odds = np.matrix(X) * new_betas.transpose()
    exp = np.exp(log_odds)
    pi = (exp / (1 + exp))
    preds = pd.Series([1 if x > threshold else 0 for x in pi])
    return preds, pi

def my_split(X, y, test_size, split):
    '''
    X: data minus the target variable
    y: the target variable
    test_size: the proportion of data we want in the test set
    split: "random" or "stratify" for the type of split we want
    
    output: X_train, y_train, X_validation, y_validation, X_test, y_test for use in model building and testing
    '''
    if split == "random":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state = 42)
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = .25, random_state = 42)
        return X_train, y_train, X_validation, y_validation, X_test, y_test
    if split == "stratify":
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state = 42)
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, stratify=y_train, test_size=test_size, random_state = 42)
        return X_train, y_train, X_validation, y_validation, X_test, y_test

def get_metrics(y_test, preds, pi):
    '''
    y_test: actual target var that we're comparing our predictions to
    preds: predictions of the target var
    pi: probabilities to be used in ROC-AUC calculation
    
    output: several metrics describing the validity of our model
    '''
    acc = accuracy_score(y_test.to_list(), preds)
    prec = precision_score(y_test.to_list(), preds)
    rec = recall_score(y_test.to_list(), preds)
    f1 = f1_score(y_test.to_list(), preds)
    f2 = fbeta_score(y_test.to_list(), preds, beta = 2)
    roc = roc_auc_score(y_test.to_list(), pi)
    our_metric = acc * prec * rec
    return {"ROC-AUC":roc, "F1-Score":f1, "F2-Score":f2, "Precision":prec, "Recall":rec, "Accuracy":acc, "Our Metric":our_metric}

def my_downsample(df, zeroes_to_ones):
    '''
    df: data with target
    zeroes_to_ones: how many zeroes do we want for every one in the data
    
    output: original data but only with all of the 1s and randomly sampled zeroes
    '''
    just_ones_df = df[df["TARGET"] == 1]
    just_zeroes_df =df[df["TARGET"] == 0].sample(n=len(just_ones_df) * zeroes_to_ones, random_state=42)
    complete_df = pd.concat([just_ones_df, just_zeroes_df])
    return complete_df

def extreme_zeroes(df, zeroes_to_ones):
    '''
    df: data with target
    zeroes_to_ones: how many zeroes do we want for every one in the data
    
    output: original data but only with all of the 1s and only the most extreme zeroes
    '''
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
    
    #Train a basic logistic regression model with lr=.1, cutoff=.3, penalty=0
    new_betas = gradient_descent_logistic_with_penalty(np.array(X), y, .1, .03, 0)
    preds, pi = new_betas_to_preds(X, new_betas, .5)
    #Collect probabilities
    X["probs"] = pi
    
    #Downsample to only include the most extreme n (based on ds_comp) zeroes and all the ones
    just_ones_df = df[df["TARGET"] == 1]
    ez_indices = X["probs"].sort_values().index[:len(just_ones_df) * zeroes_to_ones].to_list()
    just_zeroes_df = df[df.index.isin(ez_indices)]
    complete_df = pd.concat([just_ones_df, just_zeroes_df])
    return complete_df

def build_model(downsample, zeroes_to_ones, split, test_size, PCA_, PCA_var, threshold, df):
    '''
    downsample: "random", "ez", or None to determine how we downsample the data
    zeroes_to_ones: number of zeroes we want for every one in the target variable
    split: "random" or "stratified" to describe how we split our train and test sets
    test_size: what portion of the data we want in our test set
    PCA_: 1 if we want to apply pca and 0 otherwise
    PCA_var: what percentage of the variance do we want explained in our PCA
    threshold: what probability threshold determines a 1 from a 0 in our predicted values
    df: original data
    
    output: metrics of success for our built model
    '''
    
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
    
    if PCA_:
        #initialize PCA
        pca = PCA(n_components=PCA_var)
        #new X with PCA
        X = pd.DataFrame(pca.fit_transform(np.array(X)))
        #new df with PCA X and initial target
        df = pd.concat([X, y], axis = 1)
        
    ult_X_train, ult_y_train, ult_X_validation, ult_y_validation, ult_X_test, ult_y_test = my_split(X, y, test_size, split)
    
    if downsample == "ez":
        df = extreme_zeroes(df, zeroes_to_ones)
    elif downsample == "random":
        df = my_downsample(df, zeroes_to_ones)
        
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
        
    X_train, y_train, X_validation, y_validation, X_test, y_test = my_split(X, y, test_size, split)
    
    X_train_matrix, y_train_matrix = [np.matrix(X_train), np.matrix(y_train)]
    new_betas = gradient_descent_logistic_with_penalty(X_train_matrix, y_train, .1, .03, .001)
    preds, pi = new_betas_to_preds(ult_X_validation, new_betas, threshold)
    
    return get_metrics(ult_y_validation, preds, pi)

In [51]:
downsample = ["None", "random", "extreme zeroes"]
ds_comp = [1, 2, 3]
split = ["stratify", "random"]
test_size = [.2, .1]
PCA_ = [0, 1]
PCA_var = [.8, .9, .95]
threshold = [.33, .4, .5, .66]

#Order: downsample, ds_comp, split, ratio, threshold

combinations = []
for a in downsample:
    for b in ds_comp:
        for c in split:
            for d in test_size:
                for e in PCA_:
                    for f in PCA_var:
                        for g in threshold:
                            combinations.append([a, b, c, d, e, f, g])
                            
new_combinations = []
for comb in combinations:
    new_comb = comb
    if not comb[4]:
        new_comb[5] = "x"
    if comb[0] == "None":
        ds_comp = "x"
    new_combinations.append(new_comb)
    
combinations = []
[combinations.append(x) for x in new_combinations if x not in combinations]
len(combinations)

576

In [86]:
from tqdm import tqdm

results = {}
for c in tqdm(combinations):
    metrics = build_model(c[0], c[1], c[2], c[3], c[4], c[5], c[6], og_df)
    results[listToString(c)] = metrics
    
logres_models = pd.DataFrame(results).T
logres_models = logres_models[logres_models["ROC-AUC"] != "x"]
logres_models.sort_values(by=["ROC-AUC", "F1-Score"], ascending = False).to_csv("final_logres_results.csv")

100%|██████████████████████████████████████████████████████████████████████████████| 576/576 [2:27:20<00:00, 15.35s/it]
  result = method(y)


In [87]:
logres_models.sort_values(by=["ROC-AUC", "F1-Score"], ascending = False)

Unnamed: 0,ROC-AUC,F1-Score,F2-Score,Precision,Recall,Accuracy,Our Metric
random 1 stratify 0.1 0 x 0.66,0.745950,0.284906,0.346808,0.219583,0.405551,0.835670,0.074418
random 1 stratify 0.1 0 x 0.5,0.745950,0.255564,0.407689,0.157571,0.675918,0.682143,0.072652
random 1 stratify 0.1 0 x 0.4,0.745950,0.227750,0.403342,0.131985,0.829902,0.545707,0.059774
random 1 stratify 0.1 0 x 0.33,0.745950,0.202252,0.377796,0.113982,0.896598,0.429072,0.043849
random 2 stratify 0.1 0 x 0.5,0.745524,0.288704,0.344411,0.227401,0.395255,0.842788,0.075751
...,...,...,...,...,...,...,...
extreme zeroes 3 random 0.2 1 0.8 0.66,0.711492,0.020448,0.013096,0.317365,0.010564,0.917434,0.003076
random 3 random 0.2 1 0.8 0.66,0.711143,0.234861,0.227375,0.248498,0.222643,0.881662,0.048779
random 3 random 0.2 1 0.8 0.5,0.711143,0.219181,0.381241,0.128291,0.751844,0.563022,0.054306
random 3 random 0.2 1 0.8 0.4,0.711143,0.170944,0.336466,0.093931,0.949173,0.248968,0.022197


## Error Analysis for Best Model

In [90]:
downsample = "random"
zeroes_to_ones = 1
split = "stratify"
test_size = .1
threshold = .66
df = og_df.copy()

X = df.drop(columns=["TARGET"])
y = df["TARGET"]

ult_X_train, ult_y_train, ult_X_validation, ult_y_validation, ult_X_test, ult_y_test = my_split(X, y, test_size, split)

if downsample == "ez":
    df = extreme_zeroes(df, zeroes_to_ones)
elif downsample == "random":
    df = my_downsample(df, zeroes_to_ones)

X = df.drop(columns=["TARGET"])
y = df["TARGET"]

X_train, y_train, X_validation, y_validation, X_test, y_test = my_split(X, y, test_size, split)

X_train_matrix, y_train_matrix = [np.matrix(X_train), np.matrix(y_train)]
new_betas = gradient_descent_logistic_with_penalty(X_train_matrix, y_train, .1, .03, .001)
preds, pi = new_betas_to_preds(ult_X_validation, new_betas, threshold)

In [127]:
preds.index = ult_y_validation.index
pred_vs_true = pd.concat([pd.DataFrame({"preds":preds}), 
           pd.DataFrame({"actual":ult_y_validation})], axis=1)

errors = pred_vs_true[pred_vs_true["preds"] != pred_vs_true["actual"]]
false_positives = pred_vs_true[pred_vs_true["preds"] > pred_vs_true["actual"]].index.to_list()
false_negatives = pred_vs_true[pred_vs_true["preds"] < pred_vs_true["actual"]].index.to_list()

print("false positives:", len(false_positives))
print("false negatives:", len(false_negatives))

fp_df = og_df[og_df.index.isin(false_positives)]
fn_df = og_df[og_df.index.isin(false_negatives)]

false positives: 3220
false negatives: 1328


In [136]:
fp_df.describe().T.sort_values(by="std", ascending=False).tail(10)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
FLAG_MOBIL,3220.0,0.001803,2.168741e-19,0.001803,0.001803,0.001803,0.001803,0.001803
CODE_GENDER_XNA,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_12,3220.0,-0.00255,0.0,-0.00255,-0.00255,-0.00255,-0.00255,-0.00255
NAME_INCOME_TYPE_Student,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NAME_INCOME_TYPE_Unemployed,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NAME_INCOME_TYPE_Businessman,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FLAG_DOCUMENT_7,3220.0,-0.013853,0.0,-0.013853,-0.013853,-0.013853,-0.013853,-0.013853
NAME_FAMILY_STATUS_Unknown,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NAME_INCOME_TYPE_Maternity leave,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TARGET,3220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
fp_df.describe() - fn_df.describe()

Unnamed: 0,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan
count,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,...,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0,1892.0
mean,-1.0,0.014069,-0.015524,-0.175163,-0.081956,-0.181812,-0.138584,0.504259,-0.26441,0.195026,...,-0.004067,-0.003408,-0.003097,-0.04663,-0.039445,0.009521,0.090538,-0.096669,0.00476,0.091909
std,0.0,0.010167,-0.045681,-0.077565,0.004325,-0.093677,-0.161133,-0.056266,-0.345417,-0.122497,...,-0.024338,-0.029305,-0.025856,-0.041153,-0.029712,0.031731,-0.012146,-0.009101,0.02281,-0.006665
min,-1.0,0.0,-0.018978,-0.011807,-0.117982,0.0,-0.029282,0.033456,0.007807,0.514068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-1.0,0.0,0.0,-0.080354,-0.055886,-0.097443,-0.029065,0.649121,0.001304,0.386048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-1.0,0.0,0.018978,-0.178864,-0.097956,0.0,-0.012074,0.599223,-0.000237,0.220842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0
75%,-1.0,0.0,0.0,-0.287783,-0.110764,-0.085263,-0.128839,0.479207,-0.001348,0.047759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,-1.0,-5.539244,3.795502,-0.048881,2.627587,-0.694283,0.0,0.004354,0.0,0.001703,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
