In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from math import log2, log
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, fbeta_score

In [3]:
og_df = pd.read_csv("Standardized_Cleaned_Data.csv")
og_df.drop(columns=["Unnamed: 0"], inplace=True)

In [9]:
def listToString(s):
    '''
    s: any list
    
    output: str built from the elements of the given list
    '''
    str1 = ""
    for ele in s:
        str1 += (str(ele) + " ")
    return str1[:-1]

def gradient_descent_logistic_with_penalty(X, y, learning_rate_val, threshold, L):
    '''
    X: np.array of our training data
    y: np.array of our testing data
    learning_rate_val: learning rate
    threshold: highest value our gradient can be before we take the betas as they are
    L: penalty
    
    returns np.matrix of our betas
    '''
    betas = np.matrix(np.zeros(X.shape[1]))
    learning_rate = np.array([-1 * learning_rate_val] * betas.shape[1])
    stop = False
    iterations = 0
    while not stop:
        y_hat = np.array(X * betas.transpose()).flatten()
        exp = np.exp(y_hat)
        pi = (exp / (1 + exp))
        errors = np.matrix(y-pi)
        gradient = np.array((errors * X) / X.shape[0])[0]
        beta_vals = np.array(betas)[0]
        for x in range(len(gradient)):
            if x < 0:
                gradient[x] -= beta_vals[x] * L
            elif x > 0:
                gradient[x] += beta_vals[x] * L
        gradient_total = np.sum(abs(gradient))
        if (gradient_total < threshold) or (iterations > 6000):
            stop = True
        betas = -1*(gradient * learning_rate) + betas
        iterations += 1
#         if iterations % 100 == 0:
#             print(gradient_total)
    return betas

def lda_new(x, y):
    '''
    x: data minus the target variable
    y: target variable
    
    output: w and c
    '''
    x["Target"] = y
    ones = x[x["Target"] == 1].drop(columns=["Target"])
    zeros = x[x["Target"] == 0].drop(columns=["Target"])

    sigma_ones = np.cov(np.matrix(ones).transpose()) * (ones.shape[0] - 1)
    sigma_zeros = np.cov(np.matrix(zeros).transpose()) * (zeros.shape[0] - 1)
    pooled_cov = (sigma_ones + sigma_zeros) / (ones.shape[0] + zeros.shape[0] - 2)
    pooled_cov_inv = np.matrix(np.linalg.inv(pooled_cov))

    mean1 = np.matrix(np.mean(ones, axis = 0))
    mean2 = np.matrix(np.mean(zeros, axis = 0))
    mean_diff = (mean1 - mean2).transpose()
    mean_sum = (mean1 + mean2).transpose()
    #print(mean_diff)

    w = pooled_cov_inv * mean_diff
    p1 = len(ones)/len(x)
    p0 = len(zeros)/len(x)
    log_prob = log(p1/p0)

    c = ((.5 * mean_sum.transpose()) * w)
    #c = log_prob - ((.5 * mean_sum.transpose()) * w)

    return w, c

def interpret_lda(w, c, X_validation):
    '''
    w: from lda_new
    c: from lda_new
    X_validation: test set of data
    
    output: predictions and probabilities
    '''
    nums = np.array((np.matrix(X_validation) * w)).flatten()
    predictions = []
    pi = []
    for x in nums:
        pi.append(x)
        if x > -1 * c[0]:
            predictions.append(1)
        else:
            predictions.append(0)
    abs_pi = (pi - min(pi)) / ( max(pi) - min(pi) )
    return predictions, abs_pi

def new_betas_to_preds(X, new_betas, threshold):
    '''
    X: data minus the target variable
    new_betas: betas found in gradient_descent_logistic_with_penalty
    threshold: what probability value we want to use to differentiate between ones and zeroes in our predictions
    
    output: predictions and probabilities
    '''
    log_odds = np.matrix(X) * new_betas.transpose()
    exp = np.exp(log_odds)
    pi = (exp / (1 + exp))
    preds = pd.Series([1 if x > threshold else 0 for x in pi])
    return preds, pi

def my_split(X, y, test_size, split):
    '''
    X: data minus the target variable
    y: the target variable
    test_size: the proportion of data we want in the test set
    split: "random" or "stratify" for the type of split we want
    
    output: X_train, y_train, X_validation, y_validation, X_test, y_test for use in model building and testing
    '''
    if split == "random":
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state = 42)
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = .25, random_state = 42)
        return X_train, y_train, X_validation, y_validation, X_test, y_test
    if split == "stratify":
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state = 42)
        X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, stratify=y_train, test_size=test_size, random_state = 42)
        return X_train, y_train, X_validation, y_validation, X_test, y_test

def get_metrics(y_test, preds, pi):
    '''
    y_test: actual target var that we're comparing our predictions to
    preds: predictions of the target var
    pi: probabilities to be used in ROC-AUC calculation
    
    output: several metrics describing the validity of our model
    '''
    acc = accuracy_score(y_test.to_list(), preds)
    prec = precision_score(y_test.to_list(), preds)
    rec = recall_score(y_test.to_list(), preds)
    f1 = f1_score(y_test.to_list(), preds)
    f2 = fbeta_score(y_test.to_list(), preds, beta = 2)
    roc = roc_auc_score(y_test.to_list(), pi)
    our_metric = acc * prec * rec
    return {"ROC-AUC":roc, "F1-Score":f1, "F2-Score":f2, "Precision":prec, "Recall":rec, "Accuracy":acc, "Our Metric":our_metric}

def my_downsample(df, zeroes_to_ones):
    '''
    df: data with target
    zeroes_to_ones: how many zeroes do we want for every one in the data
    
    output: original data but only with all of the 1s and randomly sampled zeroes
    '''
    just_ones_df = df[df["TARGET"] == 1]
    just_zeroes_df =df[df["TARGET"] == 0].sample(n=len(just_ones_df) * zeroes_to_ones, random_state=42)
    complete_df = pd.concat([just_ones_df, just_zeroes_df])
    return complete_df

def extreme_zeroes(df, zeroes_to_ones):
    '''
    df: data with target
    zeroes_to_ones: how many zeroes do we want for every one in the data
    
    output: original data but only with all of the 1s and only the most extreme zeroes
    '''
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
    
    #Train a basic logistic regression model with lr=.1, cutoff=.3, penalty=0
    new_betas = gradient_descent_logistic_with_penalty(np.array(X), y, .1, .03, 0)
    preds, pi = new_betas_to_preds(X, new_betas, .5)
    #Collect probabilities
    X["probs"] = pi
    
    #Downsample to only include the most extreme n (based on ds_comp) zeroes and all the ones
    just_ones_df = df[df["TARGET"] == 1]
    ez_indices = X["probs"].sort_values().index[:len(just_ones_df) * zeroes_to_ones].to_list()
    just_zeroes_df = df[df.index.isin(ez_indices)]
    complete_df = pd.concat([just_ones_df, just_zeroes_df])
    return complete_df

def build_model(downsample, zeroes_to_ones, split, test_size, PCA_, PCA_var, df):
    '''
    downsample: "random", "ez", or None to determine how we downsample the data
    zeroes_to_ones: number of zeroes we want for every one in the target variable
    split: "random" or "stratified" to describe how we split our train and test sets
    test_size: what portion of the data we want in our test set
    PCA_: 1 if we want to apply pca and 0 otherwise
    PCA_var: what percentage of the variance do we want explained in our PCA
    df: original data
    
    output: metrics of success for our built model
    '''
    
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
    
    if PCA_:
        #initialize PCA
        pca = PCA(n_components=PCA_var)
        #new X with PCA
        X = pd.DataFrame(pca.fit_transform(np.array(X)))
        #new df with PCA X and initial target
        df = pd.concat([X, y], axis = 1)
        
    ult_X_train, ult_y_train, ult_X_validation, ult_y_validation, ult_X_test, ult_y_test = my_split(X, y, test_size, split)
    
    if downsample == "ez":
        df = extreme_zeroes(df, zeroes_to_ones)
    elif downsample == "random":
        df = my_downsample(df, zeroes_to_ones)
        
    X = df.drop(columns=["TARGET"])
    y = df["TARGET"]
        
    X_train, y_train, X_validation, y_validation, X_test, y_test = my_split(X, y, test_size, split)
    
    try:
        w, c = lda_new(X_train, y_train)
        preds, pi = interpret_lda(w, c, ult_X_validation)
    except np.linalg.LinAlgError as err:
        return "Linear Algebra Error"
    
    return get_metrics(ult_y_validation, preds, pi)

In [10]:
build_model('None', 1, 'stratify', .2, 0, 'x', og_df)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'ROC-AUC': 0.5353817480711208,
 'F1-Score': 0.0,
 'F2-Score': 0.0,
 'Precision': 0.0,
 'Recall': 0.0,
 'Accuracy': 0.919271574326247,
 'Our Metric': 0.0}

In [11]:
downsample = ["None", "random", "extreme zeroes"]
ds_comp = [1, 2, 3]
split = ["stratify", "random"]
test_size = [.2, .1]
PCA_ = [0, 1]
PCA_var = [.8, .9, .95]

#Order: downsample, ds_comp, split, ratio, threshold

combinations = []
for a in downsample:
    for b in ds_comp:
        for c in split:
            for d in test_size:
                for e in PCA_:
                    for f in PCA_var:
                        combinations.append([a, b, c, d, e, f])
                            
new_combinations = []
for comb in combinations:
    new_comb = comb
    if not comb[4]:
        new_comb[5] = "x"
    if comb[0] == "None":
        ds_comp = "x"
    new_combinations.append(new_comb)
    
combinations = []
[combinations.append(x) for x in new_combinations if x not in combinations]
len(combinations)

144

In [15]:
from tqdm import tqdm

results = {}
for c in tqdm(combinations):
    metrics = build_model(c[0], c[1], c[2], c[3], c[4], c[5], og_df)
    results[listToString(c)] = metrics
    
better_results = {}
for key in results.keys():
    if results[key] != "Linear Algebra Error":
        better_results[key] = results[key]
    
lda_models = pd.DataFrame(better_results).T
lda_models = lda_models[lda_models["ROC-AUC"] != "x"]
lda_models.sort_values(by=["ROC-AUC", "F1-Score"], ascending = False)#.to_csv("final_lda_results.csv")

Unnamed: 0,ROC-AUC,F1-Score,F2-Score,Precision,Recall,Accuracy,Our Metric
random 1 stratify 0.1 1 0.95,0.742017,0.208112,0.383742,0.118058,0.877350,0.461049,0.047755
random 2 stratify 0.1 1 0.95,0.741940,0.211937,0.386830,0.120863,0.859893,0.483813,0.050282
random 3 stratify 0.1 1 0.95,0.741764,0.208256,0.383937,0.118151,0.877350,0.461519,0.047841
None 1 stratify 0.1 1 0.95,0.740904,0.202867,0.377592,0.114535,0.886750,0.437491,0.044433
None 2 stratify 0.1 1 0.95,0.740904,0.202867,0.377592,0.114535,0.886750,0.437491,0.044433
...,...,...,...,...,...,...,...
None 2 stratify 0.1 0 x,0.477383,0.149381,0.305091,0.080720,1.000000,0.080720,0.006516
None 3 stratify 0.1 0 x,0.477383,0.149381,0.305091,0.080720,1.000000,0.080720,0.006516
extreme zeroes 1 stratify 0.1 0 x,0.477383,0.149381,0.305091,0.080720,1.000000,0.080720,0.006516
extreme zeroes 2 stratify 0.1 0 x,0.477383,0.149381,0.305091,0.080720,1.000000,0.080720,0.006516


## Error Analysis for Best Model

In [17]:
downsample = "random"
zeroes_to_ones = 1
split = "stratify"
test_size = .1
PCA_ = 1
PCA_var = .95
df = og_df.copy()

X = df.drop(columns=["TARGET"])
y = df["TARGET"]

if PCA_:
    #initialize PCA
    pca = PCA(n_components=PCA_var)
    #new X with PCA
    X = pd.DataFrame(pca.fit_transform(np.array(X)))
    #new df with PCA X and initial target
    df = pd.concat([X, y], axis = 1)

ult_X_train, ult_y_train, ult_X_validation, ult_y_validation, ult_X_test, ult_y_test = my_split(X, y, test_size, split)

if downsample == "ez":
    df = extreme_zeroes(df, zeroes_to_ones)
elif downsample == "random":
    df = my_downsample(df, zeroes_to_ones)

X = df.drop(columns=["TARGET"])
y = df["TARGET"]

X_train, y_train, X_validation, y_validation, X_test, y_test = my_split(X, y, test_size, split)

try:
    w, c = lda_new(X_train, y_train)
    preds, pi = interpret_lda(w, c, ult_X_validation)
except np.linalg.LinAlgError as err:
    "Linear Algebra Error"

In [21]:
preds = pd.Series(preds)
preds.index = ult_y_validation.index
pred_vs_true = pd.concat([pd.DataFrame({"preds":preds}), 
           pd.DataFrame({"actual":ult_y_validation})], axis=1)

errors = pred_vs_true[pred_vs_true["preds"] != pred_vs_true["actual"]]
false_positives = pred_vs_true[pred_vs_true["preds"] > pred_vs_true["actual"]].index.to_list()
false_negatives = pred_vs_true[pred_vs_true["preds"] < pred_vs_true["actual"]].index.to_list()

print("false positives:", len(false_positives))
print("false negatives:", len(false_negatives))

fp_df = og_df[og_df.index.isin(false_positives)]
fn_df = og_df[og_df.index.isin(false_negatives)]

false positives: 14642
false negatives: 274
