## New classifiers

In [1]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

def predictpr(fitted, feature_test):
    '''
    This function predicts the probability of response 1 for the test dataset.
    
    fitted: fitted classifier
    feature_test: feature set in test data
    
    return: predictions
    '''
    return fitted.predict_proba(feature_test)[:,1]

def knn_score(x_train, y_train, x_test, n, weights = 'uniform', distance_metric = 'minkowski', p=2):
    '''
    This function builds a KNN classifier.
    
    x_train: training set with features
    y_train: training set with labels
    n: number of neighbors
    weights: weight function used in prediction.
    distance_metric: the distance metric to use
    p: Power parameter for the Minkowski metric. 
    
    returns: fitted KNN classifier
    '''
    if distance_metric == 'minkowski':
        knn = KNeighborsClassifier(n_neighbors=n, weights = weights, p=p, metric= distance_metric)
        
    else:
        knn =KNeighborsClassifier(n_neighbors=n, weights = weights, metric= distance_metric)
    
    knn.fit(x_train, y_train)
    
    return predictpr(knn, x_test)

#can use existing prediction function to predict scores

In [2]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression


def lr_score(x_train, y_train, x_test, p = 'l1', c = 1.0, solver = 'liblinear', seed=12345):
    '''
    This function builds a Logistic Regression model.
    
    x_train: training set with features
    y_train: training set with labels
    p: penalty (l1 or l2)
    c: Inverse of regularization strength; must be a positive float.
    solver: Algorithm to use in the optimization problem.
    seed: random seed
    
    returns fitted Logistic Regression
    '''
    
    lr = LogisticRegression(penalty = p, C = c, solver= solver, random_state = seed)
    lr.fit(x_train, y_train)
    
    return predictpr(lr, x_test)


#can use existing prediction function to predict scores

In [3]:
# SVM

from sklearn.svm import LinearSVC

def linsvc_score(x_train, y_train, x_test, p = 'l2', c = 1.0, seed = 12345):
    '''
    This function builds a fitted linear SVC
    
    x_train: training set with features
    y_train: training set with labels
    p: penalty (l2)
    c: Penalty parameter C of the error term
    seed: random seed
    
    returns fitted linear SVC
    '''
    
    lsvc = LinearSVC(penalty = p, C=c, random_state=seed)
    lsvc.fit(x_train, y_train)
    
    return lsvc.decision_function(x_test)

def svm_score(fitted_lsvc, x_test):
    '''
    This function uses a test set to make scores using Linear SVC.
    
    fitted_lsvc: fitted Linear SVC
    x_test: test set with features
    
    returns predictions
    '''
    return fitted_lsvc.decision_function(x_test)


In [4]:
# Random Forests

In [5]:
# Boosting

In [6]:
# Bagging

## New metrics

In [42]:
# metrics

def accuracy_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates that accuracy of model given a threshold

    y_test: real labels
    pred: prediction scores
    threshold: threshold for predictions

    returns accuracy 
    '''
    pred_one = [1 if x >= thresh else 0 for x in pred_scores]
    
    return metrics.accuracy_score(y_test, pred_one)

def build_cmatrix(y_test, pred_scores, threshold):
    '''
    This function builds a confusion matrix for a given threshold
    
    pred_scores: prediction scores
    y_test: real labels
    threshold: threshold for predictions
    
    returns tuple (true_negatives, false_positive, false_negatives, true_positives)
    '''
    pred = [1 if x >= threshold else 0 for x in pred_scores]
    
    cmatrix = confusion_matrix(y_test, pred)
    
    true_negatives, false_positive, false_negatives, true_positives = cmatrix.ravel()

    return (true_negatives, false_positive, false_negatives, true_positives)

def precision_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates precision of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns precision
    '''
    pred_one = [1 if x >= thresh else 0 for x in pred_scores]
    
    return metrics.precision_score(y_test, pred_one)

def recall_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates recall of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns recall
    '''
    pred_one = [1 if x >= thresh else 0 for x in pred_scores]
    
    return metrics.recall_score(y_test, pred_one)

def f1_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates that accuracy of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns f1 score 
    '''
    pred_one = [1 if x >= thresh else 0 for x in pred_scores]
    
    return metrics.f1_score(y_test, pred_one)

def auc_roc(y_test, pred_scores):
    '''
    This function calculates the area under the ROC curve
    
    y_test: real labels
    pred_scores: prediction scores
    
    returns auc
    '''
    
    return metrics.roc_auc_score(y_test, pred_scores)

def plot_precision_recall(y_test, pred_scores):
    '''
    This function plots the precision recall curve
    
    y_test: true labels
    pred_scores: predicted scores
    
    return: none
    '''
    precision, recall, thresholds = precision_recall_curve(y_test, pred_scores)
    plt.pyplot.plot(recall, precision, marker='.')
    plt.pyplot.show()

# HW3 Analysis

## Import and load data

In [8]:
import ml_pipeline as pp
import pandas as pd
import datetime as dt
import numpy as np
from sklearn import metrics
file = './data/projects_2012_2013.csv'
df = pp.load_csv(file)

## Data transformations

### Helper functions for hw3 specific data

#### Convert date columns to datetime

In [9]:
df.date_posted = pp.col_datetime(df, 'date_posted')

In [10]:
df.datefullyfunded = pp.col_datetime(df,'datefullyfunded')

#### Create labels

In [11]:
df = pp.create_label(df, pred_time=60)

## Selecting features and cleaning

In [12]:
feature_cols=['school_metro','school_charter', 'school_magnet', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match', 'date_posted', 'label']
sel = df[feature_cols].copy()
sel.head()

Unnamed: 0,school_metro,school_charter,school_magnet,primary_focus_subject,primary_focus_area,resource_type,poverty_level,grade_level,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,date_posted,label
0,urban,f,f,Mathematics,Math & Science,Supplies,highest poverty,Grades PreK-2,1498.61,31.0,f,2013-04-14,1
1,urban,f,f,Civics & Government,History & Civics,Books,highest poverty,Grades 3-5,282.47,28.0,t,2012-04-07,1
2,urban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades 3-5,1012.38,56.0,f,2012-01-30,0
3,urban,f,t,Literacy,Literacy & Language,Books,high poverty,Grades PreK-2,175.33,23.0,f,2012-10-11,1
4,suburban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades PreK-2,3591.11,150.0,f,2013-01-08,0


#### Identify feature columns with null values

In [13]:
for x in pp.na_col(df):
    if x in feature_cols:
        print(x)

school_metro
primary_focus_subject
primary_focus_area
resource_type
grade_level
students_reached


#### Impute missing categorical variables with the most frequent, which is a common way to handle missing categorical data without more information. 


In [14]:
cat_cols = ['school_metro','primary_focus_subject','primary_focus_area','resource_type','grade_level']
for x in cat_cols:
    sel = pp.na_fill_col(sel, x , pp.most_freq)

#### Impute missing numerical variable (students_reached) with the median value because there are outliers affecting the mean. 

In [15]:
sel.students_reached.quantile([0.1, 0.25, 0.5, 0.75, 0.9,0.98,1])

0.10       18.0
0.25       23.0
0.50       30.0
0.75      100.0
0.90      200.0
0.98      700.0
1.00    12143.0
Name: students_reached, dtype: float64

In [16]:
sel = pp.na_fill_col(sel, 'students_reached', np.nanmedian)

#### Check that there are no more missing values in feature columns

In [17]:
for x in pp.na_col(sel):
    if x in feature_cols:
        print(x)

#### Discretize numeric features and then get all dummy variables.

In [18]:
# discretize numeric features
bucketdict= {'total_price_including_optional_support': 4, 'students_reached':4}
df_discr = pp.feat_mult_disc(sel, bucketdict, qt=True)

df_discr.total_price_including_optional_support_binned.unique()
df_discr.students_reached_binned.unique()

[(30.0, 100.0], (23.0, 30.0], (0.999, 23.0], (100.0, 12143.0]]
Categories (4, interval[float64]): [(0.999, 23.0] < (23.0, 30.0] < (30.0, 100.0] < (100.0, 12143.0]]

In [19]:
col_to_binary = list(df_discr.columns)
col_to_binary.remove('label')
col_to_binary.remove('date_posted')

In [20]:
col_to_binary

['school_metro',
 'school_charter',
 'school_magnet',
 'primary_focus_subject',
 'primary_focus_area',
 'resource_type',
 'poverty_level',
 'grade_level',
 'eligible_double_your_impact_match',
 'total_price_including_optional_support_binned',
 'students_reached_binned']

In [21]:
# turn variables into dummies
df_final = pp.feat_binary(df_discr, col_to_binary)
df_final.head()

Unnamed: 0,date_posted,label,school_metro_rural,school_metro_suburban,school_metro_urban,school_charter_f,school_charter_t,school_magnet_f,school_magnet_t,primary_focus_subject_Applied Sciences,...,eligible_double_your_impact_match_f,eligible_double_your_impact_match_t,"total_price_including_optional_support_binned_(91.999, 345.81]","total_price_including_optional_support_binned_(345.81, 510.5]","total_price_including_optional_support_binned_(510.5, 752.96]","total_price_including_optional_support_binned_(752.96, 164382.84]","students_reached_binned_(0.999, 23.0]","students_reached_binned_(23.0, 30.0]","students_reached_binned_(30.0, 100.0]","students_reached_binned_(100.0, 12143.0]"
0,2013-04-14,1,0,0,1,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
1,2012-04-07,1,0,0,1,1,0,1,0,0,...,0,1,1,0,0,0,0,1,0,0
2,2012-01-30,0,0,0,1,1,0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
3,2012-10-11,1,0,0,1,1,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
4,2013-01-08,0,0,1,0,1,0,1,0,0,...,1,0,0,0,0,1,0,0,0,1


## Run variations of models: 
### Decision trees, KNN, Logistic Regression, Linear SVM, Random forests, Bagging, Boosting

In [108]:
windows = [dt.datetime(2012,1,1), dt.datetime(2012,7,1), dt.datetime(2013,1,1), dt.datetime(2013,7,1), dt.datetime(2014,1,1)]
pred_time = 60 #days
label_col = 'label'
split_col = 'date_posted'
feature_cols= list(df_final.columns)
feature_cols.remove('label')
feature_cols.remove('date_posted')
seed=12345

In [109]:
models = [
    {'type': 'Dtree', 'clf': pp.dtree_score, 'criteria': ['entropy', 'gini'], 'depth': [10,20,30],'min_leaf': [100, 300,500], 'seed': seed},
    {'type': 'LR', 'clf': lr_score, 'p': ['l1','l2'], 'c': [0.1, 1.0, 10.0, 100.0], 'solver': ['liblinear'], 'seed': seed},
    {'type': 'SVM', 'clf': linsvc_score, 'p': ['l2'], 'c': [0.1, 1.0, 10.0, 100.0], 'seed': seed},
    {'type': 'Bagging_dtree', 'clf': bagging_score, 'n': [10, 50, 100], 'base':[None], 'seed':seed},
    {'type': 'ADABoost_dtree', 'clf': adaboost_score, 'n': [10, 50, 100], 'base':[None], 'seed':seed},
    {'type': 'Random Forest', 'clf': rforest_score, 'n': [10, 50, 100], 'criterion': ['entropy', 'gini'], 'seed': seed},
    {'type': 'KNN', 'clf': knn_score, 'n': [5], 'weights': ['uniform','distance'], 'distance_metric':['minkowski'],'p': [1,2]}
]

#models = [{'type': 'Random Forest', 'clf': rforest_score, 'n': [10, 50, 100], 'criterion': ['entropy', 'gini'], 'seed': seed}]
thresholds = [1, 2, 5, 10, 20,30, 50]


In [110]:
results = []

for i in range(1, len(windows)-1):
    train_start = windows[0]
    train_end = windows[i]
    test_end = windows[i+1]
    
    #split data
    x_train,y_train,x_test,y_test = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          train_start,
                                                          train_end, 
                                                          test_end, 
                                                          pred_time=pred_time)
    
    
    baseline = sum(y_test)/len(y_test)
    #run models
    for clf in models:
        modeltype = clf['type']
        func = clf['clf']
        printinfo = 'model: {}, run: {}'.format(modeltype, i)
        print(printinfo)
        if modeltype == 'Dtree':
            seed = clf['seed']
            for c in clf['criteria']:
                for d in clf['depth']:
                    for l in clf['min_leaf']:
                        #run model
                        info = 'criteria: {}, depth: {}, min_leaf: {}, seed: {}'.format(c, d, l, seed)
                        print(info)
                        scores = func(x_train, y_train, x_test, criteria = c, depth = d, min_leaf = l, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline, 'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                            
        elif modeltype == 'LR':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    for s in clf['solver']:
                        #print(p)
                        info = 'penalty: {}, c: {}, solver: {}, seed: {}'.format(p, c, s, seed)
                        print(info)
                        scores = func(x_train, y_train, x_test, p = p, c = c, solver = s, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                        
        elif modeltype == 'SVM':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    info = 'penalty: {}, c: {}, seed: {}'.format(p, c, seed)
                    print(info)
                    scores = func(x_train, y_train, x_test, p =p, c=c, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
                            
        elif modeltype == 'KNN':
            for n in clf['n']:
                for w in clf['weights']:
                    for d in clf['distance_metric']:
                        for p in clf['p']:
                            info = 'n: {}, weights: {}, distance: {}, p: {}'.format(n, w, d, p)
                            print(info)
                            scores = func(x_train, y_train, x_test, n = n, weights = w, distance_metric = d, p=p)
                            #print(list(scores))
                            for pct_pop in thresholds:
                                acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                                tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                       'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                                results.append(tmp)
        
        elif modeltype == 'Bagging_dtree':
            seed = clf['seed']
            for n in clf['n']:
                for b in clf['base']:
                    info = 'n: {}, base: {}'.format(n,b)
                    scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
        
        elif modeltype == 'ADABoost_dtree':
            seed = clf['seed']
            for n in clf['n']:
                for b in clf['base']:
                    info = 'n: {}, base: {}'.format(n,b)
                    scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
            
            
        elif modeltype == 'Random Forest':
            seed = clf['seed']
            for n in clf['n']:
                for c in clf['criterion']:
                    info = 'n: {}, criterion: {}'.format(n,c)
                    scores = func(x_train, y_train, x_test, n = n, criterion = c, seed=seed)
                    for pct_pop in thresholds:
                        acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                        tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                               'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                        results.append(tmp)
                    
                    
resdf=pd.DataFrame(results, columns = ['type', 'details', 'baseline', 'threshold_pct', 'precision', 'recall', 'auc','train_set_num', 'train_start', 'test_start'])    

model: Dtree, run: 1
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 12345
criteria: gini, depth: 20, min_leaf: 100, seed: 12345
criteria: gini, depth: 20, min_leaf: 300, seed: 12345
criteria: gini, depth: 20, min_leaf: 500, seed: 12345
criteria: gini, depth: 30, min_leaf: 100, seed: 12345
criteria: gini, depth: 30, min_leaf: 300, seed: 12345
criteria: gini, depth: 30, min_lea



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 1
model: ADABoost_dtree, run: 1
model: Random Forest, run: 1
model: KNN, run: 1
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2
model: Dtree, run: 2
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 1



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 2
model: ADABoost_dtree, run: 2
model: Random Forest, run: 2
model: KNN, run: 2
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2
model: Dtree, run: 3
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345
criteria: entropy, depth: 30, min_leaf: 500, seed: 12345
criteria: gini, depth: 10, min_leaf: 100, seed: 12345
criteria: gini, depth: 10, min_leaf: 300, seed: 12345
criteria: gini, depth: 10, min_leaf: 500, seed: 1



penalty: l2, c: 100.0, seed: 12345




model: Bagging_dtree, run: 3
model: ADABoost_dtree, run: 3
model: Random Forest, run: 3
model: KNN, run: 3
n: 5, weights: uniform, distance: minkowski, p: 1
n: 5, weights: uniform, distance: minkowski, p: 2
n: 5, weights: distance, distance: minkowski, p: 1
n: 5, weights: distance, distance: minkowski, p: 2


In [112]:
resdf[resdf['type'] == 'SVM']

Unnamed: 0,type,details,baseline,threshold_pct,precision,recall,auc,train_set_num,train_start,test_start
182,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,1,0.930303,0.012535,0.504909,1,2012-01-01,2012-07-01
183,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,2,0.825493,0.022211,0.504315,1,2012-01-01,2012-07-01
184,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,5,0.766990,0.051609,0.503131,1,2012-01-01,2012-07-01
185,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,10,0.753034,0.101339,0.502606,1,2012-01-01,2012-07-01
186,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,20,0.748786,0.201535,0.502988,1,2012-01-01,2012-07-01
187,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,30,0.745449,0.300955,0.501859,1,2012-01-01,2012-07-01
188,SVM,"penalty: l2, c: 0.1, seed: 12345",0.743083,50,0.741808,0.499143,0.498331,1,2012-01-01,2012-07-01
189,SVM,"penalty: l2, c: 1.0, seed: 12345",0.743083,1,0.927273,0.012494,0.504830,1,2012-01-01,2012-07-01
190,SVM,"penalty: l2, c: 1.0, seed: 12345",0.743083,2,0.825493,0.022211,0.504315,1,2012-01-01,2012-07-01
191,SVM,"penalty: l2, c: 1.0, seed: 12345",0.743083,5,0.765170,0.051486,0.502892,1,2012-01-01,2012-07-01


In [113]:
resdf.to_csv('finalrun.csv')

In [30]:
#### Use temporal validation to split into training/test sets



x_train1,y_train1,x_test1,y_test1 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[1], 
                                                          windows[2], 
                                                          pred_time=60)

x_train2, y_train2, x_test2, y_test2 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[2], 
                                                          windows[3], 
                                                          pred_time=60)

x_train3, y_train3, x_test3, y_test3 = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[3], 
                                                          windows[4], 
                                                          pred_time=60)

In [100]:
test = linsvc_score(x_train1, y_train1, x_test1, p='l2', c=0.1, seed=12345)
#all_metrics(y_test1, test, 1)
test

array([0.59249728, 0.23252579, 0.09679977, ..., 0.62204465, 0.62930204,
       0.34233743])

In [101]:
round(len(test)*0.01,0)

330.0

In [102]:
testdf = pd.Series(test)
idxtestdf = testdf.sort_values(ascending=False)[0:330].index
idxtestdf

Int64Index([  290, 28297, 14704, 14551,  2849,  5898,  2946, 11235, 16977,
            12828,
            ...
            25522, 12997, 21633, 25774, 28395, 28144,  8868, 16439, 28460,
            21175],
           dtype='int64', length=330)

In [107]:
scores_pctpop(test,1).iloc[290]

1.0

In [82]:
accuracy_at_threshold(y_test1, testdf, 0.5)

0.26407766990291265

In [81]:
testdf.iloc[:] = 0
testdf.iloc[idxtestdf] = 1
sum(testdf)/len(testdf)

0.010012135922330098

In [39]:
test2 = [1 if x > 1.0 else 0 for x in test]
metrics.precision_score(y_test1, test2)

  'precision', 'predicted', average, warn_for)


0.0

In [None]:
sum(y_test1)

In [None]:
clf = knn_score(x_train1, y_train1, x_test1, n=3, p=1)
clf

In [None]:
accuracy_at_threshold(y_test1, clf, 0.5)

In [None]:
precision, recall, thresholds = metrics.precision_recall_curve(y_test1, x)
p = precision[0:-1]
r = recall[0:-1]
t = 1-thresholds

In [None]:
t = np.percentile(x,90)

In [None]:
np.median(x)

In [None]:
lr_test = lr_score(x_train1, y_train1, x_test1)

In [None]:
test = [1 if x > t else 0 for x in lr_test]

In [None]:
sum(test)/len(test)

In [None]:
t=np.percentile(lr_test, 60)

In [None]:
adaboost_score(x_train1, y_train1, x_test1, 15, seed=1)

In [24]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
def adaboost_score(x_train, y_train, x_test, n, base = None, seed=12345):
    '''
    This function creates and predicts scores using bagging.
    
    x_train: training set features
    y_train: training set labels
    x_test: test set features
    n_estimators: number of estimators to be in bagging
    base: base classifier, default None is Dtree
    n_jobs = jobs to do in parallel
    
    returns: predicted scores 
    '''
    
    ada = AdaBoostClassifier(base_estimator = base, n_estimators=n, random_state=seed)
    ada.fit(x_train, y_train)
    return predictpr(ada,x_test)

def bagging_score(x_train, y_train, x_test, n, base = None, n_jobs = 1, seed= 12345):
    '''
    This function creates and predicts scores using bagging.
    
    x_train: training set features
    y_train: training set labels
    x_test: test set features
    n_estimators: number of estimators to be in bagging
    base: base classifier, default None is Dtree
    n_jobs = jobs to do in parallel
    
    returns: predicted scores 
    '''
    
    bag = BaggingClassifier(base_estimator=base, n_estimators= n, n_jobs = n_jobs, random_state = seed)
    bag.fit(x_train, y_train)
    return predictpr(bag, x_test)

def rforest_score(x_train, y_train, x_test, n, criterion = 'entropy', max_depth = None, n_jobs= None, seed=seed):
    '''
    This function returns probabilities from a Random Forest Classifier
    
    x_train: training set features
    y_train: training set labels
    x_test: test set features
    n: n_estimators
    criterion: entropy or gig
    max_depth: depth of tree, None default means nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
    n_jobs: number of jobs to run in parallel for both fit and predict. None means 1
    seed: random seed
    
    return: prediction scores
    '''
    
    rf = RandomForestClassifier(n_estimators=n, criterion= criterion, max_depth=max_depth, n_jobs=n_jobs, random_state=seed)
    rf.fit(x_train, y_train)
    
    return predictpr(rf, x_test)

In [105]:
def scores_pctpop(pred_scores, pct_pop):
    
    #identify number of positives to have given target percent of population
    num_pos = int(round(len(pred_scores)*(pct_pop/100),0))
    #turn predictions into series
    pred_df = pd.Series(pred_scores)
    idx = pred_df.sort_values(ascending=False)[0:num_pos].index 
    
    #set all observations to 0
    pred_df.iloc[:] = 0
    #set observations by index (the ones ranked high enough) to 1
    pred_df.iloc[idx] = 1
    
    return pred_df

def all_metrics(y_test, pred_scores, t, target_pop = True):
    '''
    This function returns the accuracy, precision, recall, f1, and auc_roc for a given target percent of population
    
    y_test: tests set labels
    pred_scores: prediction scores
    t: threshold (either decimal as threshold or integer as % target population (50 is 50%))
    target_pop: boolean to decide whether to use t as threshold or target pop
    
    return: tuple with accuracy, precision, recall, f1, and auc_roc
    '''
    if target_pop:
        pred_scores = scores_pctpop(pred_scores, t)
        t = 0.5

    acc = accuracy_at_threshold(y_test, pred_scores, t)
    prec = precision_at_threshold(y_test, pred_scores, t)
    rec = recall_at_threshold(y_test, pred_scores, t)
    f1 = f1_at_threshold(y_test, pred_scores, t)
    auc = auc_roc(y_test, pred_scores)
    
    return (acc, prec, rec, f1, auc)

In [115]:
def run_models(models, thresholds, windows, df_final, feature_cols, label_col, split_col, pred_time, pred_unit = 'day', filename = ''):
    '''
    This function runs multiple models with multiple parameters and calculates metrics according to thresholds

    models: list of dictionaries, each one is a model type with parameters
    thresholds: list of thresholds to calculate metrics against for each model
    windows: list of start and end dates for time windows
    feature_cols: list of strings, column names
    label_col: column name of label
    split_col: column name of column that has the dates to split on
    pred_time: prediction window
    pred_unit: time unit for prediction window
    filename: csv filename to save results

    returns: dataframe
    '''

    results = []

    for i in range(1, len(windows)-1):
        train_start = windows[0]
        train_end = windows[i]
        test_end = windows[i+1]
        
        #split data
        x_train,y_train,x_test,y_test = pp.single_train_test_set(df_final, 
                                                            feature_cols, 
                                                            label_col, 
                                                            split_col, 
                                                            train_start,
                                                            train_end, 
                                                            test_end, 
                                                            pred_time=pred_time, pred_unit = pred_unit)
        
        
        baseline = sum(y_test)/len(y_test)
        #run models
        for clf in models:
            modeltype = clf['type']
            func = clf['clf']
            printinfo = 'model: {}, run: {}'.format(modeltype, i)
            print(printinfo)
            if modeltype == 'Dtree':
                seed = clf['seed']
                for c in clf['criteria']:
                    for d in clf['depth']:
                        for l in clf['min_leaf']:
                            #run model
                            info = 'criteria: {}, depth: {}, min_leaf: {}, seed: {}'.format(c, d, l, seed)
                            print(info)
                            scores = func(x_train, y_train, x_test, criteria = c, depth = d, min_leaf = l, seed=seed)
                            for pct_pop in thresholds:
                                acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                                tmp = {'baseline': baseline, 'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                    'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                                results.append(tmp)
                                
            elif modeltype == 'LR':
                seed = clf['seed']
                for p in clf['p']:
                    for c in clf['c']:
                        for s in clf['solver']:
                            #print(p)
                            info = 'penalty: {}, c: {}, solver: {}, seed: {}'.format(p, c, s, seed)
                            print(info)
                            scores = func(x_train, y_train, x_test, p = p, c = c, solver = s, seed=seed)
                            for pct_pop in thresholds:
                                acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                                tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                    'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                                results.append(tmp)
                            
            elif modeltype == 'SVM':
                seed = clf['seed']
                for p in clf['p']:
                    for c in clf['c']:
                        info = 'penalty: {}, c: {}, seed: {}'.format(p, c, seed)
                        print(info)
                        scores = func(x_train, y_train, x_test, p =p, c=c, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                                
            elif modeltype == 'KNN':
                for n in clf['n']:
                    for w in clf['weights']:
                        for d in clf['distance_metric']:
                            for p in clf['p']:
                                info = 'n: {}, weights: {}, distance: {}, p: {}'.format(n, w, d, p)
                                print(info)
                                scores = func(x_train, y_train, x_test, n = n, weights = w, distance_metric = d, p=p)
                                #print(list(scores))
                                for pct_pop in thresholds:
                                    acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                                    tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                        'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                                    results.append(tmp)
            
            elif modeltype == 'Bagging_dtree':
                seed = clf['seed']
                for n in clf['n']:
                    for b in clf['base']:
                        info = 'n: {}, base: {}'.format(n,b)
                        scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
            
            elif modeltype == 'ADABoost_dtree':
                seed = clf['seed']
                for n in clf['n']:
                    for b in clf['base']:
                        info = 'n: {}, base: {}'.format(n,b)
                        scores = func(x_train, y_train, x_test, n = n, base = b, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                
                
            elif modeltype == 'Random Forest':
                seed = clf['seed']
                for n in clf['n']:
                    for c in clf['criterion']:
                        info = 'n: {}, criterion: {}'.format(n,c)
                        scores = func(x_train, y_train, x_test, n = n, criterion = c, seed=seed)
                        for pct_pop in thresholds:
                            acc, prec, rec, f1, auc = all_metrics(y_test, scores, pct_pop)
                            tmp = {'baseline': baseline,'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                'details': info, 'threshold_pct': pct_pop, 'precision': prec, 'recall': rec, 'auc': auc}
                            results.append(tmp)
                                      
    resdf = pd.DataFrame(results, columns = ['type', 'details', 'baseline', 'threshold_pct', 'precision', 'recall', 'auc','train_set_num', 'train_start', 'test_start'])    
    if filename:
        resdf.to_csv(filename)
    return resdf

In [116]:
x=run_models(models, thresholds, windows, df_final, feature_cols, label_col, split_col, pred_time, pred_unit = 'day', filename = '')

model: Dtree, run: 1
criteria: entropy, depth: 10, min_leaf: 100, seed: 12345
criteria: entropy, depth: 10, min_leaf: 300, seed: 12345
criteria: entropy, depth: 10, min_leaf: 500, seed: 12345
criteria: entropy, depth: 20, min_leaf: 100, seed: 12345
criteria: entropy, depth: 20, min_leaf: 300, seed: 12345
criteria: entropy, depth: 20, min_leaf: 500, seed: 12345
criteria: entropy, depth: 30, min_leaf: 100, seed: 12345
criteria: entropy, depth: 30, min_leaf: 300, seed: 12345


KeyboardInterrupt: 