## New classifiers

In [4]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

def predictpr(fitted, feature_test):
    '''
    This function predicts the probability of response 1 for the test dataset.
    
    fitted: fitted classifier
    feature_test: feature set in test data
    
    return: predictions
    '''
    return fitted.predict_proba(feature_test)[:,1]

def knn_score(x_train, y_train, x_test, n, weights = 'uniform', distance_metric = 'minkowski', p=2):
    '''
    This function builds a KNN classifier.
    
    x_train: training set with features
    y_train: training set with labels
    n: number of neighbors
    weights: weight function used in prediction.
    distance_metric: the distance metric to use
    p: Power parameter for the Minkowski metric. 
    
    returns: fitted KNN classifier
    '''
    if distance_metric == 'minkowski':
        knn = KNeighborsClassifier(n_neighbors=10, weights = weights, p=p, metric= distance_metric)
        
    else:
        knn =KNeighborsClassifier(n_neighbors=10, weights = weights, metric= distance_metric)
    
    fitted = knn.fit(x_train, y_train)
    
    return predictpr(fitted, x_test)

#can use existing prediction function to predict scores

In [5]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression


def lr_score(x_train, y_train, x_test, p = 'l1', c = 1.0, solver = 'liblinear', seed=12345):
    '''
    This function builds a Logistic Regression model.
    
    x_train: training set with features
    y_train: training set with labels
    p: penalty (l1 or l2)
    c: Inverse of regularization strength; must be a positive float.
    solver: Algorithm to use in the optimization problem.
    seed: random seed
    
    returns fitted Logistic Regression
    '''
    
    lr = LogisticRegression(penalty = p, C = c, solver= solver, random_state = seed)
    fitted = lr.fit(x_train, y_train)
    
    return predictpr(fitted, x_test)


#can use existing prediction function to predict scores

In [6]:
# SVM

from sklearn.svm import LinearSVC

def linsvc_score(x_train, y_train, x_test, p = 'l2', c = 1.0, seed = 12345):
    '''
    This function builds a fitted linear SVC
    
    x_train: training set with features
    y_train: training set with labels
    p: penalty (l2)
    c: Penalty parameter C of the error term
    seed: random seed
    
    returns fitted linear SVC
    '''
    
    lsvc = LinearSVC(penalty = 'l2', C=c, seed=seed)
    lsvc.fit(x_train, y_train)
    
    return lsvc.decision_function(x_test)

def svm_score(fitted_lsvc, x_test):
    '''
    This function uses a test set to make scores using Linear SVC.
    
    fitted_lsvc: fitted Linear SVC
    x_test: test set with features
    
    returns predictions
    '''
    return fitted_lsvc.decision_function(x_test)


In [7]:
# Random Forests

In [8]:
# Boosting

In [9]:
# Bagging

## New metrics

In [10]:
# metrics

def build_cmatrix(y_test, pred_scores, threshold):
    '''
    This function builds a confusion matrix for a given threshold
    
    pred_scores: prediction scores
    y_test: real labels
    threshold: threshold for predictions
    
    returns tuple (true_negatives, false_positive, false_negatives, true_positives)
    '''
    pred = [1 if x > threshold else 0 for x in pred_scores]
    
    cmatrix = confusion_matrix(y_test, pred)
    
    true_negatives, false_positive, false_negatives, true_positives = cmatrix.ravel()

    return (true_negatives, false_positive, false_negatives, true_positives)

def precision_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates precision of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns precision
    '''
    pred_one = [1 if x > threshold else 0 for x in pred_scores]
    
    return metrics.precision_score(y_test, pred_one)

def recall_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates recall of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns recall
    '''
    pred_one = [1 if x > threshold else 0 for x in pred_scores]
    
    return metrics.recall_score(y_test, pred_one)

def f1_at_threshold(y_test, pred_scores, thresh =0.5):
    '''
    This function calculates that accuracy of model given a threshold

    y_test: real labels
    pred_scores: prediction scores
    threshold: threshold for predictions

    returns f1 score 
    '''
    pred_one = [1 if x > threshold else 0 for x in pred_scores]
    
    return metrics.f1_score(y_test, pred_one)

def auc_roc(y_test, pred_scores):
    '''
    This function calculates the area under the ROC curve
    
    y_test: real labels
    pred_scores: prediction scores
    
    returns auc
    '''
    
    return metrics.roc_auc_score(y_test, pred_scores)

def plot_precision_recall(y_test, pred_scores):
    '''
    This function plots the precision recall curve
    
    y_test: true labels
    pred_scores: predicted scores
    
    return: none
    '''
    precision, recall, thresholds = precision_recall_curve(y_test, pred_scores)
    plt.pyplot.plot(recall, precision, marker='.')
    plt.pyplot.show()

# HW3 Analysis

## Import and load data

In [None]:
import ml_pipeline as pp
import pandas as pd
import datetime as dt
import numpy as np
import monthdelta as md
file = './data/projects_2012_2013.csv'
df = pp.load_csv(file)

## Data transformations

In [12]:
import ml_pipeline as pp
import pandas as pd
import datetime as dt
import numpy as np
import monthdelta as md
file = './data/projects_2012_2013.csv'
df = pp.load_csv(file)

### Helper functions for hw3 specific data

In [14]:
def col_datetime(df, tgt_col):
    '''
    Turn column values into datetime
    
    df: dataframe
    tgt_col: column name
    
    returns pandas series with datetime
    '''
    return pd.to_datetime(df[tgt_col])

def create_label(df, pred_time = 60, pred_unit = 'day'):
    '''
    This function creates a label column in the dataset given a time horizon
    
    df: dataframe
    pred_time: prediction time horizon
    pred_unit: unit of time of prediction (i.e. day or month or year)
    
    return dataframe
    '''
    def label(row, pred_time, pred_unit):
        if 'day' in pred_unit:
            diff = pred_time
        elif 'month' in pred_unit:
            diff = pred_time * 30 #convert to days
        elif 'year' in pred_unit:
            diff = pred_time * 365 #convert to days
            
        if (row.datefullyfunded - row.date_posted).days <= diff:
            return 1
        else:
            return 0

    df['label'] = df.apply(lambda row: label(row, pred_time, pred_unit), axis=1)
    
    return df

def single_train_test_set(df, feature_cols, label_col, split_col, train_start, train_end, test_end, pred_time, pred_unit = 'day'):
    '''
    This function builds a single temporal training and test set
    
    df: dataframe with all data
    feature_cols: list of feature columns
    label_col: label column string
    split_col: column with date to split on
    train_start: date to begin training data
    train_end: date to end training data
    test_end: date to end test data
    pred_time: time horizon for predictions (day, month or year)
    
    returns tuple of x_train, y_train, x_test, y_test
    '''
    
    if 'day' in pred_unit:
        actual_train_end = train_end - dt.timedelta(days=pred_time)
    elif 'month' in pred_unit:
        actual_train_end = train_end - md.monthdelta(pred_time)
    elif 'year' in pred_unit.contains:
        actual_train_end = dt.datetime(train_end.year - pred_time, train_end.month, train_end.day)
        
    training_set = df[(df[split_col] >= train_start) & (df[split_col] <= actual_train_end)]
    
    test_set = df[(df[split_col] >= train_end) & (df[split_col] <= test_end)]
    
    return (training_set[feature_cols], training_set[label_col], test_set[feature_cols], test_set[label_col])

#### Convert date columns to datetime

In [15]:
df.date_posted = pp.col_datetime(df, 'date_posted')

In [16]:
df.datefullyfunded = pp.col_datetime(df,'datefullyfunded')

#### Create labels

In [20]:
df = pp.create_label(df, pred_time=60)

## Selecting features and cleaning

In [28]:
feature_cols=['school_state', 'school_metro','school_charter', 'school_magnet', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match']
sel = df[feature_cols].copy()
sel.head()

Unnamed: 0,school_state,school_metro,school_charter,school_magnet,primary_focus_subject,primary_focus_area,resource_type,poverty_level,grade_level,total_price_including_optional_support,students_reached,eligible_double_your_impact_match
0,IL,urban,f,f,Mathematics,Math & Science,Supplies,highest poverty,Grades PreK-2,1498.61,31.0,f
1,CA,urban,f,f,Civics & Government,History & Civics,Books,highest poverty,Grades 3-5,282.47,28.0,t
2,CA,urban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades 3-5,1012.38,56.0,f
3,NY,urban,f,t,Literacy,Literacy & Language,Books,high poverty,Grades PreK-2,175.33,23.0,f
4,NY,suburban,f,f,Literacy,Literacy & Language,Technology,high poverty,Grades PreK-2,3591.11,150.0,f


#### Identify feature columns with null values

In [29]:
for x in pp.na_col(df):
    if x in feature_cols:
        print(x)

school_metro
primary_focus_subject
primary_focus_area
resource_type
grade_level
students_reached


#### Impute missing categorical variables with the most frequent, which is a common way to handle missing categorical data without more information. 


In [32]:
cat_cols = ['school_metro','primary_focus_subject','primary_focus_area','resource_type','grade_level']
for x in cat_cols:
    sel = pp.na_fill_col(sel, x , most_freq)

#### Impute missing numerical variable (students_reached) with the median value because there are outliers affecting the mean. 

In [39]:
sel.students_reached.quantile([0.1, 0.25, 0.5, 0.75, 0.9,0.98,1])

0.10       18.0
0.25       23.0
0.50       30.0
0.75      100.0
0.90      200.0
0.98      700.0
1.00    12143.0
Name: students_reached, dtype: float64

In [51]:
sel = pp.na_fill_col(sel, 'students_reached', np.nanmedian)

school_state                                               PA
school_metro                                         suburban
school_charter                                              t
school_magnet                                               f
primary_focus_subject                                Literacy
primary_focus_area                        Literacy & Language
resource_type                                      Technology
poverty_level                                 highest poverty
grade_level                                     Grades PreK-2
total_price_including_optional_support                 718.59
students_reached                                           30
eligible_double_your_impact_match                           f
Name: 1816, dtype: object

#### Discretize numeric features and then get all dummy variables.

In [75]:
# discretize numeric features
bucketdict= {'total_price_including_optional_support': 4, 'students_reached':4}
df_discr = pp.feat_mult_disc(sel, bucketdict, qt=True)

df_discr.total_price_including_optional_support_binned.unique()
df_discr.students_reached_binned.unique()

school_state                                       object
school_metro                                       object
school_charter                                     object
school_magnet                                      object
primary_focus_subject                              object
primary_focus_area                                 object
resource_type                                      object
poverty_level                                      object
grade_level                                        object
eligible_double_your_impact_match                  object
total_price_including_optional_support_binned    category
students_reached_binned                          category
dtype: object

In [77]:
# turn variables into dummies
df_final = pp.feat_binary(df_discr, df_discr.columns)
df_final.head()

Unnamed: 0,school_state_AK,school_state_AL,school_state_AR,school_state_AZ,school_state_CA,school_state_CO,school_state_CT,school_state_DC,school_state_DE,school_state_FL,...,eligible_double_your_impact_match_f,eligible_double_your_impact_match_t,"total_price_including_optional_support_binned_(91.999, 345.81]","total_price_including_optional_support_binned_(345.81, 510.5]","total_price_including_optional_support_binned_(510.5, 752.96]","total_price_including_optional_support_binned_(752.96, 164382.84]","students_reached_binned_(0.999, 23.0]","students_reached_binned_(23.0, 30.0]","students_reached_binned_(30.0, 100.0]","students_reached_binned_(100.0, 12143.0]"
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,...,0,1,1,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1


## Run variations of models: 
### Decision trees, KNN, Logistic Regression, Linear SVM, Random forests, Bagging, Boosting

In [23]:
windows = [dt.datetime(2012,1,1), dt.datetime(2012,7,1), dt.datetime(2013,1,1), dt.datetime(2013,7,1), dt.datetime(2014,1,1)]
pred_time = 60 #days
feature_cols=['school_state', 'school_metro','school_charter', 'school_magnet', 'primary_focus_subject', 'primary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'total_price_including_optional_support', 'students_reached', 'eligible_double_your_impact_match']
label_col = 'label'
split_col = 'date_posted'
seed=12345

In [None]:
models = [
    {'type': 'Dtree', 'clf': pp.dtree_score, 'criteria': ['entropy', 'gini'], 'depth': [5,10,15],'min_leaf': [300,500,700], 'seed': seed},
    {'type': 'KNN', 'clf': knn_score, 'n': [3,5,15], 'weights': ['uniform','distance'], 'distance_metric':['minkowski'], 'p': [1,2,3,4]},
    {'type': 'LR', 'clf': lr_score, 'p': ['l1','l2'], 'c': [0.1, 1.0, 10.0, 100.0], solver = ['liblinear'], 'seed': seed},
    {'type': 'SVM', 'clf': linsvc_score, 'p': ['l2'], 'c': [0.1, 1.0, 10.0, 100.0], 'seed': seed}
]

thresholds = [0.3, 0.5, 0.7]
svm_thresh = [-0.5, 0, 0.5]

In [79]:
results = []

for i in range(1, len(windows)-1):
    train_start = windows[0]
    train_end = windows[i]
    test_end = windows[i+1]
    
    #split data
    x_train,y_train,x_test,y_test = pp.single_train_test_set(df_final, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          train_start,
                                                          train_end, 
                                                          test_end, 
                                                          pred_time=pred_time)
    
    
    #run models
    for clf in models:
        modeltype = clf['type']
        func = clf['clf']
        if modeltype == 'Dtree':
            seed = clf['seed']
            for c in clf['criteria']:
                for d in clf['depth']:
                    for l in clf['min_leaf']:
                        #run model
                        info = f'criteria: {c}, depth: {d}, min_leaf: {l}, seed: {seed}'
                        scores = func(x_train, y_train, x_test, criteria = c, depth = d, min_leaf = l, seed=seed)
                        for t in thresholds:
                            a = accuracy_at_threshold(y_test, scores, t)
                            p = precision_at_threshold(y_test, scores, t)
                            r = recall_at_threshold(y_test, scores, t)
                            f1 = f1_at_threshold(y_test, scores, t)
                            auc = auc_roc(y_test_scores)
                            tmp = {'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold': t, 'accuracy': a, 'precision': p, 'recall': r, 'f1': f1, 'auc': auc}
                            results.append(tmp)
                    
        elif modeltype == 'KNN':
            for n in clf['n']:
                for w in clf['weights']:
                    for d in clf['distance_metric']:
                        for p in clf['p']:
                            info = f'n: {n}, weights: {w}, distance: {d}, p: {p}'
                            scores = func(x_train, y_train, x_test, n = n, weights = w, distance_metric = d, p=p)
                            for t in thresholds:
                                a = accuracy_at_threshold(y_test, scores, t)
                                p = precision_at_threshold(y_test, scores, t)
                                r = recall_at_threshold(y_test, scores, t)
                                f1 = f1_at_threshold(y_test, scores, t)
                                auc = auc_roc(y_test_scores)
                                tmp = {'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                       'details': info, 'threshold': t, 'accuracy': a, 'precision': p, 'recall': r, 'f1': f1, 'auc': auc}
                                results.append(tmp)
                            
        elif modeltype == 'LR':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    for s in clf['solver']:
                        info = f'penalty: {p}, c: {c}, solver: {s}, seed: {seed}'
                        scores = func(x_train, y_train, x_test, p = p, c = c, solver = s, seed=seed)
                        for t in thresholds:
                            a = accuracy_at_threshold(y_test, scores, t)
                            p = precision_at_threshold(y_test, scores, t)
                            r = recall_at_threshold(y_test, scores, t)
                            f1 = f1_at_threshold(y_test, scores, t)
                            auc = auc_roc(y_test_scores)
                            tmp = {'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold': t, 'accuracy': a, 'precision': p, 'recall': r, 'f1': f1, 'auc': auc}
                            results.append(tmp)
                        
        elif modeltype == 'SVM':
            seed = clf['seed']
            for p in clf['p']:
                for c in clf['c']:
                    info = f'penalty: {p}, c: {c}, seed: {seed}'
                    scores = func(x_train, y_train, x_test, p =p, c=c, seed=seed)
                    for t in svm_thresh:
                            a = accuracy_at_threshold(y_test, scores, t)
                            p = precision_at_threshold(y_test, scores, t)
                            r = recall_at_threshold(y_test, scores, t)
                            f1 = f1_at_threshold(y_test, scores, t)
                            auc = auc_roc(y_test_scores)
                            tmp = {'train_set_num': i, 'train_start': train_start, 'test_start': train_end,'type': modeltype, 
                                   'details': info, 'threshold': t, 'accuracy': a, 'precision': p, 'recall': r, 'f1': f1, 'auc': auc}
                            results.append(tmp)
                    
    

AttributeError: module 'ml_pipeline' has no attribute 'single_train_test_set'

In [None]:
#### Use temporal validation to split into training/test sets



x_train1,y_train1,x_test1,y_test1 = pp.single_train_test_set(df, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[1], 
                                                          windows[2], 
                                                          pred_time=60)

x_train2, y_train2, x_test2, y_test2 = pp.single_train_test_set(df, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[2], 
                                                          windows[3], 
                                                          pred_time=60)

x_train3, y_train3, x_test3, y_test3 = pp.single_train_test_set(df, 
                                                          feature_cols, 
                                                          label_col, 
                                                          split_col, 
                                                          windows[0],
                                                          windows[3], 
                                                          windows[4], 
                                                          pred_time=60)