In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the tab-separated survey extract.
# `DataFrame.from_csv` was deprecated in pandas 0.21 and removed in 1.0. It was a thin
# wrapper around the reader with index_col=0 and parse_dates=True as defaults, so this
# call keeps the same behaviour: first column becomes the index, dates are parsed on it.
df = pd.read_csv('36361-0001-Data.tsv', sep='\t', index_col=0, parse_dates=True)

In [None]:
# Feature columns fed to the models, in the order the models see them: age-of-first-use
# markers for a variety of drugs, flags for having tried each drug, and the remaining
# background columns.
cols = [
    # columns ending in AGE, followed by AGE2 and IRSEX
    'IRCIGAGE', 'IRCDUAGE', 'IRCGRAGE', 'IRSLTAGE', 'IISLTAGE', 'IRCHWAGE',
    'IRSNFAGE', 'IRALCAGE', 'IRMJAGE', 'IRCOCAGE', 'IRCRKAGE', 'IRHERAGE',
    'IRHALAGE', 'IRLSDAGE', 'IRPCPAGE', 'IRECSAGE', 'IRINHAGE', 'IRANLAGE',
    'IROXYAGE', 'IRTRNAGE', 'IRSTMAGE', 'IRMTHAGE', 'IRSEDAGE',
    'AGE2', 'IRSEX',
    # flag columns
    'CIGFLAG', 'SMKFLAG', 'TOBFLAG', 'ALCFLAG', 'MRJFLAG', 'COCFLAG',
    'CRKFLAG', 'HERFLAG', 'HALFLAG', 'LSDFLAG', 'PCPFLAG', 'ECSFLAG',
    'INHFLAG', 'ANLFLAG', 'OXYFLAG', 'TRQFLAG', 'STMFLAG', 'CPNSTMFG',
    'MTHFLAG', 'CPNMTHFG', 'SEDFLAG', 'PSYFLAG2', 'CPNPSYFG', 'SUMFLAG',
    'MJOFLAG', 'IEMFLAG', 'CDUFLAG',
    # remaining columns
    'HEALTH', 'COMBATPY', 'SERVICE', 'NOMARR2', 'LANGVER', 'POVERTY2',
    'INCOME', 'GOVTPROG', 'IMOTHER', 'EDFAM18', 'IFATHER', 'DRKSUM',
    'CABNGAGE', 'DEPRSLIF', 'ANXDLIF',
]

In [None]:
# Feature matrix and target for the classifiers.
# X is taken as an explicit copy: derived columns are added to it in a later cell, and
# writing to a plain column slice of df triggers pandas' SettingWithCopyWarning.
X = df[cols].copy()
y = df['DEPNDILL']

In [None]:
# Engineered early-onset indicators: first use reported before EARLY_USE_AGE.
EARLY_USE_AGE = 15

# Columns checked for YANY (the same set the original expression enumerated).
early_use_cols = [
    'IRALCAGE', 'IRMJAGE', 'IRCIGAGE', 'IRCDUAGE', 'IRCGRAGE', 'IRSLTAGE',
    'IISLTAGE', 'IRCHWAGE', 'IRSNFAGE', 'IRCOCAGE', 'IRCRKAGE', 'IRHERAGE',
    'IRHALAGE', 'IRLSDAGE', 'IRPCPAGE', 'IRECSAGE', 'IRINHAGE', 'IRANLAGE',
    'IROXYAGE', 'IRTRNAGE', 'IRSTMAGE', 'IRMTHAGE', 'IRSEDAGE',
]

# Rebind X to its own copy so the new columns are not written into a slice of df.
X = X.copy()

# YSTART: both alcohol and marijuana values are below the threshold.
X['YSTART'] = (X['IRALCAGE'] < EARLY_USE_AGE) & (X['IRMJAGE'] < EARLY_USE_AGE)

# YANY: at least one of the listed columns is below the threshold.
# The previous version chained boolean Series with `or` (ambiguous truth value ->
# ValueError) and contained `||`, which is not Python; a row-wise any() is the
# element-wise equivalent.
X['YANY'] = (X[early_use_cols] < EARLY_USE_AGE).any(axis=1)


In [None]:
# What does the data look like? Start with the class balance of the target.
y.value_counts()

In [None]:
# Dimensions (rows, columns) of the feature frame.
X.shape

In [None]:
# Peek at the first rows of the feature frame.
X.head()

In [None]:
#functions for plotting roc_curves
def plot_multi_roc(model, model_r, model_sm, name):
    """Overlay test-set ROC curves for three fitted variants of one model.

    Parameters
    ----------
    model : fitted classifier with ``predict_proba``
        Variant trained on all the data; drawn with the label "all data".
    model_r : fitted classifier with ``predict_proba``
        Variant trained on undersampled data; drawn with the label "resampled".
    model_sm : fitted classifier with ``predict_proba``
        Variant trained on oversampled data; drawn with the label "smote".
    name : str
        Appended to "MODEL: " to form the figure title.

    Notes
    -----
    Scores are computed against the module-level ``X_test`` / ``y_test``.
    A new figure is opened; nothing is returned.
    """
    # (legend label, classifier) pairs, in drawing order
    variants = (
        ("all data", model),
        ("resampled", model_r),
        ("smote", model_sm),
    )

    # Score every variant before the figure is opened.
    curves = []
    for legend_label, clf in variants:
        positive_scores = clf.predict_proba(X_test)[:, 1]
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
        curves.append((legend_label, false_pos_rate, true_pos_rate))

    plt.figure()
    plt.title("MODEL: " + name)
    # Diagonal reference line drawn as the baseline.
    plt.plot([0, 1], [0, 1])
    for legend_label, false_pos_rate, true_pos_rate in curves:
        plt.plot(false_pos_rate, true_pos_rate, label=legend_label)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='best')

def plot_roc(model, name, X_eval=None, y_eval=None):
    """Plot the ROC curve of a single fitted classifier.

    Parameters
    ----------
    model : fitted classifier with ``predict_proba``
        Column 1 of its ``predict_proba`` output is used as the score.
    name : str
        Appended to "MODEL: " to form the figure title.
    X_eval, y_eval : optional
        Features and true labels to evaluate on. When omitted, the module-level
        ``X_test`` / ``y_test`` are used.

    Notes
    -----
    The previous version read ``X_test2`` / ``y_test2``, which this notebook never
    defines, so every call raised NameError. It also computed the AUC without using
    it; the value is now shown in the legend label.
    A new figure is opened; nothing is returned.
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_score = model.predict_proba(X_eval)[:, 1]
    fpr, tpr, _ = roc_curve(y_eval, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.title("MODEL: " + name)
    # Diagonal reference line drawn as the baseline.
    plt.plot([0, 1], [0, 1])
    plt.plot(fpr, tpr, label="all data (AUC = %.3f)" % roc_auc)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='best')
    

In [None]:
# Split features/target into train and test sets: 30% held out, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=4321)


In [None]:
# Baseline: fit each classifier, constructed with default settings, on the training
# split as-is (no resampling).
lr = LogisticRegression()
gnb = GaussianNB()
# SVC is left out of the comparison:
#svc = SVC()
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn = KNeighborsClassifier()
dtc.fit(X_train, y_train)
rfc.fit(X_train, y_train)
lr.fit(X_train, y_train)
gnb.fit(X_train, y_train)
knn.fit(X_train, y_train)

In [None]:
# Cross-validated recall for every baseline classifier (default fold count).
model_list = [lr, gnb, dtc, rfc, knn]
model_names = ['Logistic Regressor', 'Gaussian Naive Bayes', 'Decision Tree', 'Random Forest', 'KNN']
for label, estimator in zip(model_names, model_list):
    print(label)
    recall_per_fold = cross_val_score(estimator, X_train, y_train, scoring='recall')
    print(recall_per_fold)


In [None]:
# Sweep the logistic regression C parameter and print the cross-validated recall
# for each value (training split, default fold count).
C_param_range = [0.001,0.01,0.1,1,10,100]
for c in C_param_range:
    # Use a throwaway estimator: rebinding `lr` here would silently replace the
    # fitted baseline model with an unfitted one.
    lr_candidate = LogisticRegression(C = c)
    print(c)
    print(cross_val_score(lr_candidate, X_train, y_train, scoring='recall'))
    

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Tune the random forest: grid-search n_estimators and max_features with 5-fold CV.


# Base estimator; the n_estimators / max_features given here are replaced by the
# grid values during the search.
rfc_grid = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

# Candidate values tried for each hyperparameter (2 x 3 combinations).
param_grid = { 
    'n_estimators': [100, 300],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc_grid, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

In [None]:
# Recall of the tuned forest under an outer cross-validation. CV_rfc is itself a
# GridSearchCV, so the whole search is repeated inside every outer fold (slow).
cross_val_score(CV_rfc, X_train, y_train, scoring='recall')

In [None]:
# Oversample the under-represented class with SMOTE and try the models again.
# Only the training split is resampled; X_test / y_test are not touched here.

sm = SMOTE(random_state=12, ratio = 1.0)
X_sm_train, y_sm_train = sm.fit_sample(X_train, y_train)

In [None]:
# Size of the training set after oversampling.
X_sm_train.shape

In [None]:

# The same five classifiers as the baseline, now fitted on the SMOTE-resampled data.
# The random forest is built with max_features='sqrt' and 100 trees here.
lr_sm = LogisticRegression()
gnb_sm = GaussianNB()
# SVC is left out of the comparison:
#svc = SVC()
dtc_sm = DecisionTreeClassifier()
rfc_sm = RandomForestClassifier(max_features= 'sqrt', n_estimators= 100)
knn_sm = KNeighborsClassifier()
dtc_sm.fit(X_sm_train, y_sm_train)
rfc_sm.fit(X_sm_train, y_sm_train)
lr_sm.fit(X_sm_train, y_sm_train)
gnb_sm.fit(X_sm_train, y_sm_train)
knn_sm.fit(X_sm_train, y_sm_train)

In [None]:
# Cross-validated recall for every classifier on the SMOTE-resampled training data.
# NOTE(review): the folds are cut from data that is already oversampled, so synthetic
# points derived from a row may land in a different fold than that row; these scores
# are likely optimistic -- confirm before comparing against the unresampled numbers.
model_list = [lr_sm, gnb_sm, dtc_sm, rfc_sm, knn_sm]
model_names = ['Logistic Regressor', 'Gaussian Naive Bayes', 'Decision Tree', 'Random Forest', 'KNN']
for label, estimator in zip(model_names, model_list):
    print(label)
    recall_per_fold = cross_val_score(estimator, X_sm_train, y_sm_train, scoring='recall')
    print(recall_per_fold)



In [None]:
# Now try random undersampling of the training split instead.
# No random_state is passed, so the selected rows can differ between runs.
# With return_indices=True the sampler also returns the indices of the kept rows.
rus = RandomUnderSampler(return_indices=True)
X_rus, y_rus, idx_resampled = rus.fit_sample(X_train, y_train)

In [None]:
# Size of the training set after undersampling.
X_rus.shape

In [None]:
# The same five classifiers, fitted on the undersampled data.
lr_rus = LogisticRegression()
gnb_rus = GaussianNB()
# SVC is left out of the comparison:
#svc = SVC()
dtc_rus = DecisionTreeClassifier()
rfc_rus = RandomForestClassifier(max_features= 'sqrt', n_estimators= 100)
knn_rus = KNeighborsClassifier()
dtc_rus.fit(X_rus, y_rus)
rfc_rus.fit(X_rus, y_rus)
lr_rus.fit(X_rus, y_rus)
gnb_rus.fit(X_rus, y_rus)
knn_rus.fit(X_rus, y_rus)

In [None]:
# Cross-validated recall for every classifier on the undersampled training data.
model_list = [lr_rus, gnb_rus, dtc_rus, rfc_rus, knn_rus]
model_names = ['Logistic Regressor', 'Gaussian Naive Bayes', 'Decision Tree', 'Random Forest', 'KNN']
for label, estimator in zip(model_names, model_list):
    print(label)
    recall_per_fold = cross_val_score(estimator, X_rus, y_rus, scoring='recall')
    print(recall_per_fold)



In [None]:
# SMOTE data seems to have the best recall, so it is used for modeling from here on.
# Repeat the logistic regression C sweep, this time on the SMOTE-resampled data.

C_param_range = [0.001,0.01,0.1,1,10,100]
for c in C_param_range:
    # Use a throwaway estimator so the sweep does not rebind `lr` to an unfitted
    # model as a side effect.
    lr_candidate = LogisticRegression(C = c)
    print(c)
    print(cross_val_score(lr_candidate, X_sm_train, y_sm_train, scoring='recall'))

In [None]:
# Sweep k for KNN on the SMOTE-resampled training data and record the mean
# 10-fold CV score for each k.

# Candidate k values: the odd numbers from 1 to 49. Kept as a real list rather than
# a one-shot `filter` iterator, so it is still available after the loop to map a
# position in cv_scores back to its k (neighbors[i] <-> cv_scores[i]).
myList = list(range(1,50))
neighbors = [k for k in myList if k % 2 != 0]

# Mean CV score per k, in the same order as `neighbors`.
cv_scores = []

for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    # '%precision' is not a scikit-learn scorer name and makes cross_val_score raise
    # ValueError; the registered name is 'precision'.
    scores = cross_val_score(knn, X_sm_train, y_sm_train, cv=10, scoring='precision')
    cv_scores.append(scores.mean())

In [None]:
# Position of the best mean CV score in cv_scores. This is an index into the list,
# not the k value itself.
cv_scores.index(max(cv_scores))

In [None]:
# Re-check a 1-nearest-neighbour model with 10-fold CV recall on the SMOTE data.
knn = KNeighborsClassifier(n_neighbors=1)
cross_val_score(knn, X_sm_train, y_sm_train, cv=10, scoring='recall')

In [None]:
# A score of .99 feels too good to be true.  I think I will stick with Logistic but also look at random forest.

In [None]:
# Best models are the Logistic Regressor and the Random Forest.
# Plot their two ROC curves against the test data.
# The two bare names below have no effect (only a cell's last expression is displayed).
lr_sm
rfc_sm


# Score the test set with both SMOTE-trained models (column 1 of predict_proba).
y_score_lr = lr_sm.predict_proba(X_test)[:,1]
fpr, tpr,_ = roc_curve(y_test, y_score_lr)
y_score_rfc = rfc_sm.predict_proba(X_test)[:,1]
fpr_rf, tpr_rf,_ = roc_curve(y_test, y_score_rfc)

# AUC of the logistic curve only; computed here but not shown on the plot.
roc_auc = auc(fpr, tpr)

plt.figure()
# The diagonal line is the baseline.
plt.title("ROC curve")
plt.plot([0,1],[0,1])
plt.plot(fpr,tpr, label = "Logistic Regressor")
plt.plot(fpr_rf,tpr_rf, label = "Random Forest")
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='best')

In [None]:
# They are very similar.  I will work with Logistic.
# NOTE(review): the predictions below come from rfc_sm (the random forest), not the
# logistic model -- confirm which one was intended.
y_pred = rfc_sm.predict(X_test)
#y_pred_r = rfc_sm.predict(X_test)

In [None]:
# Label every test-set prediction as true/false positive/negative.
#
# The previous conditions were written as `y == 1 & z == 0`. `&` binds tighter than
# `==`, so that parses as the chained comparison `y == (1 & z) == 0`: true negatives
# were labelled 'fpos', real false positives matched no branch (leaving `outcome`
# shorter than the test set), and the 'fneg' branch could never fire. Plain `and`
# gives the intended logic.
# The loop variables are also renamed so the loop no longer overwrites the
# module-level target `y`.
outcome = []
for predicted_label, true_label in zip(y_pred, y_test):
    if predicted_label == 1 and true_label == 1:
        outcome.append('tpos')
    elif predicted_label == 1 and true_label == 0:
        outcome.append('fpos')
    elif predicted_label == 0 and true_label == 0:
        outcome.append('tneg')
    elif predicted_label == 0 and true_label == 1:
        outcome.append('fneg')
        


In [None]:
# Number of test rows currently labelled 'fneg' in `outcome`.
outcome.count('fneg')

In [None]:
# Logistic regression with C=10 and class_weight='balanced', to be trained on the
# unresampled training split. This rebinds `lr`.
lr = LogisticRegression(C = 10, class_weight='balanced')

In [None]:
lr.fit(X_train, y_train)

In [None]:
# Hard class predictions for the test split; replaces the earlier y_pred.
y_pred = lr.predict(X_test)

In [None]:
# Re-label the test-set predictions (now from the balanced logistic regression) as
# true/false positive/negative.
#
# The previous conditions were written as `y == 1 & z == 0`. `&` binds tighter than
# `==`, so that parses as the chained comparison `y == (1 & z) == 0`: true negatives
# were labelled 'fpos', real false positives matched no branch (leaving `outcome`
# shorter than the test set), and the 'fneg' branch could never fire. Plain `and`
# gives the intended logic.
# The loop variables are also renamed so the loop no longer overwrites the
# module-level target `y`.
outcome = []
for predicted_label, true_label in zip(y_pred, y_test):
    if predicted_label == 1 and true_label == 1:
        outcome.append('tpos')
    elif predicted_label == 1 and true_label == 0:
        outcome.append('fpos')
    elif predicted_label == 0 and true_label == 0:
        outcome.append('tneg')
    elif predicted_label == 0 and true_label == 1:
        outcome.append('fneg')
        

In [None]:
# 1-nearest-neighbour model fitted on the unresampled training split.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# NOTE(review): y_pred_k is not read by any later cell visible in this notebook.
y_pred_k = knn.predict(X_test)

In [None]:
# Remove the helper column if an earlier run added it to X_test. On a fresh kernel the
# column does not exist yet, and an unconditional `del` raises KeyError.
if 'predicted' in X_test.columns:
    del X_test['predicted']

In [None]:
# Refit the logistic model on the SMOTE-resampled data and predict the test split.
lr_sm.fit(X_sm_train, y_sm_train)
# NOTE(review): y_pred_sm is not read by any later cell visible in this notebook.
y_pred_sm = lr_sm.predict(X_test)

In [None]:
# Attach the outcome label of each test row to a copy of the test features.
# A plain `predicted_data = X_test` only aliases the frame: the new string column
# would also appear in X_test and break every later predict()/predict_proba() call.
predicted_data = X_test.copy()
predicted_data['predicted'] = outcome

In [None]:
# Select the false positives. The 'predicted' column holds the string labels
# 'tpos' / 'fpos' / 'tneg' / 'fneg', so comparing it with the integer 1 never
# matches and always produced an empty frame.
false_pos = predicted_data[predicted_data['predicted'] == 'fpos']

In [None]:
# Shape of the subset of `false_pos` whose AGE2 value is below 12.
# NOTE(review): AGE2 may be a coded category rather than an age in years -- confirm
# against the data codebook.
false_pos[false_pos['AGE2'] < 12].shape

In [None]:
# Feature importances of `rfc` paired with the column names, largest first.
sorted(list(zip(rfc.feature_importances_, X.columns)), reverse=True)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Grid-search n_estimators (50 to 175 in steps of 25) for a gradient boosting
# classifier, scored by ROC AUC with 5-fold CV on the unresampled training split.
param_test1 = {'n_estimators':list(range(50,200,25))}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train,y_train)

In [None]:
# Show the winning configuration.
gsearch1.best_estimator_

In [None]:
# Keep the winning estimator for the comparisons that follow.
gbc = gsearch1.best_estimator_

In [None]:
gbc.fit(X_train, y_train)

In [None]:
# --- Gradient boosting: cross-validated recall / precision / accuracy ---
# Unresampled training split:
print(cross_val_score(gbc, X_train, y_train, scoring='recall'))

In [None]:
print(cross_val_score(gbc, X_train, y_train, scoring='precision'))

In [None]:
print(cross_val_score(gbc, X_train, y_train, scoring='accuracy'))

In [None]:
# SMOTE-resampled training data:
print(cross_val_score(gbc, X_sm_train, y_sm_train, scoring='recall'))
print(cross_val_score(gbc, X_sm_train, y_sm_train, scoring='precision'))
print(cross_val_score(gbc, X_sm_train, y_sm_train, scoring='accuracy'))

In [None]:
# Undersampled training data:
print(cross_val_score(gbc, X_rus, y_rus, scoring='recall'))
print(cross_val_score(gbc, X_rus, y_rus, scoring='precision'))
print(cross_val_score(gbc, X_rus, y_rus, scoring='accuracy'))

In [None]:
# Refit the boosting model on the SMOTE data (replaces the earlier fit on X_train).
gbc.fit(X_sm_train, y_sm_train)

In [None]:
# Feature importances of the refitted `gbc` paired with the column names, largest first.
sorted(list(zip(gbc.feature_importances_, X.columns)), reverse=True)

In [None]:
# --- Random forest (`rfc`): the same three metrics on the three training sets ---
print(cross_val_score(rfc, X_train, y_train, scoring='recall'))
print(cross_val_score(rfc, X_train, y_train, scoring='precision'))
print(cross_val_score(rfc, X_train, y_train, scoring='accuracy'))

In [None]:
print(cross_val_score(rfc, X_sm_train, y_sm_train, scoring='recall'))
print(cross_val_score(rfc, X_sm_train, y_sm_train, scoring='precision'))
print(cross_val_score(rfc, X_sm_train, y_sm_train, scoring='accuracy'))

In [None]:
print(cross_val_score(rfc, X_rus, y_rus, scoring='recall'))
print(cross_val_score(rfc, X_rus, y_rus, scoring='precision'))
print(cross_val_score(rfc, X_rus, y_rus, scoring='accuracy'))

In [None]:
# Refit `rfc` on the undersampled data (replaces its earlier fit).
rfc.fit(X_rus, y_rus)

In [None]:
# --- Logistic regression (the estimator currently bound to `lr`): same metrics ---
print(cross_val_score(lr, X_train, y_train, scoring='recall'))
print(cross_val_score(lr, X_train, y_train, scoring='precision'))
print(cross_val_score(lr, X_train, y_train, scoring='accuracy'))

In [None]:
print(cross_val_score(lr, X_sm_train, y_sm_train, scoring='recall'))
print(cross_val_score(lr, X_sm_train, y_sm_train, scoring='precision'))
print(cross_val_score(lr, X_sm_train, y_sm_train, scoring='accuracy'))

In [None]:
print(cross_val_score(lr, X_rus, y_rus, scoring='recall'))
print(cross_val_score(lr, X_rus, y_rus, scoring='precision'))
print(cross_val_score(lr, X_rus, y_rus, scoring='accuracy'))

In [None]:
# A second SMOTE resampling with ratio=0.3.
# NOTE(review): X_sm2_train / y_sm2_train are not read by any later visible cell.
sm2 = SMOTE(random_state=12, ratio = 0.3)
X_sm2_train, y_sm2_train = sm2.fit_sample(X_train, y_train)

In [None]:
# Per-fold CV metrics for `lr` on the unresampled training split, now including F1.
print(cross_val_score(lr, X_train, y_train, scoring='recall'))
print(cross_val_score(lr, X_train, y_train, scoring='precision'))
print(cross_val_score(lr, X_train, y_train, scoring='accuracy'))
print(cross_val_score(lr, X_train, y_train, scoring='f1'))

In [None]:
# Mean CV recall.
cross_val_score(lr, X_train, y_train, scoring='recall').mean()

In [None]:
# Mean CV precision.
cross_val_score(lr, X_train, y_train, scoring='precision').mean()

In [None]:
# Fit `lr` on the training split and predict the test split; `pig` holds the
# predicted labels, and its length is displayed.
lr.fit(X_train, y_train)
pig = lr.predict(X_test)
len(pig)

In [None]:
# Mean CV recall again (same call as above).
cross_val_score(lr, X_train, y_train, scoring='recall').mean()

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        Square matrix of integer counts, e.g. the array returned by
        sklearn.metrics.confusion_matrix. Similarly shaped arrays also work.
    class_names : list
        Class labels, ordered the same way they index the matrix; used for both
        the row and the column labels.
    figsize : tuple
        (width, height) of the figure. Defaults to (10, 7).
    fontsize : int
        Font size of the tick labels on both axes. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the heatmap.

    Raises
    ------
    ValueError
        If seaborn raises ValueError while drawing the heatmap with the integer
        format "d"; re-raised with a message about integer values.
    """
    labelled_counts = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    figure = plt.figure(figsize=figsize)
    try:
        heatmap_axes = sns.heatmap(labelled_counts, annot=True, fmt="d")
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")

    # y tick labels stay horizontal, x tick labels are tilted 45 degrees;
    # both are right-aligned.
    for axis, angle in ((heatmap_axes.yaxis, 0), (heatmap_axes.xaxis, 45)):
        current_labels = axis.get_ticklabels()
        axis.set_ticklabels(current_labels, rotation=angle, ha='right', fontsize=fontsize)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure

In [None]:
# The notebook's import cell brings in only roc_curve and auc from sklearn.metrics,
# so `confusion_matrix` was an undefined name here (NameError on a fresh kernel).
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions held in `pig` against the true labels.
cfm = confusion_matrix(y_test, pig)

In [None]:
# Flatten the matrix row by row and unpack its four cells.
# NOTE(review): the tn, fp, fn, tp order assumes a 2x2 matrix with the negative
# class first -- confirm the label order.
tn, fp, fn, tp = cfm.ravel()

In [None]:
print(tn, fp, fn, tp)

In [None]:
# Logistic regression coefficients paired with the column names, largest first.
sorted(list(zip(pd.Series(lr.coef_[0]), X.columns)), reverse=True)

In [None]:
# Hard class predictions of `lr` for the test split.
y_pred_vals = lr.predict(X_test)

In [None]:
# Sum of the predicted labels (the number of predicted 1s when labels are 0/1).
sum(y_pred_vals)

In [None]:
# Test rows where the predicted label is greater than the true label, i.e. predicted
# 1 while the true label is 0 (the model's false positives).
# Taken as an explicit copy: later cells sort this frame in place and add columns to
# it, which on a plain boolean-mask slice of X_test triggers SettingWithCopyWarning.
predicted_addicts = X_test[y_pred_vals > y_test].copy()

In [None]:
# Order the selected rows by their index label (in place).
predicted_addicts.sort_index(inplace=True)

In [None]:
# Distribution of AGE2 among the selected rows, split at AGE2 = 10.
# NOTE(review): AGE2 may be a coded category rather than an age in years -- confirm
# against the data codebook.
predicted_addicts[predicted_addicts.AGE2 < 10].AGE2.value_counts()

In [None]:
predicted_addicts[predicted_addicts.AGE2 >= 10].AGE2.value_counts()

In [None]:
# Class probabilities from `lr` for the selected rows.
# This cell must run before the 'probability' / 'prob_group' columns are added below;
# re-running it afterwards passes those extra columns to predict_proba.
predicted_adict_proba = lr.predict_proba(predicted_addicts)

In [None]:
# In-place sort with default arguments.
# NOTE(review): on a 2-D ndarray this sorts within each row (last axis), so column 1
# ends up holding the larger of the two probabilities per row; the row order is
# unchanged. Confirm this is what was intended.
predicted_adict_proba.sort()

In [None]:
#decision threshold
type(predicted_adict_proba)

In [None]:
# Second column of the probability array.
predicted_adict_proba[:, 1]

In [None]:
# Store that column alongside the selected rows.
predicted_addicts['probability'] = predicted_adict_proba[:, 1]

In [None]:
# Bin edges from 0.50 to 1.00 in steps of 0.10; each bin is labelled by its lower
# edge expressed as a percentage.
bins = [.50, .60, .70, .80, .90, 1.00]
labels = [50, 60, 70, 80, 90]

In [None]:
# Bucket each probability into its band.
predicted_addicts['prob_group'] = pd.cut(predicted_addicts['probability'], bins, labels=labels)

In [None]:
predicted_addicts['prob_group']

In [None]:
# Number of selected rows per probability band.
predicted_addicts['prob_group'].value_counts()

In [None]:
# The same band counts, broken down into three AGE2 ranges: below 8, 8 to 13, above 13.
predicted_addicts[predicted_addicts.AGE2 < 8].prob_group.value_counts()

In [None]:
predicted_addicts[(predicted_addicts.AGE2 >= 8) & (predicted_addicts.AGE2 <14)].prob_group.value_counts()

In [None]:
predicted_addicts[(predicted_addicts.AGE2 > 13)].prob_group.value_counts()

In [None]:
import seaborn as sns

In [None]:
# ROC curve of the current `lr` model on the test split, drawn on white axes.
y_score = lr.predict_proba(X_test)[:,1]
fpr, tpr,_ = roc_curve(y_test, y_score)

# Area under that curve (kept for inspection; not drawn on the plot).
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.grid(linestyle='-', linewidth='0.5', color='white')
ax.set_facecolor('white')
ax.set_title("Logistic Regression ROC")
# Diagonal reference line drawn as the baseline.
ax.plot([0,1],[0,1])
ax.plot(fpr, tpr, label="all data")
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend(loc='best')