In [132]:
%reset -f

In [133]:
# import session
import numpy as np
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import neighbors, linear_model, metrics, tree, cross_validation, svm
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, StratifiedKFold as KFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import ExtraTreesRegressor as XTR
from sklearn.ensemble import RandomForestClassifier as RF

In [134]:
df_feature = pd.read_csv("./feature_matrix_model2_stage1.csv")
df_feature['id'] = df_feature['Unnamed: 0'].apply(lambda x: x.split('.')[0])

df_labels_1 =pd.read_csv('/home/lin/data/stage1_labels.csv')
df_labels_2 = pd.read_csv('/home/lin/data/stage1_solution.csv')
df_labels_2 = df_labels_2.drop(['Usage'],1)
df_labels = df_labels_1.append(df_labels_2)
df = pd.merge(left=df_feature, right=df_labels,how='outer',on='id').dropna()
print (df.shape)
df.head()

(1434, 17)


Unnamed: 0.1,Unnamed: 0,max_malig,max_spiculation,max_lobulation,max_diameter,xsd_malig,xsd_spiculation,xmax_lobulation,xsd_diameter,loc_from_malig_x,loc_from_malig_y,loc_from_malig_z,std_locs_x,std_locs_y,std_locs_z,id,cancer
0,d777a77cc7a2ec2f1eed68799cc9075c.npz,0.521839,0.373247,0.372914,0.34916,0.103659,0.076975,0.372914,0.061591,0.539286,0.528571,0.434028,0.244258,0.163056,0.242655,d777a77cc7a2ec2f1eed68799cc9075c,1
1,1631637f08f27347e8f23d7a0e18c100.npz,0.531258,0.37668,0.376645,0.349927,0.122501,0.079592,0.376645,0.062194,0.088889,0.172222,0.205128,0.243736,0.15956,0.216008,1631637f08f27347e8f23d7a0e18c100,0
2,9065f2b133129c5747d42db18a424749.npz,0.52844,0.375415,0.376739,0.351881,0.114865,0.077483,0.376739,0.059155,0.3,0.339394,0.700637,0.231711,0.155614,0.217998,9065f2b133129c5747d42db18a424749,1
3,e3a9a6f8d21c6c459728066bcf18c615.npz,0.527394,0.379579,0.37645,0.354797,0.096239,0.078583,0.37645,0.062853,0.214035,0.385965,0.462963,0.233297,0.183184,0.207843,e3a9a6f8d21c6c459728066bcf18c615,0
4,f39a1e54d79731e4417aa8159d19b7d4.npz,0.533726,0.376004,0.377995,0.352379,0.110376,0.077679,0.377995,0.060632,0.688235,0.094118,0.091954,0.249184,0.183342,0.214586,f39a1e54d79731e4417aa8159d19b7d4,0


In [135]:
x_cols = df.drop(['id', 'cancer', 'Unnamed: 0'],1).columns  # using all the features
# x_cols=['max_malig','max_spiculation','max_lobulation','max_diameter','xsd_malig', 'xsd_spiculation', 'xmax_lobulation','xsd_diameter']
X = df.loc[:][x_cols].values
y = df.loc[:]['cancer'].values

# Logistic regression

In [136]:
print ('Logistic Regression')
cv = cross_validation.KFold(X.shape[0],n_folds = 20,random_state=42)
X = StandardScaler().fit_transform(X)
clf = linear_model.LogisticRegression(class_weight={0:0.22,1:0.78},penalty='l1')

''' 
score_val_roc = cross_val_score(clf,X,y,cv=cv, scoring="roc_auc")  #scoring="recall","accuracy","f1"
print ("\nscore_val_roc: ", score_val_roc, score_val_roc.mean())

'''
y_pred = cross_val_predict(clf, X, y, cv=cv, method='predict',n_jobs=-1)
y_pred_prob = cross_val_predict(clf, X, y, cv=cv, method='predict_proba',n_jobs=-1)[:,1]

conf_mat = confusion_matrix(y,y_pred)
D = conf_mat[0][0]    # TP
B = conf_mat[0][1]    # FP
C = conf_mat[1][0]    # FN
A = conf_mat[1][1]    # TN

print ("confusion_matrix from sklearn: \n", conf_mat)
print (classification_report(y, y_pred, target_names=["No Cancer", "Cancer"]))

print("\nLogloss",log_loss(y, y_pred_prob))
print ("Specificity: ", D/(B+D))
print ("Recall: ", A/(A+C))
print ("Accuracy: ", (A+D)/(A+B+C+D))

Logistic Regression
confusion_matrix from sklearn: 
 [[306 765]
 [ 96 267]]
             precision    recall  f1-score   support

  No Cancer       0.76      0.29      0.42      1071
     Cancer       0.26      0.74      0.38       363

avg / total       0.63      0.40      0.41      1434


Logloss 0.738163666717
Specificity:  0.285714285714
Recall:  0.735537190083
Accuracy:  0.399581589958


# Logistic Regression with grid search for C

In [140]:
cv = cross_validation.KFold(X.shape[0],n_folds = 20,random_state=42)

# lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
Cs = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5]
parameters = {'C': Cs} 

acc = np.zeros((20,))
i=0
yhat = y.copy()
y_pred_prob = np.zeros(len(y))

for train_index, test_index in cv:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
    # We can change the scoring "average_precision", "recall", "f1"
    grd = GridSearchCV(clf, parameters)  
    grd.fit(X_train,y_train.ravel())
   
    yhat[test_index] = grd.predict(X_test)
    y_pred_prob[test_index] = grd.predict_proba(X_test)[:,1]
#     print (str(grd.best_params_))
    acc[i] = metrics.accuracy_score(yhat[test_index], y_test)
    i=i+1
    
    conf_mat = confusion_matrix(y,y_pred)
    D = conf_mat[0][0]    # TP
    B = conf_mat[0][1]    # FP
    C = conf_mat[1][0]    # FN
    A = conf_mat[1][1]    # TN
    
# print ('Mean accuracy: '+ str(np.mean(acc)))
print ("\nconfusion_matrix from sklearn: \n", conf_mat)
print (classification_report(y, y_pred, target_names=["No Cancer", "Cancer"]))
print("logloss",log_loss(y, y_pred_prob))

print ("Specificity: ", D/(B+D))
print ("Recall: ", A/(A+C))
print ("Accuracy: ", (A+D)/(A+B+C+D))


confusion_matrix from sklearn: 
 [[306 765]
 [ 96 267]]
             precision    recall  f1-score   support

  No Cancer       0.76      0.29      0.42      1071
     Cancer       0.26      0.74      0.38       363

avg / total       0.63      0.40      0.41      1434

logloss 0.69314718056
Specificity:  0.285714285714
Recall:  0.735537190083
Accuracy:  0.399581589958


In [None]:
x_cols = df.drop(['id', 'cancer', 'Unnamed: 0'],1).columns  # using all the features
# x_cols=['max_malig','max_spiculation','max_lobulation','max_diameter','xsd_malig', 'xsd_spiculation', 'xmax_lobulation','xsd_diameter']
X = df.loc[:][x_cols].values
y = df.loc[:]['cancer'].values
X = StandardScaler().fit_transform(X)

lr = linear_model.LogisticRegression(penalty='l1',class_weight="balanced")
Cs = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.5]
param_grid = {'C': Cs}
grd = GridSearchCV(lr,param_grid, cv=cv, verbose=1, n_jobs=-1)

Yhat = cross_val_predict(grd , X, y, cv=cv, method='predict',n_jobs=3)
print (Yhat[:10])
# Yh = cross_val_predict(grid, X, y, cv=cv, method='predict_proba',n_jobs=-1)[:,1]
# print (Yh[:10])

# RF

In [118]:
print ('\nRandom Forest')

cv = cross_validation.KFold(X.shape[0],n_folds = 20,random_state=42)
X = StandardScaler().fit_transform(X)
clf = RF(n_estimators=100,class_weight="balanced", n_jobs=-1)

y_pred = cross_val_predict(clf, X, y, cv=cv, method='predict',n_jobs=-1)
y_pred_prob = cross_val_predict(clf, X, y, cv=cv, method='predict_proba',n_jobs=-1)[:,1]

conf_mat = confusion_matrix(y,y_pred)
D = conf_mat[0][0]    # TP
B = conf_mat[0][1]    # FP
C = conf_mat[1][0]    # FN
A = conf_mat[1][1]    # TN

print ("confusion_matrix from sklearn: \n", conf_mat)
print (classification_report(y, y_pred, target_names=["No Cancer", "Cancer"]))

print("\nLogloss",log_loss(y, y_pred_prob))
print ("Specificity: ", D/(B+D))
print ("Recall: ", A/(A+C))
print ("Accuracy: ", (A+D)/(A+B+C+D))


Random Forest
confusion_matrix from sklearn: 
 [[1069    2]
 [ 360    3]]
             precision    recall  f1-score   support

  No Cancer       0.75      1.00      0.86      1071
     Cancer       0.60      0.01      0.02       363

avg / total       0.71      0.75      0.64      1434


Logloss 0.589166271041
Specificity:  0.998132586368
Recall:  0.00826446280992
Accuracy:  0.747559274756
