## Imports 

In [1]:
from SETUP import *
from REDUCE_FEATURES import *
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.model_selection import train_test_split

In [2]:
data_file = '/Users/Winnifred/Desktop/Capstone/ICPSR_20240_RAWDATA/DS0001/20240-0001-Data.tsv'
filenames = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_group_file_names.txt'
csv_root_path = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_name_data/'

In [3]:
suicidality_features = ['V01995', 'V01997', 'V01999', 'V02044', 'V02003', 'V02004', 'V02009', 'V02023', 'V02025', 'V02027', 'V02029', 'V02031', 'V02032', 'V02035', 'V02036', 'V02041']

In [4]:
set_inst = Setup(csv_root_path, filenames)
full_dict = set_inst.execute_setup()
reduce_inst = Reduce_Features(data_file, full_dict)
dirty_df = reduce_inst.execute_reduce()

In [5]:
df = dirty_df.drop(dirty_df[dirty_df['V01993'] == ' '].index)

In [6]:
## Turn strings to ints 
mask = {str(num): num for num in range(200000)}

In [7]:
mask[' '] = 0

In [8]:
mask['-9'] = -9

In [9]:
mask['-8'] = -8

In [10]:
mask[None] = 0

In [11]:
for idx, feature in enumerate(list(df)): 
    df[feature] = df[feature].map(mask)
    if idx % 250 == 0: 
        print('completed feature {} of 2110'.format(idx))

completed feature 0 of 2110
completed feature 250 of 2110
completed feature 500 of 2110
completed feature 750 of 2110
completed feature 1000 of 2110


In [12]:
df.drop(suicidality_features, axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,V01638,V01639,V01643,V01644,V01646,V01647,V01648,V01649,V01650,V01651,...,V08495,V08549,V07725,V07894,V08501,V08500,V08553,V07750,V07748,V07899
0,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
2,0,0,0,0,0,0,0,0,0,0,...,5,5,1,1,5,5,5,5,5,5
3,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
5,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
6,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5


In [14]:
df = df.dropna(axis=1)

In [15]:
df['V01993'].describe()

count    15890.000000
mean         4.456765
std          1.429194
min         -9.000000
25%          5.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: V01993, dtype: float64

In [16]:
X_df = df.loc[:, df.columns != 'V01993']
y_df = df['V01993'].map({5:0, 1:1, -9:0, -8:0})

In [17]:
X = X_df.as_matrix()
y = y_df.as_matrix()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(11123, 1192) (11123,)
(4767, 1192) (4767,)


In [20]:
def print_scores(y_test, predictions):
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))
    return None

# Step 3  - MODELING


In [21]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve

In [23]:
clf = RandomForestClassifier(bootstrap= True,
 class_weight= None,
 criterion= 'gini',
 max_depth= 10,
 max_features= 'auto',
 max_leaf_nodes= None,
 min_impurity_split= 1e-07,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 n_estimators= 10,
 n_jobs= 1,
 oob_score= False,
 random_state= 0,
 verbose= 0,
 warm_start= False)

In [24]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [25]:
print(clf.feature_importances_)

[ 0.00016294  0.00026923  0.00053686 ...,  0.00029835  0.0004448
  0.00012265]


In [26]:
y_pred = clf.predict(X_test)

In [27]:
clf.score(X,y)

0.92517306482064188

In [28]:
print_scores(y_test, y_pred)

Accuracy: 0.906230333543
Precision: 0.742857142857
Recall: 0.39


In [29]:
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted No Ideation', 'Predicted Ideation'],
    index=['True No Ideation', 'True Ideation']
)

Unnamed: 0,Predicted No Ideation,Predicted Ideation
True No Ideation,4086,81
True Ideation,366,234


In [None]:
train_pred = model.predict(X_train)
print_scores(y_train, train_pred)

In [None]:
print_scores(y_train, train_pred)

## CV Random Forest

In [30]:
from sklearn import svm
from sklearn import cross_validation



In [31]:
len_train = len(X_train)

In [32]:
model = RandomForestClassifier(n_estimators=100)
cv = cross_validation.KFold(len_train, n_folds=10)
cv

sklearn.cross_validation.KFold(n=11123, n_folds=10, shuffle=False, random_state=None)

In [None]:
results = []
# "Error_function" can be replaced by the error function of your analysis
for traincv, testcv in cv:
        probas = model.fit(X_train[traincv], y_train[traincv]).predict_proba(X_train[testcv])
        results.append(probas)

In [52]:
len(results)

10

In [None]:
new_results = []
for fold in results: 
    results.append(fold[:1113])

In [None]:
for fold in new_results:
    print(len(fold))

In [None]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores                                              

## ROC Curve 

In [None]:
from sklearn.metrics import roc_curve, auc
import numpy as np

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp


In [None]:
# LOGISTIC REGRESSION SCORES 
# logreg_y = [0,1]
# logreg_scores = 

In [None]:
# RANDOM FOREST SCORES 
# rf_y = [0,1]
# rf_scores = 

In [None]:
fpr = dict()
tpr = dict()
roc_auc = dict()
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
for i in range(2):
    fpr, tpr, _ = roc_curve(y_test[:,i], y_score[:,i]) 
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)

In [None]:
plt.figure()
plt.plot(fpr[2], tpr[2], color='darkorange')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', {'fontsize': 15})
plt.ylabel('True Positive Rate', {'fontsize': 15})
plt.title('ROC Comparison', {'fontsize': 20})
plt.legend(loc="lower right")
plt.show()