## Imports 

In [1]:
from SETUP import *
from REDUCE_FEATURES import *
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
data_file = '/Users/Winnifred/Desktop/Capstone/ICPSR_20240_RAWDATA/DS0001/20240-0001-Data.tsv'
filenames = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_group_file_names.txt'
csv_root_path = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_name_data/'

In [3]:
suicidality_features = ['V01995', 'V01997', 'V01999', 'V02044', 'V02003', 'V02004', 'V02009', 'V02023', 'V02025', 'V02027', 'V02029', 'V02031', 'V02032', 'V02035', 'V02036', 'V02041']

In [4]:
set_inst = Setup(csv_root_path, filenames)
full_dict = set_inst.execute_setup()
reduce_inst = Reduce_Features(data_file, full_dict)
dirty_df = reduce_inst.execute_reduce()

In [5]:
df = dirty_df.drop(dirty_df[dirty_df['V01993'] == ' '].index)

In [6]:
## Turn strings to ints 
mask = {str(num): num for num in range(200000)}

In [7]:
mask[' '] = 0

In [8]:
mask['-9'] = -9

In [9]:
mask['-8'] = -8

In [10]:
mask[None] = 0

In [11]:
for idx, feature in enumerate(list(df)): 
    df[feature] = df[feature].map(mask)
    if idx % 250 == 0: 
        print('completed feature {} of 2110'.format(idx))

completed feature 0 of 2110
completed feature 250 of 2110
completed feature 500 of 2110
completed feature 750 of 2110
completed feature 1000 of 2110


In [12]:
df.drop(suicidality_features, axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,V01638,V01639,V01643,V01644,V01646,V01647,V01648,V01649,V01650,V01651,...,V08495,V08549,V07725,V07894,V08501,V08500,V08553,V07750,V07748,V07899
0,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
2,0,0,0,0,0,0,0,0,0,0,...,5,5,1,1,5,5,5,5,5,5
3,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
5,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
6,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5


In [15]:
df = df.dropna(axis=1)

In [16]:
df['V01993'].describe()

count    15890.000000
mean         4.456765
std          1.429194
min         -9.000000
25%          5.000000
50%          5.000000
75%          5.000000
max          5.000000
Name: V01993, dtype: float64

In [85]:
X_df = df.loc[:, df.columns != 'V01993']
y_df = df['V01993'].map({5:0, 1:1, -9:0, -8:0})

In [86]:
X = X_df.as_matrix()
y = y_df.as_matrix()

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [88]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(11123, 1192) (11123,)
(4767, 1192) (4767,)


In [27]:
def print_scores(y_test, predictions):
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))
    return None

# Step 3  - MODELING


In [84]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

## Logistic Regression

In [89]:
logreg = linear_model.LogisticRegression()

In [90]:
model = logreg.fit(X_train, y_train)

In [91]:
y_pred = logreg.predict(X_test)

In [92]:
print('Score:', model.score(X_test, y_test))

Score: 0.885462555066


In [93]:
print_scores(y_test, y_pred)

Accuracy: 0.885462555066
Precision: 0.55625
Recall: 0.445


## Linear Regression

In [33]:
from sklearn.metrics import mean_squared_error, r2_score

In [94]:
regr = linear_model.LinearRegression()

In [95]:
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [96]:
y_pred = regr.predict(X_test)

In [97]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 0.0065542  -0.01353635 -0.00540481 ...,  0.01202367 -0.00104168
  0.00019176]


In [98]:
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))

Mean squared error: 2.0363213073607606e+17


In [99]:
print('Variance score: {}'.format(r2_score(y_test, y_pred)))

Variance score: -1.850810090605701e+18


## Random Forest

In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve

In [133]:
clf = RandomForestClassifier(bootstrap= True,
 class_weight= None,
 criterion= 'gini',
 max_depth= 10,
 max_features= 'auto',
 max_leaf_nodes= None,
 min_impurity_split= 1e-07,
 min_samples_leaf= 1,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 n_estimators= 10,
 n_jobs= 1,
 oob_score= False,
 random_state= 0,
 verbose= 0,
 warm_start= False)

In [134]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [135]:
print(clf.feature_importances_)

[ 0.00016294  0.00026923  0.00053686 ...,  0.00029835  0.0004448
  0.00012265]


In [136]:
y_pred = clf.predict(X_test)

In [137]:
clf.score(X,y)

0.92517306482064188

In [138]:
print_scores(y_test, y_pred)

Accuracy: 0.906230333543
Precision: 0.742857142857
Recall: 0.39


In [139]:
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted No Ideation', 'Predicted Ideation'],
    index=['True No Ideation', 'True Ideation']
)

Unnamed: 0,Predicted No Ideation,Predicted Ideation
True No Ideation,4086,81
True Ideation,366,234


In [148]:
train_pred = model.predict(X_train)

In [149]:
print_scores(y_train, train_pred)

NameError: name 'train' is not defined