## Imports 

In [1]:
from SETUP import *
from REDUCE_FEATURES import *
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
import os

# Project root on disk. Defaults to the original author's location; set the
# CAPSTONE_ROOT environment variable to run the notebook on another machine.
CAPSTONE_ROOT = os.environ.get('CAPSTONE_ROOT', '/Users/Winnifred/Desktop/Capstone')

# Raw data file (.tsv) handed to Reduce_Features.
data_file = os.path.join(CAPSTONE_ROOT, 'ICPSR_20240_RAWDATA/DS0001/20240-0001-Data.tsv')
# File of feature-group file names, handed to Setup.
filenames = os.path.join(CAPSTONE_ROOT, 'diagnosis_capstone/data/feature_group_file_names.txt')
# Root directory of the feature-name data, handed to Setup (trailing slash kept).
csv_root_path = os.path.join(CAPSTONE_ROOT, 'diagnosis_capstone/data/feature_name_data/')

In [3]:
# Build the feature dictionary from the feature-name data directory and the
# list of feature-group files. The lookup helpers in this notebook treat
# full_dict as {group: {feature_code: indexable}}, where element 0 of the
# inner value is used as the feature's description.
set_inst = Setup(csv_root_path, filenames)
full_dict = set_inst.execute_setup()
# Reduce the raw data file using that dictionary; dirty_df is the frame the
# rest of the notebook works from.
# NOTE(review): Setup / Reduce_Features presumably come from the star imports
# of SETUP and REDUCE_FEATURES — confirm.
reduce_inst = Reduce_Features(data_file, full_dict)
dirty_df = reduce_inst.execute_reduce()

In [4]:
def get_group(feature, full_dict):
    """Return the name of the group that contains ``feature``.

    ``full_dict`` maps group name -> {feature code -> details}. Groups are
    scanned in the dictionary's iteration order and the first group holding
    the feature wins. Returns ``None`` when no group contains it.
    """
    return next(
        (
            group_name
            for group_name, members in full_dict.items()
            if any(feature == code for code in members.keys())
        ),
        None,
    )

In [5]:
def get_descriptions(feature, full_dict):
    """Return the description stored for ``feature``.

    ``full_dict`` maps group name -> {feature code -> details}; the
    description is element 0 of ``details``. The first matching feature in
    iteration order is used. Returns ``None`` when the feature is unknown.
    """
    matches = (
        details[0]
        for members in full_dict.values()
        for code, details in members.items()
        if feature == code
    )
    return next(matches, None)

# Step 1
## Understand Landscape of DataFrame

In [None]:
dirty_df.head()

In [None]:
description = dirty_df.describe()
# describe() reports six columns of dirty_df (displayed in the next cell). They are:
# sampling error stratum, region of country, sex, work status (3 categories),
# total height (inches), weight (pounds)

In [None]:
description

This DataFrame contains all of the "universal" features, limited to those whose coded values fall between -9 and 5. A typical column's unique values are ['-9', '-8', '1', '5', ' '].

We are starting out with 20,013 data points and 1,219 features. 

`describe()` only summarizes six features: by default it reports numeric columns only, and most columns here hold string codes. I'd like to see what those six are:

In [None]:
# Column labels of the describe() summary, i.e. the features it reported.
demo_features = description.columns.tolist()

In [None]:
# Print each column that describe() reported next to its description from full_dict.
for feature in demo_features:
    print(feature, get_descriptions(feature, full_dict))

In [None]:
# Feature codes kept as demographics. create_basic_df labels them
# Region (V08992), Gender (V09036), Yrs of Edu (V08172) and Age (V07306).
demographic_features = ['V08992', 'V09036', 'V08172', 'V07306']

# Step 2
## 2.A - Create DF with Desired Features

In [120]:
def create_basic_df(dirty_df):
    """Select the modelling columns from ``dirty_df`` under readable names.

    Returns a new DataFrame that shares ``dirty_df``'s index and holds, in
    this order: Region, Gender, Yrs of Edu, Age, Work Status, Physical Health,
    Mental Health, Suicide Ideation. ``dirty_df`` itself is not modified.
    Raises ``KeyError`` if any of the source columns is missing.
    """
    # (source feature code, readable column name), in output order.
    selected = [
        ('V08992', 'Region'),
        ('V09036', 'Gender'),
        ('V08172', 'Yrs of Edu'),
        ('V07306', 'Age'),
        ('V09154', 'Work Status'),
        ('V00233', 'Physical Health'),
        ('V00234', 'Mental Health'),
        ('V01993', 'Suicide Ideation'),
    ]
    codes = [code for code, _ in selected]
    return dirty_df[codes].rename(columns=dict(selected))

In [7]:
# Build the working frame with create_basic_df, the helper defined in this
# notebook. The earlier call to create_desired_df referenced a name that is
# not defined here (unless a star import supplies it) and raised NameError on
# a fresh kernel.
df = create_basic_df(dirty_df)

In [None]:
df.head()

## Remove rows where Suicide Ideation is blank

In [8]:
# Rows whose Suicide Ideation answer is a single blank (' ') cannot be used
# as a target: collect their index labels and drop them.
blank_ideation_rows = df[df['Suicide Ideation'] == ' ']
df = df.drop(blank_ideation_rows.index)

In [None]:
df.head()

In [9]:
# Lookup table from digit strings to ints: '0' -> 0, '1' -> 1, ..., '149' -> 149.
mask = dict(zip(map(str, range(150)), range(150)))

In [None]:
type(df['Work Status'].values[50])

In [10]:
# features that need to be turned from str to int
feat_to_convert = ['Work Status', 'Physical Health', 'Mental Health', 'Suicide Ideation']
for feature in feat_to_convert: 
    # Series.map with a dict yields NaN for any value that is not a key of
    # `mask`, so codes outside '0'..'149' (e.g. '-9', '-8' or a blank) become
    # NaN here; those rows are removed by the later df.dropna().
    df[feature] = df[feature].map(mask)

In [11]:
dict_conv = {'Region': float, 'Gender': float, 'Yrs of Edu': float, 'Age': float}

In [12]:
df = df.astype(dict_conv)

In [13]:
df = df.dropna()

In [None]:
df.describe()

In [14]:
df['Suicide Ideation'].isnull().values.any()

False

## Pair plot of the selected features, colored by Suicide Ideation

In [None]:
import seaborn as sns
sns.set(style='ticks')

In [None]:
sns.pairplot(df, hue='Suicide Ideation')
plt.show()

# Step 3  
## Logistic Regression

In [15]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [80]:
# Split the frame into the target column and the remaining predictor columns.
target_col = 'Suicide Ideation'
y_df = df[target_col]
X_df = df.drop(target_col, axis=1)

In [81]:
# Recode the target to binary: raw code 1 -> 1 (positive class), raw code 5 -> 0.
# Mapping from df instead of from y_df keeps this cell idempotent: re-running
# the previous `y_df = y_df.map(...)` turned the already-recoded 0s into NaN.
y_df = df['Suicide Ideation'].map({5: 0, 1: 1})

In [82]:
# as_matrix() was deprecated in pandas 0.23 and removed in 1.0; .values gives
# the same NumPy array and is available on every pandas version.
X = X_df.values
y = y_df.values

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [84]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(6829, 7) (6829,)
(2928, 7) (2928,)


In [36]:
logreg = linear_model.LogisticRegression()

In [37]:
model = logreg.fit(X_train, y_train)

In [38]:
y_pred = logreg.predict(X_test)

In [39]:
print('Score:', model.score(X_test, y_test))

Score: 0.882513661202


In [40]:
def print_scores(y_test, predictions):
    """Print accuracy, precision and recall of ``predictions`` against ``y_test``.

    Each metric is computed and printed in turn, one per line, as
    ``<Label>: <value>``. Nothing is returned.
    """
    metrics = (
        ('Accuracy:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
    )
    for label, metric in metrics:
        print(label, metric(y_test, predictions))

In [41]:
print_scores(y_test, y_pred)

Accuracy: 0.882513661202
Precision: 0.5
Recall: 0.00290697674419


In [44]:
# NOTE(review): both arguments are the full label vector `y`, so this does not
# evaluate the classifier. The matrix is diagonal and only holds the per-class
# counts. For the model's confusion matrix, pass (y_test, y_pred) instead.
cnf_matrix = confusion_matrix(y, y)

In [45]:
cnf_matrix

array([[8601,    0],
       [   0, 1156]])

In [49]:
# Share of the positive class (label 1), computed from the labels rather than
# from counts copied by hand out of an earlier output.
(y == 1).mean()

0.1184790406887363

In [50]:
# Share of the negative class (label 0), which is the accuracy of always
# predicting 0. Computed from the labels rather than from hand-copied counts.
(y == 0).mean()

0.8815209593112637

## Linear Regression

In [59]:
from sklearn.metrics import mean_squared_error, r2_score


In [52]:
regr = linear_model.LinearRegression()

In [53]:
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [54]:
y_pred = regr.predict(X_test)

In [55]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [ 0.00162976  0.01893408  0.00171011 -0.00236347  0.0148895   0.00887987
  0.05425536]


In [60]:
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))

Mean squared error: 0.0999471493535944


In [63]:
print('Variance score: {}'.format(r2_score(y_test, y_pred)))

Variance score: 0.0360342473316948


In [None]:
# X_test has 7 feature columns while y_test is 1-D, so scattering one against
# the other fails with a size mismatch. Plot the linear model's predictions
# against the observed labels instead. The predictions are recomputed from
# `regr` so the cell does not depend on which model last assigned `y_pred`.
linreg_pred = regr.predict(X_test)
fig, ax = plt.subplots()
ax.scatter(y_test, linreg_pred, color='black', alpha=0.3)
ax.set_xlabel('Observed Suicide Ideation (0/1)')
ax.set_ylabel('Linear regression prediction')
ax.set_title('Linear regression: predicted vs. observed')
plt.show()

## Random Forest

In [74]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [115]:
clf = RandomForestClassifier(max_depth=20, random_state=0)

In [116]:
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [117]:
print(clf.feature_importances_)

[ 0.11530523  0.03585367  0.09093097  0.50732139  0.05338835  0.10432736
  0.09287303]


In [118]:
y_pred = clf.predict(X_test)

In [119]:
print_scores(y_test, y_pred)

Accuracy: 0.857923497268
Precision: 0.227272727273
Recall: 0.0872093023256
