## Imports 

In [2]:
from SETUP import *
from REDUCE_FEATURES import *
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
data_file = '/Users/Winnifred/Desktop/Capstone/ICPSR_20240_RAWDATA/DS0001/20240-0001-Data.tsv'
filenames = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_group_file_names.txt'
csv_root_path = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_name_data/'

In [4]:
set_inst = Setup(csv_root_path, filenames)
full_dict = set_inst.execute_setup()
reduce_inst = Reduce_Features(data_file, full_dict)
dirty_df = reduce_inst.execute_reduce()

In [5]:
def get_group(feature, full_dict):
    for akey, apair in full_dict.items():
        for bkey, bpair in apair.items(): 
            if feature == bkey: 
                return akey

In [6]:
def get_descriptions(feature, full_dict):
    for akey, apair in full_dict.items():
        for bkey, bpair in apair.items(): 
            if feature == bkey: 
                return bpair[0]   

# Step 1
## Understand Landscape of DataFrame

In [7]:
dirty_df.head()

Unnamed: 0,V01638,V01639,V01643,V01644,V01646,V01647,V01648,V01649,V01650,V01651,...,V08495,V08549,V07725,V07894,V08501,V08500,V08553,V07750,V07748,V07899
0,,,,,,,,,,,...,5,5,5,5,5,5,5,5,5,5
1,,,,,,,,,,,...,5,5,5,5,5,5,5,5,5,5
2,,,,,,,,,,,...,5,5,1,1,5,5,5,5,5,5
3,,,,,,,,,,,...,5,5,5,5,5,5,5,5,5,5
4,,,,,,,,,,,...,5,5,5,5,5,5,5,5,5,5


In [8]:
description = dirty_df.describe()
# these features (below) are: 
# sampling error stratum, region of country, sex, work status 3 categories, total height-inches, weight in pounds

In [9]:
description

Unnamed: 0,V08929,V08992,V09036,V08172,V07477,V07292,V07297
count,20013.0,20013.0,20013.0,20013.0,20013.0,20013.0,20013.0
mean,0.620247,2.645081,1.572778,2.534353,0.463349,0.19932,1.802828
std,0.485337,1.053107,0.494687,1.061558,0.878212,0.509156,0.875693
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,2.0,1.0,2.0,0.0,0.0,1.0
50%,1.0,3.0,2.0,3.0,0.0,0.0,2.0
75%,1.0,3.0,2.0,3.0,1.0,0.0,2.0
max,1.0,4.0,2.0,4.0,4.0,3.0,5.0


Ok. This dataframe has all of the "universal" features and is limited to those whose input range was between -9 and 5. Typical unique values in a column will be ['-9', '-8', '1', '5', ' ']. 

We are starting out with 20,013 data points and 1,219 features. 

In describe, it only gives me six features. I'd like to see what those are: 

In [10]:
demo_features = list(description)

In [11]:
for feature in demo_features:
    print(feature, get_descriptions(feature, full_dict))

V08929 r interviewed post september 11 2001
V08992 region of country
V09036 sex
V08172 years of education-4 categories
V07477 number of children in hh
V07292 number of adolescents 13-17 in hh
V07297 number of adults in hh


In [12]:
demographic_features = ['V08992', 'V09036', 'V08172', 'V07306']

# Step 2
## 2.A - Create DF with Desired Features

In [21]:
dirty_df['V07306']

KeyError: 'V07306'

In [19]:
def create_basic_df(dirty_df): 
    df = pd.DataFrame()
    df['Region'] = dirty_df['V08992']
    df['Gender'] = dirty_df['V09036']
    df['Yrs of Edu'] = dirty_df['V08172']
    df['Age'] = dirty_df['V07306']
    df['Work Status'] = dirty_df['V09154']
    df['Physical Health'] = dirty_df['V00233']
    df['Mental Health'] = dirty_df['V00234']
    df['Suicide Ideation'] = dirty_df['V01993']
    return df 

In [20]:
df = create_basic_df(dirty_df)

KeyError: 'V07306'

In [None]:
df.head()

## Remove rows where invalid for Suicide Ideation

In [18]:
df = df.drop(df[df['Suicide Ideation'] == ' '].index)

NameError: name 'df' is not defined

In [None]:
df.head()

In [None]:
## Turn strings to ints 
mask = {str(num): num for num in range(150)}

In [None]:
type(df['Work Status'].values[50])

In [None]:
# features that need to be turned from str to int
feat_to_convert = ['Work Status', 'Physical Health', 'Mental Health', 'Suicide Ideation']
for feature in feat_to_convert: 
    df[feature] = df[feature].map(mask)

In [None]:
dict_conv = {'Region': float, 'Gender': float, 'Yrs of Edu': float, 'Age': float}

In [None]:
df = df.astype(dict_conv)

In [None]:
df = df.dropna()

In [None]:
df.describe()

In [None]:
df['Suicide Ideation'].isnull().values.any()

## Let's plot for whatever reason

In [None]:
import seaborn as sns
sns.set(style='ticks')

In [None]:
sns.pairplot(df, hue='Suicide Ideation')
plt.show()

# Step 3  
## Logistic Regression

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [None]:
X_df = df.loc[:, df.columns != 'Suicide Ideation']
y_df = df['Suicide Ideation']

In [None]:
y_df = y_df.map({5:0, 1:1})

In [None]:
X = X_df.as_matrix()
y = y_df.as_matrix()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
logreg = linear_model.LogisticRegression()

In [None]:
model = logreg.fit(X_train, y_train)

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
print('Score:', model.score(X_test, y_test))

In [None]:
def print_scores(y_test, predictions):
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Precision:', precision_score(y_test, predictions))
    print('Recall:', recall_score(y_test, predictions))
    return None

In [None]:
print_scores(y_test, y_pred)

In [44]:
cnf_matrix = confusion_matrix(y, y)

In [45]:
cnf_matrix

array([[8601,    0],
       [   0, 1156]])

In [49]:
1156/9757

0.1184790406887363

In [50]:
8601/9757

0.8815209593112637

## Linear Regression

In [None]:
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
regr = linear_model.LinearRegression()

In [None]:
regr.fit(X_train, y_train)

In [None]:
y_pred = regr.predict(X_test)

In [None]:
print('Coefficients: \n', regr.coef_)

In [None]:
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))

In [None]:
print('Variance score: {}'.format(r2_score(y_test, y_pred)))

In [None]:
plt.scatter(X_test, y_test,  color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
clf = RandomForestClassifier(max_depth=20, random_state=0)

In [None]:
clf.fit(X_train, y_train)

In [None]:
print(clf.feature_importances_)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print_scores(y_test, y_pred)

In [None]:
train_pred = model.predict(X_train)
print_scores(y_train, train_pred)