## Imports

In [1]:
from about_data import * 
from clean import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Local file paths 

In [2]:
data_file = '/Users/Winnifred/Desktop/Capstone/ICPSR_20240_RAWDATA/DS0001/20240-0001-Data.tsv'
txt_path = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_group_file_names.txt'
csv_root_path = '/Users/Winnifred/Desktop/Capstone/diagnosis_capstone/data/feature_name_data/'

## Data

In [10]:
messy_df = pd.read_csv(data_file, sep='\t', low_memory=False)

In [4]:
universal_features, nonuniversal_features = get_universals(txt_path, csv_root_path)

In [11]:
universal_df = pd.DataFrame()

In [12]:
for feature in good_features: 
    if feature not in suicidality_features: 
        universal_df[feature] = messy_df[feature]

In [None]:
feature_uniques = {}
for feature in good_features:
    feature_uniques[feature] = list(pd.unique(clean_df[feature]))

In [None]:
feature_uniques['V01638']

## Cleaning Functions Test Area

### Getting the universal features

In [None]:
universal_features, nonuniversal_features = get_universals(txt_path, csv_root_path)

In [6]:
good_features = list(universal_features.keys())

In [None]:
good_features

### Finding features with a specific range of responses

In [None]:
# find the maximum for a feature 
def feature_max(feature):
    x = feature_uniques[feature]
    # x is [' ', '-8', '-9', '1', '5']
    sorted_x = sorted(x, reverse=True)
    # sorted_x is ['5', '1', '-9', '-8', ' ']
    no_space = sorted_x[0:-1]
    # no_space is ['5', '1', '-9', '-8']
    range_list = []
    for i in no_space: 
        range_list.append(float(i))
    # range_list is [5, 1, -9, -8]
    # min(range_list) is -9
    # max(range_list) is 5
    return max(range_list)

In [None]:
features_max5 = []
for feature in good_features: 
    if feature_max(feature) <= 5.0: 
        features_max5.append(feature)

In [None]:
my_clean_df = pd.DataFrame()
for item in features_max5: 
    my_clean_df[item] = messy_df[item]

### These features appear in all three surveys and have a max value of 5

In [None]:
features_max5

What groups do they belong to? 

In [24]:
full_dict = setup(txt_path, csv_root_path)

In [None]:
groups = []
descriptions = []
for feature in features_max5:
    groups.append(get_group(feature, full_dict))
    descriptions.append(get_descriptions(feature, full_dict))
    

In [None]:
def get_group(feature, full_dict):
    for akey, apair in full_dict.items():
        for bkey, bpair in apair.items(): 
            for ckey, cpair in bpair.items(): 
                if feature == ckey: 
                    return bkey

In [None]:
get_group('V01681', full_dict)

In [None]:
def get_descriptions(feature, full_dict):
    for akey, apair in full_dict.items():
        for bkey, bpair in apair.items(): 
            for ckey, cpair in bpair.items(): 
                if feature == ckey: 
                    return cpair[0]    

In [None]:
get_descriptions('V01681', full_dict)

In [None]:
set(groups)

In [None]:
descriptions

## IMPORTANT TO DO ITEMS
- build a basic model

## Things I've Done Today: 
- Look at distribution of features between categories of symptoms and diagnosis after cleaning (i.e. what proportion of each group made it through the universals test AND max_5)

In [13]:
universal_df.head()

Unnamed: 0,V01626,V01627,V01628,V01629,V01630,V01631,V01632,V01633,V01634,V01635,...,V08501,V08500,V08553,V09045,V09048,V07750,V07748,V07899,V09043,V09046
0,,,,,,,,,,,...,5,5,5,,,5,5,5,,
1,,,,,,,,,,,...,5,5,5,,,5,5,5,,
2,,,,,,,,,,,...,5,5,5,,,5,5,5,,
3,,,,,,,,,,,...,5,5,5,,,5,5,5,,
4,,,,,,,,,,,...,5,5,5,,,5,5,5,,


In [16]:
ideation_df = pd.DataFrame()
ideation_df['suicide_ideation'] = universal_df['V01993']

In [19]:
mask = {'5': 0, '1': 1, '-9': 0, '-8': 0, ' ': None}

In [21]:
ideation_df['suicide_ideation'] = ideation_df['suicide_ideation'].map(mask)

In [27]:
screening_dict = full_dict['Screening']['Screening']

In [30]:
# get a list of all the good features from screening section
good_screen_features = []
for feature in screening_dict.keys(): 
    if feature in good_features: 
        good_screen_features.append(feature)

In [32]:
# add the good screening features to the dataframe
for feature in good_screen_features: 
    ideation_df[feature] = universal_df[feature]

In [None]:
# change ints to string through dataframe
for feature in good_screen_features: 
    try: 
        ideation_df[feature] = 
    except: 
        ideation_df[feature] = ideation_df[feature].map({' ': None})

In [42]:
# make a list of features where value is NaN in suicide_ideation column
ideation_df.dropna()

Unnamed: 0,suicide_ideation,V00232,V00233,V00234,V00239,V00252,V00253,V00254,V00255,V00256,...,V00725,V00728,V00729,V00730,V00731,V00738,V00739,V00740,V00741,V00742
0,0.0,3,3,3,0,5,5,5,5,5,...,,5,,,,5,5,5,5,5
2,0.0,3,3,2,0,5,5,5,5,5,...,5,5,,,,5,5,5,5,5
3,0.0,2,2,1,0,5,5,5,5,5,...,5,1,1,5,5,5,1,,,1
5,1.0,1,3,1,0,5,5,5,5,5,...,5,5,,,,5,5,5,5,1
6,0.0,1,2,2,0,1,1,5,5,5,...,1,5,,,,5,1,,,5
7,0.0,1,3,3,0,5,5,5,5,5,...,,1,1,1,1,1,1,,,5
8,0.0,3,2,2,0,5,5,5,5,5,...,1,5,,,,5,5,5,5,5
9,1.0,4,1,1,1,1,5,5,5,5,...,1,1,1,1,5,5,5,5,5,1
10,0.0,1,3,2,0,5,5,5,5,5,...,1,5,,,,5,1,,,5
11,0.0,3,5,4,0,5,5,5,5,5,...,,5,,,,5,5,5,5,5


In [41]:
ideation_df

Unnamed: 0,suicide_ideation,V00232,V00233,V00234,V00239,V00252,V00253,V00254,V00255,V00256,...,V00725,V00728,V00729,V00730,V00731,V00738,V00739,V00740,V00741,V00742
0,0.0,3,3,3,0,5,5,5,5,5,...,,5,,,,5,5,5,5,5
1,,3,3,1,0,5,5,5,5,5,...,,5,,,,5,5,5,5,5
2,0.0,3,3,2,0,5,5,5,5,5,...,5,5,,,,5,5,5,5,5
3,0.0,2,2,1,0,5,5,5,5,5,...,5,1,1,5,5,5,1,,,1
4,,3,1,1,0,5,5,5,5,5,...,5,5,,,,5,5,5,5,5
5,1.0,1,3,1,0,5,5,5,5,5,...,5,5,,,,5,5,5,5,1
6,0.0,1,2,2,0,1,1,5,5,5,...,1,5,,,,5,1,,,5
7,0.0,1,3,3,0,5,5,5,5,5,...,,1,1,1,1,1,1,,,5
8,0.0,3,2,2,0,5,5,5,5,5,...,1,5,,,,5,5,5,5,5
9,1.0,4,1,1,1,1,5,5,5,5,...,1,1,1,1,5,5,5,5,5,1


In [43]:
str_to_int = {}
for number in range(150):
    str_to_int[str(number)] = number

In [None]:
np_array = clean_df.as_matrix()

In [None]:
np_array

In [None]:
my_clean_df

In [9]:
suicidality_features = ['V01992','V01994','V01995','V01996','V01997','V01998','V01999','V02000','V02044','V02045','V02001','V02002','V02003','V02004','V02005','V02009','V02010','V02023','V02024','V02025','V02026','V02027','V02028','V02029','V02030','V02031','V02032','V02033','V02034','V02035','V02036', 'V02037', 'V02041', 'V02042']

In [None]:
label_ideation = 'V01993'

## Model Experimentation Area

Let's try to do some logistic regression on suicide ideation

In [None]:
X = array of all the clean data points, minus the diagnoses 
y = array of labels for suicide 