In [1]:
import pandas as pd
import numpy
from sklearn import model_selection, preprocessing

In [2]:
df = pd.read_csv('Unemployment_mental_illness_survey.csv')
df.head()

Unnamed: 0,I am currently employed at least part-time,I identify as having a mental illness,Education,I have my own computer separate from a smart phone,I have been hospitalized before for my mental illness,How many days were you hospitalized for your mental illness,I am legally disabled,I have my regular access to the internet,I live with my parents,I have a gap in my resume,...,Obsessive thinking,Mood swings,Panic attacks,Compulsive behavior,Tiredness,Age,Gender,Household Income,Region,Device Type
0,0,0,High School or GED,0,0,0.0,0,1,0,1,...,1.0,0.0,1.0,0.0,0.0,30-44,Male,"$25,000-$49,999",Mountain,Android Phone / Tablet
1,1,1,Some Phd,1,0,0.0,0,1,0,0,...,0.0,0.0,1.0,0.0,1.0,18-29,Male,"$50,000-$74,999",East South Central,MacOS Desktop / Laptop
2,1,0,Completed Undergraduate,1,0,0.0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$150,000-$174,999",Pacific,MacOS Desktop / Laptop
3,0,0,Some Undergraduate,1,0,,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$25,000-$49,999",New England,Windows Desktop / Laptop
4,1,1,Completed Undergraduate,1,1,35.0,1,1,0,1,...,1.0,1.0,1.0,1.0,1.0,30-44,Male,"$25,000-$49,999",East North Central,iOS Phone / Tablet


In [11]:
# Original column names are combersome. Change to convenient ones.

clean_df=df.rename(columns={"I am currently employed at least part-time":"employed",
                   "I identify as having a mental illness":"mental_illness",
                   "I have my own computer separate from a smart phone":"own_computer",
                   "I have been hospitalized before for my mental illness":"hospitalized",
                   "How many days were you hospitalized for your mental illness":"hospitalized_days",
                   "I am legally disabled":"disabled","I have my regular access to the internet":"internet",
                   "I live with my parents":"with_parents","I have a gap in my resume":"gap",
                   "Total length of any gaps in my resume in\xa0months.":"months_gap",
                   "Annual income (including any social welfare programs) in USD":"income_with_social",
                   "I am unemployed":"unemployed",
                   "I read outside of work and school":"read",
                   "Annual income from social welfare programs":"welfare_income",
                   "I receive food stamps":"food_stamp",
                   "I am on section 8 housing":"section_8",
                   "How many times were you hospitalized for your mental illness":"times_hospitalized",
                   "Lack of concentration":"Lack_concentration",
                   "Obsessive thinking":"Obsessive_thinking",
                   "Mood swings":"Mood_swings",
                   "Panic attacks":"Panic_attacks",
                   "Compulsive behavior":"Compulsive_behavior",
                   "Household Income":"Household_Income",
                   "Device Type":"Device"
                  })
clean_df.head()

['employed',
 'mental_illness',
 'Education',
 'own_computer',
 'hospitalized',
 'hospitalized_days',
 'disabled',
 'internet',
 'with_parents',
 'gap',
 'months_gap',
 'income_with_social',
 'unemployed',
 'read',
 'welfare_income',
 'food_stamp',
 'section_8',
 'times_hospitalized',
 'Lack_concentration',
 'Anxiety',
 'Depression',
 'Obsessive_thinking',
 'Mood_swings',
 'Panic_attacks',
 'Compulsive_behavior',
 'Tiredness',
 'Age',
 'Gender',
 'Household_Income',
 'Region',
 'Device']

In [12]:
# Eliminate uninformative data

del clean_df['unemployed']
del clean_df['Device']
clean_df = clean_df[clean_df.Household_Income != 'Prefer not to answer']
clean_df.head()

Unnamed: 0,employed,mental_illness,Education,own_computer,hospitalized,hospitalized_days,disabled,internet,with_parents,gap,...,Depression,Obsessive_thinking,Mood_swings,Panic_attacks,Compulsive_behavior,Tiredness,Age,Gender,Household_Income,Region
0,0,0,High School or GED,0,0,0.0,0,1,0,1,...,1,1.0,0.0,1.0,0.0,0.0,30-44,Male,"$25,000-$49,999",Mountain
1,1,1,Some Phd,1,0,0.0,0,1,0,0,...,1,0.0,0.0,1.0,0.0,1.0,18-29,Male,"$50,000-$74,999",East South Central
2,1,0,Completed Undergraduate,1,0,0.0,0,1,0,0,...,0,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$150,000-$174,999",Pacific
3,0,0,Some Undergraduate,1,0,,0,1,1,1,...,0,0.0,0.0,0.0,0.0,0.0,30-44,Male,"$25,000-$49,999",New England
4,1,1,Completed Undergraduate,1,1,35.0,1,1,0,1,...,1,1.0,1.0,1.0,1.0,1.0,30-44,Male,"$25,000-$49,999",East North Central


In [18]:
# Change strings to categories

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

clean_df = pd.get_dummies(clean_df, columns=['Education', 'Age', 'Gender', 'Region'])
clean_df = clean_df.fillna(0)


In [19]:
# Set household income as the prediction compared to other factors

X = clean_df.drop(columns="Household_Income")
y = clean_df["Household_Income"].values.reshape(-1, 1)
print(X.shape, y.shape)

(296, 47) (296, 1)


In [20]:
# Split training and test data

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 18)

In [21]:
# Create linear regression model

from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [23]:
# Change string columns to one-hot categories

labelencoder = LabelEncoder()
labelencoder.fit(y_train)
y_train_encoded = labelencoder.transform(y_train)
y_test_encoded = labelencoder.transform(y_test)
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [24]:
# Fit the model to the training data and calculate the scores for the training and testing data

model.fit(x_train, y_train_categorical)
model.coef_

training_score = model.score(x_train, y_train_categorical)
testing_score = model.score(x_test, y_test_categorical)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.2500652066730422
Testing Score: -0.2670421347162763




In [25]:
# Next six cells, code provided by tutor

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [27]:
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(x_train)
X_train_scaled = X_scaler.transform(x_train)
X_test_scaled = X_scaler.transform(x_test)

In [29]:
model2 = Sequential()
model2.add(Dense(units=100, activation='relu', input_dim=47))
model2.add(Dense(units=10, activation='softmax'))

In [30]:
# Compile and fit the model
model2.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [31]:
# set early stopping as callback
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model2.fit(
    X_train_scaled,
    y_train_categorical,
    callbacks=callbacks,
    epochs=20,
    shuffle=True,
    verbose=2
)

Epoch 1/20
 - 1s - loss: 2.5267 - accuracy: 0.0495
Epoch 2/20
 - 0s - loss: 2.3534 - accuracy: 0.0811
Epoch 3/20
 - 0s - loss: 2.2352 - accuracy: 0.1216
Epoch 4/20
 - 0s - loss: 2.1438 - accuracy: 0.2252
Epoch 5/20
 - 0s - loss: 2.0769 - accuracy: 0.2658
Epoch 6/20
 - 0s - loss: 2.0234 - accuracy: 0.2703
Epoch 7/20
 - 0s - loss: 1.9805 - accuracy: 0.2568
Epoch 8/20
 - 0s - loss: 1.9461 - accuracy: 0.2748
Epoch 9/20
 - 0s - loss: 1.9132 - accuracy: 0.2928
Epoch 10/20
 - 0s - loss: 1.8818 - accuracy: 0.2928
Epoch 11/20
 - 0s - loss: 1.8547 - accuracy: 0.3153
Epoch 12/20
 - 0s - loss: 1.8264 - accuracy: 0.3514




Epoch 13/20
 - 0s - loss: 1.7997 - accuracy: 0.3694
Epoch 14/20
 - 0s - loss: 1.7732 - accuracy: 0.3964
Epoch 15/20
 - 0s - loss: 1.7492 - accuracy: 0.3919
Epoch 16/20
 - 0s - loss: 1.7255 - accuracy: 0.4054
Epoch 17/20
 - 0s - loss: 1.7010 - accuracy: 0.4099
Epoch 18/20
 - 0s - loss: 1.6792 - accuracy: 0.4144
Epoch 19/20
 - 0s - loss: 1.6561 - accuracy: 0.4324
Epoch 20/20
 - 0s - loss: 1.6343 - accuracy: 0.4505


<keras.callbacks.callbacks.History at 0x27ea53d7d48>

In [33]:
model_loss, model_accuracy = model2.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 2.086697075818036, Accuracy: 0.22972972691059113
