In [27]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from imblearn.over_sampling import SMOTE


In [28]:
filepath = "pumf.csv"
df = pd.read_csv(filepath)
display(df.head())

Unnamed: 0,PUMFID,GEODVPSZ,DHHGMS,DHHGAGE,GENDER,GEN_01,GEN_02A,GEN_08B,GEN_08C,GEN_02A1,...,SDCGRES,SDCGVVM,SDCDVFLA,EDU_05,INCDVP19,INCDVP20,INCDVHH,DABTIPPE,DNBTIPPE,WTS_M
0,615402.0,3.0,1.0,7.0,1.0,2.0,3.0,2.0,3.0,3.0,...,6.0,6.0,2.0,3.0,14.0,14.0,15.0,3.0,1.0,4372.02
1,615403.0,4.0,3.0,2.0,2.0,4.0,3.0,4.0,4.0,3.0,...,6.0,6.0,2.0,6.0,5.0,5.0,13.0,10.0,9.0,2711.23
2,615404.0,4.0,1.0,6.0,1.0,2.0,3.0,3.0,3.0,2.0,...,3.0,1.0,1.0,6.0,12.0,12.0,14.0,7.0,8.0,2475.62
3,615405.0,4.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,1.0,...,6.0,5.0,1.0,6.0,5.0,6.0,14.0,2.0,4.0,2970.67
4,615406.0,4.0,3.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,...,2.0,1.0,1.0,2.0,3.0,6.0,10.0,10.0,10.0,2226.74


In [29]:
df = df.rename(columns={
    'DHHGMS': 'Marital_Status', 
    'GEODVPSZ': 'Population area',
    'DHHGAGE': 'Age',
    'GENDER': 'Gender',
    'SORLGBTS': 'LGBTQ2',
    'SDCFIMM': 'Immigration',
    'SDCGVVM': 'Visible_Minority',
    'CCC_290A': "Anxiety",
    'CCC280BA': 'Depression'
})

df = df.loc[df['Anxiety'].isin([1, 2])]
df = df.loc[df['LGBTQ2'].isin([1, 2])]
df = df.loc[df['Gender'].isin([1, 2])]
df = df.loc[df['Immigration'].isin([1, 2])]

features = [
    'Marital_Status', 'Age', 'Gender', 'LGBTQ2', 
    'Immigration', 'Visible_Minority'
]

df_features = df[features]
df_target = df[['Anxiety']]


# display(df_features)
df_features.reset_index()
display(df_features)

Unnamed: 0,Marital_Status,Age,Gender,LGBTQ2,Immigration,Visible_Minority
0,1.0,7.0,1.0,2.0,2.0,6.0
1,3.0,2.0,2.0,2.0,2.0,6.0
2,1.0,6.0,1.0,2.0,1.0,1.0
3,3.0,2.0,2.0,2.0,2.0,5.0
4,3.0,2.0,1.0,2.0,1.0,1.0
...,...,...,...,...,...,...
9856,3.0,4.0,1.0,2.0,1.0,3.0
9857,3.0,3.0,2.0,1.0,2.0,6.0
9858,3.0,1.0,1.0,2.0,2.0,6.0
9859,3.0,6.0,2.0,2.0,2.0,6.0


In [30]:
# df.head(50).to_csv('output.csv', index=False)

In [31]:
%%script true

X = df_features[features]
y = df_target

categorical_features = ['Marital_Status', 'Gender', 'LGBTQ2', 'Immigration', 'Visible_Minority']
numerical_features = ['Age']

# converts discrete data into numerical data to see if it helps with model classification
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 20% test data

display(X_train)
display(X_test)
display(y_train)
display(y_test)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# scalar = StandardScaler()
# X_train = scalar.fit_transform(X_train)
# X_test = scalar.transform(X_test)

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(X_train.shape[1],)),
  tf.keras.layers.Dense(256, activation='relu'), # hidden layer
  tf.keras.layers.Dense(128, activation='relu'), # hidden layer
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(64, activation='relu'), # hidden layer
  tf.keras.layers.Dropout(0.3),
  tf.keras.layers.Dense(1, activation='sigmoid')
 ])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

predictions = model.predict(X_test)












In [None]:
%%script true


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


"""
BOILERPLATE
"""

# features
X = df_features[features]
y = df_target  

# converts discrete data into numerical data to see if it helps with model classification
categorical_features = ['Marital_Status', 'Gender', 'LGBTQ2', 'Immigration', 'Visible_Minority']
numerical_features = ['Age']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
])

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

display(X_train)
display(X_test)
display(y_train)
display(y_test)
print("Total: ", len(X_train) + len(X_test))

# X_train = preprocessor.fit_transform(X_train)
# X_test = preprocessor.transform(X_test)

"""
TODO: if single output, use decision tree alone NOT MultiOutputClassifier
"""

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train) # oversample to balance

multi_output_classifier = MultiOutputClassifier(DecisionTreeClassifier(random_state=42)) # 20% test data
multi_output_classifier.fit(X_train, y_train) # train

# decision_tree = DecisionTreeClassifier(random_state=42)
# decision_tree.fit(X_train, y_train)

# make prediction on test data
y_pred = multi_output_classifier.predict(X_test)
# y_pred = decision_tree.predict(X_test)

y_test = np.array(y_test)
y_pred = np.array(y_pred)

# plot accuracy for each target output feature (Source: chatGPT)
for i, column in enumerate(df_target.columns):
    print(f"Target: {column}")
    print(f"Accuracy: {accuracy_score(y_test[:, i], y_pred[:, i])}")
    print("Classification Report:")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test[:, i], y_pred[:, i]))


Unnamed: 0,Marital_Status,Age,Gender,LGBTQ2,Immigration,Visible_Minority
1319,1.0,7.0,1.0,2.0,2.0,6.0
7396,3.0,1.0,1.0,2.0,1.0,4.0
3155,3.0,3.0,2.0,1.0,1.0,2.0
5818,4.0,7.0,2.0,2.0,1.0,3.0
6221,1.0,7.0,2.0,2.0,1.0,2.0
...,...,...,...,...,...,...
5987,3.0,2.0,1.0,2.0,1.0,3.0
5417,3.0,1.0,1.0,2.0,1.0,4.0
5623,1.0,8.0,1.0,2.0,1.0,4.0
913,1.0,6.0,1.0,2.0,1.0,3.0


Unnamed: 0,Marital_Status,Age,Gender,LGBTQ2,Immigration,Visible_Minority
604,1.0,8.0,2.0,2.0,2.0,6.0
3296,1.0,8.0,2.0,2.0,1.0,1.0
6557,3.0,1.0,1.0,2.0,2.0,4.0
2196,1.0,8.0,1.0,2.0,2.0,6.0
7218,4.0,6.0,2.0,2.0,2.0,6.0
...,...,...,...,...,...,...
1829,1.0,8.0,1.0,2.0,2.0,6.0
5357,1.0,5.0,2.0,2.0,1.0,3.0
1134,1.0,7.0,1.0,2.0,2.0,6.0
8841,3.0,1.0,1.0,2.0,1.0,5.0


Unnamed: 0,Anxiety
1319,2.0
7396,2.0
3155,2.0
5818,2.0
6221,2.0
...,...
5987,1.0
5417,2.0
5623,2.0
913,2.0


Unnamed: 0,Anxiety
604,2.0
3296,2.0
6557,2.0
2196,2.0
7218,2.0
...,...
1829,2.0
5357,2.0
1134,1.0
8841,2.0


Total:  9436
Target: Anxiety
Accuracy: 0.7261652542372882
Classification Report:
              precision    recall  f1-score   support

         1.0       0.13      0.44      0.20       148
         2.0       0.94      0.75      0.83      1740

    accuracy                           0.73      1888
   macro avg       0.54      0.59      0.52      1888
weighted avg       0.88      0.73      0.79      1888

Confusion Matrix:
[[  65   83]
 [ 434 1306]]


# Best result was KNN

In [40]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
from sklearn.neighbors import KNeighborsRegressor


"""
BOILERPLATE
"""

# features
X = df_features[features]
y = df_target  

# converts discrete data into numerical data to see if it helps with model classification
categorical_features = ['Marital_Status', 'Gender', 'LGBTQ2', 'Immigration', 'Visible_Minority']
numerical_features = ['Age']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
])

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# display(X_train)
# display(X_test)
# display(y_train)
# display(y_test)
# print("Total: ", len(X_train) + len(X_test))

# X_train = preprocessor.fit_transform(X_train)
# X_test = preprocessor.transform(X_test)

"""
TODO: if single output, use decision tree alone NOT MultiOutputClassifier
"""

print(f"X_train size before: {len(y_train)}")
print(f"y_train size before: {len(y_train)}")

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train) # oversample to balance

print(f"X_train size after SMOTE: {len(y_train)}")
print(f"y_train size after SMOTE: {len(y_train)}")

knn_model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5)) 
knn_model.fit(X_train, y_train) 

y_pred = knn_model.predict(X_test) 

y_test = np.array(y_test) 
y_pred = np.array(y_pred)


# for i, column in enumerate(df_target.columns):
# print(f"\nTarget: {column}")
for i, column in enumerate(df_target.columns): 
    print(f"\nTarget: {column}")
    print(f"Accuracy: {accuracy_score(y_test[:, i], y_pred[:, i])}")
    print("Classification Report:")
    print(classification_report(y_test[:, i], y_pred[:, i]))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test[:, i], y_pred[:, i]))


X_train size before: 7548
y_train size before: 7548
X_train size after SMOTE: 13844
y_train size after SMOTE: 13844

Target: Anxiety
Accuracy: 0.8972457627118644
Classification Report:
              precision    recall  f1-score   support

         1.0       0.19      0.09      0.13       148
         2.0       0.93      0.97      0.95      1740

    accuracy                           0.90      1888
   macro avg       0.56      0.53      0.54      1888
weighted avg       0.87      0.90      0.88      1888

Confusion Matrix:
[[  14  134]
 [  60 1680]]


In [34]:
import joblib
joblib.dump(knn_model, "knn_model.pkl")


['knn_model.pkl']