In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import pickle

In [2]:
stress_data = pd.read_csv("data/stress_subset.csv", sep=",")
stress_data.head()

Unnamed: 0,gender,ethnicity,age,country,pg_study,stress_level
0,Female,White,27,International,PhD,To a large extent
1,Female,White,26,EU,PhD,Somewhat
2,Female,White,53,UK,PhD,To a large extent
3,Female,Mixed / Multiple Ethnic Groups,29,UK,PhD,Somewhat
4,Female,White,22,UK,Research master's degree,Somewhat


In [3]:

y = stress_data['stress_level'] 

temp = stress_data.drop(columns=['stress_level'])

enc = LabelEncoder()
enc.fit(temp['country'])
country_dict = dict(zip(temp['country'], enc.transform(temp['country'])))
country = enc.transform(temp['country'])

enc.fit(temp['pg_study'])
pg_study_dict = dict(zip(temp['pg_study'], enc.transform(temp['pg_study'])))
pg_study = enc.transform(temp['pg_study'])

enc.fit(temp['ethnicity'])
ethnicity_dict = dict(zip(temp['ethnicity'], enc.transform(temp['ethnicity'])))
ethnicity = enc.transform(temp['ethnicity'])

enc.fit(temp['gender'])
gender_dict = dict(zip(temp['gender'], enc.transform(temp['gender'])))
gender = enc.transform(temp['gender'])

age = temp['age']
X = list(zip(gender, ethnicity, age, country, pg_study))

In [4]:
pickle.dump([gender_dict, ethnicity_dict, country_dict, pg_study_dict], open("stress_dictonaries.pkl", "wb"))

out = pickle.load(open("stress_dictonaries.pkl", "rb"))

out

[{'Female': 0, 'Male': 1, 'Prefer not to say': 2},
 {'White': 4,
  'Mixed / Multiple Ethnic Groups': 2,
  'Asian / Asian British': 0,
  'Other': 3,
  'Black / African / Caribbean / Black British': 1},
 {'International': 1, 'EU': 0, 'UK': 2},
 {'PhD': 1,
  "Research master's degree": 2,
  "Taught master's degree": 3,
  'Other': 0}]

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

svm_model = SVC(kernel="linear")
decision_tree_model = DecisionTreeClassifier()
knn_model = KNeighborsClassifier(n_neighbors=5)
naive_bayes_model = CategoricalNB()

svm_model.fit(x_train, y_train)
decision_tree_model.fit(x_train, y_train)
knn_model.fit(x_train, y_train)
naive_bayes_model.fit(x_train, y_train)

y_pred_svm = svm_model.predict(x_test)
y_pred_dt = decision_tree_model.predict(x_test)
y_pred_knn = knn_model.predict(x_test)
y_pred_nb = naive_bayes_model.predict(x_test)

acc_svm = accuracy_score(y_test, y_pred_svm)
acc_dt = accuracy_score(y_test, y_pred_dt)
acc_knn = accuracy_score(y_test, y_pred_knn)
acc_nb = accuracy_score(y_test, y_pred_nb)

print(f"Accuracy: svm {acc_svm} dt {acc_dt} knn {acc_knn} nb {acc_nb}")

Accuracy: svm 0.6666666666666666 dt 0.36363636363636365 knn 0.36363636363636365 nb 0.6666666666666666


In [13]:
test = svm_model.predict([[gender_dict["Male"], ethnicity_dict["Mixed / Multiple Ethnic Groups"], 25, country_dict["UK"], pg_study_dict["Taught master's degree"]]])

test

array(['To a large extent'], dtype=object)

In [14]:
pickle.dump(svm_model,open("trained_models/svm_model.pkl", "wb"))

In [15]:
pickle.dump(decision_tree_model,open("trained_models/decision_tree_model.pkl", "wb"))

In [16]:
pickle.dump(knn_model,open("trained_models/knn_model.pkl", "wb"))

In [17]:
pickle.dump(naive_bayes_model,open("trained_models/naive_bayes_model.pkl", "wb"))