In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


import pickle

In [2]:
# Read the student-performance subset from disk, then preview its first five rows.
performance_data = pd.read_csv(
    filepath_or_buffer="data/student_performance_subset.csv",
)
performance_data.head(5)

Unnamed: 0,gender,race/ethnicity,parental level of education,math score,reading score,writing score
0,female,group B,bachelor's degree,72,72,74
1,female,group C,some college,69,90,88
2,female,group B,master's degree,90,95,93
3,male,group A,associate's degree,47,57,44
4,male,group C,some college,76,78,75


In [3]:
# Target column the regressors are trained to predict.
y = performance_data["writing score"]

# Everything except the target; the feature columns are picked out of this frame.
temp = performance_data.drop(columns=["writing score"])

# One encoder instance is re-fitted per column (fit discards the previous fit),
# so after this cell it holds the fit of the last column encoded: gender.
enc = LabelEncoder()


def _encode_labels(column):
    """Label-encode one categorical column.

    Parameters
    ----------
    column : pandas.Series
        Column of category labels.

    Returns
    -------
    mapping : dict
        Each distinct label mapped to its integer code (plain Python ints).
    codes : numpy.ndarray
        The encoded column, one code per row.
    """
    codes = enc.fit_transform(column)
    # classes_ lists the distinct labels in sorted order and a label's position
    # is its code, so the lookup is built from the handful of classes instead
    # of transforming and zipping every row a second time.
    mapping = {label: code for code, label in enumerate(enc.classes_.tolist())}
    return mapping, codes


parental_edu_level_dict, parental_edu_level = _encode_labels(temp['parental level of education'])
ethnicity_dict, ethnicity = _encode_labels(temp['race/ethnicity'])
gender_dict, gender = _encode_labels(temp['gender'])

math_score = temp['math score']
reading_score = temp['reading score']

# Feature rows in a fixed column order:
# (gender, ethnicity, parental education, math score, reading score).
X = list(zip(gender, ethnicity, parental_edu_level, math_score, reading_score))

In [4]:
# Persist the three label -> code lookups to disk. The context manager flushes
# and closes the file before it is read back below.
with open("dictonaries.pkl", "wb") as dict_file:
    pickle.dump([gender_dict, ethnicity_dict, parental_edu_level_dict], dict_file)

# Round-trip check: load the file just written and display its contents.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("dictonaries.pkl", "rb") as dict_file:
    out = pickle.load(dict_file)

out

[{'female': 0, 'male': 1},
 {'group B': 1, 'group C': 2, 'group A': 0, 'group D': 3, 'group E': 4},
 {"bachelor's degree": 1,
  'some college': 4,
  "master's degree": 3,
  "associate's degree": 0,
  'high school': 2,
  'some high school': 5}]

In [5]:
# Fixed seed so the train/test split and the random forest produce the same
# numbers on every Restart & Run All; unseeded, each run reports different scores.
RANDOM_STATE = 42

# Hold out 15% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=RANDOM_STATE
)

linear_regression_model = LinearRegression()
lasso_model = Lasso()
random_forest_model = RandomForestRegressor(random_state=RANDOM_STATE)
svr_model = SVR()

linear_regression_model.fit(x_train, y_train)
lasso_model.fit(x_train, y_train)
random_forest_model.fit(x_train, y_train)
svr_model.fit(x_train, y_train)

# For scikit-learn regressors .score() returns the coefficient of determination
# (R^2) on the held-out data, not a classification accuracy. The acc_* names and
# the printed label are kept unchanged for compatibility.
acc_lr = linear_regression_model.score(x_test, y_test)
acc_lasso = lasso_model.score(x_test, y_test)
acc_rf = random_forest_model.score(x_test, y_test)
acc_svr = svr_model.score(x_test, y_test)

print(f"Accuracy: lr {acc_lr} lasso {acc_lasso} rf {acc_rf} svr {acc_svr}")

Accuracy: lr 0.9332063395339473 lasso 0.9167224337509088 rf 0.9221780829659164 svr 0.8771528058319301


In [6]:
# Hand-built example row, encoded with the lookup dicts and laid out in the
# column order of X: gender, ethnicity, parental education, math, reading.
sample_student = [
    gender_dict["male"],
    ethnicity_dict["group B"],
    parental_edu_level_dict["associate's degree"],
    50,
    70,
]

# predict expects a 2-D input, so the single row is wrapped in a list.
test = linear_regression_model.predict([sample_student])

test

array([62.13088813])

In [7]:
# Save the fitted Lasso model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("trained_models/lasso_model.pkl", "wb") as model_file:
    pickle.dump(lasso_model, model_file)

In [8]:
# Save the fitted linear-regression model; the context manager guarantees the
# file is flushed and closed even if pickling raises.
with open("trained_models/linear_regression_model.pkl", "wb") as model_file:
    pickle.dump(linear_regression_model, model_file)

In [9]:
# Save the fitted SVR model; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("trained_models/svr_model.pkl", "wb") as model_file:
    pickle.dump(svr_model, model_file)

In [10]:
# Save the fitted random-forest model; the context manager guarantees the file
# is flushed and closed even if pickling raises.
with open("trained_models/random_forest_model.pkl", "wb") as model_file:
    pickle.dump(random_forest_model, model_file)