In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Exploration

In [2]:
# Load the student-performance CSV (the path is relative to the working directory)
# and preview the first five rows.
stu_df = pd.read_csv('StudentsPerformance.csv')
stu_df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric score columns.
stu_df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [4]:
# Count missing values per column; a column of zeros means nothing needs imputing.
stu_df.isna().sum()

Unnamed: 0,0
gender,0
race/ethnicity,0
parental level of education,0
lunch,0
test preparation course,0
math score,0
reading score,0
writing score,0


In [8]:
# Class distribution of the column that is later used as the prediction target.
# The groups are imbalanced (group C is the largest, group A the smallest), which
# matters when interpreting the accuracy figures further down.
stu_df['race/ethnicity'].value_counts()

Unnamed: 0_level_0,count
race/ethnicity,Unnamed: 1_level_1
group C,319
group D,262
group B,190
group E,140
group A,89


# Feature Engineering

In [3]:
# Replace the spaces in the column names with underscores so they are easier to reference
# ('race/ethnicity' contains no space and is therefore left as it is).
stu_df.columns = [column.replace(' ', '_') for column in stu_df.columns]
stu_df.head()

Unnamed: 0,gender,race/ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Build the modelling frame: drop the remaining categorical attributes so that only
# 'race/ethnicity' and the three score columns are left.
num_df = stu_df.drop(
    labels=['gender', 'parental_level_of_education', 'lunch', 'test_preparation_course'],
    axis='columns',
)
num_df.head()

Unnamed: 0,race/ethnicity,math_score,reading_score,writing_score
0,group B,72,72,74
1,group C,69,90,88
2,group B,90,95,93
3,group A,47,57,44
4,group C,76,78,75


In [5]:
# Encode the target groups as integer class labels 1-5.
#
# The encoded column is assigned back explicitly rather than calling
# `.replace(..., inplace=True)` on `num_df['race/ethnicity']`. An in-place method on a
# selected column is chained assignment: it emits a FutureWarning in pandas 2.x and
# leaves `num_df` unchanged once Copy-on-Write is active.
#
# The labels are read from `stu_df` (only its column names are changed earlier, not its
# values), so re-running this cell gives the same result. `.map` turns any label missing
# from the mapping into NaN, and the integer cast then fails loudly instead of letting a
# bad label reach the models.
group_codes = {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5}
num_df['race/ethnicity'] = stu_df['race/ethnicity'].map(group_codes).astype('int64')
num_df.head()

Unnamed: 0,race/ethnicity,math_score,reading_score,writing_score
0,2,72,72,74
1,3,69,90,88
2,2,90,95,93
3,1,47,57,44
4,3,76,78,75


In [17]:
# Pairwise correlations (`corr` defaults to Pearson) between the encoded group and the scores.
# NOTE(review): 'race/ethnicity' is an integer code at this point, so its coefficients treat
# the A-E labels as ordered values — confirm that ordering is meaningful before relying on them.
num_df.corr()

Unnamed: 0,race/ethnicity,math_score,reading_score,writing_score
race/ethnicity,1.0,0.216415,0.145253,0.165691
math_score,0.216415,1.0,0.81758,0.802642
reading_score,0.145253,0.81758,1.0,0.954598
writing_score,0.165691,0.802642,0.954598,1.0


# Build Models

In [6]:
# linear SVC

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [7]:
# Target: the group label column; features: every other column of num_df (the scores).
y = num_df['race/ethnicity']
X = num_df.drop('race/ethnicity', axis='columns')

In [8]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# No `stratify` argument is passed, so class proportions may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [9]:
# Support-vector classifier with a linear kernel, fitted on the training split.
# The score features are passed in as-is (no scaling step is applied in this notebook).
SVM = SVC(kernel = 'linear', random_state = 0)
SVM.fit(X_train, y_train)

In [10]:
# Predict test-set labels with the linear SVM.
# The model is already fitted in the previous cell, so it is not re-fitted here;
# a second identical fit would only repeat the training work.
y_pred = SVM.predict(X_test)

In [28]:
# Confusion matrix for the predictions currently held in `y_pred`
# (scikit-learn convention: rows are true labels, columns are predicted labels).
# NOTE: `y_pred` is reassigned by the later model cells, so when this cell is run in
# notebook order it describes the SVM predictions.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[ 0  0 18  2  0]
 [ 0  0 38  5  0]
 [ 0  0 72  9  0]
 [ 0  0 59 16  0]
 [ 0  0 25  6  0]]


In [11]:
# Mean accuracy of the linear SVM on the held-out test split.
score = SVM.score(X_test, y_test)
print(score)

0.352


In [30]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

In [31]:
# Gaussian Naive Bayes with default settings, fitted on the same training split.
NB = GaussianNB()
NB.fit(X_train, y_train)

In [32]:
# Rebinds `y_pred` to the Naive Bayes predictions; the accuracy cell that follows calls
# `.score` directly and does not read this variable.
y_pred = NB.predict(X_test)

In [33]:
# Mean accuracy of Gaussian Naive Bayes on the test split (reuses the `score` name).
score = NB.score(X_test, y_test)
print(score)

0.268


In [34]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

In [36]:
# Decision tree using the Gini impurity criterion; random_state makes the fit repeatable.
# No depth or leaf-size limit is set here.
DTC = DecisionTreeClassifier(criterion = 'gini', random_state = 0)
DTC.fit(X_train, y_train)

In [37]:
# Rebinds `y_pred` to the decision-tree predictions; the accuracy cell that follows calls
# `.score` directly and does not read this variable.
y_pred = DTC.predict(X_test)

In [38]:
# Mean accuracy of the decision tree on the test split (reuses the `score` name).
score = DTC.score(X_test, y_test)
print(score)

0.272


In [39]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

In [40]:
# k-nearest-neighbours classifier voting over the 5 closest training rows.
KNN = KNeighborsClassifier(n_neighbors = 5)
KNN.fit(X_train, y_train)

In [41]:
# Rebinds `y_pred` to the KNN predictions; the accuracy cell that follows calls
# `.score` directly and does not read this variable.
y_pred = KNN.predict(X_test)

In [42]:
# Mean accuracy of KNN on the test split (reuses the `score` name).
score = KNN.score(X_test, y_test)
print(score)

0.3


# Save Model to Pkl

In [12]:
import pickle

In [14]:
# Persist the fitted linear SVM, which had the highest test accuracy printed above,
# to disk as a binary pickle.
with open('SVM_model.pkl', mode='wb') as model_file:
    pickle.dump(SVM, model_file)