# Student Performance Data Set
- Dataset: [Student Performance Data Set](https://archive.ics.uci.edu/ml/datasets/student+performance)
- Related notebook: [ch10-xgboost-model-deployment.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch10-xgboost-model-deployment.ipynb)

In [2]:
import pandas as pd
import warnings

# Hide every Python warning so it does not clutter the cell output.
warnings.filterwarnings(action='ignore')

# First attempt: read the CSV with pandas' default (comma) separator.
# The preview shows all fields packed into a single column because the
# file is actually semicolon-delimited.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv'
)
df.head(n=5)

Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;NaN;18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";NaN;""U"";""GT3"";""T"";1;1;""at_home"";""other""..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [3]:
# Reload the same file, this time declaring ';' as the field separator so
# each attribute lands in its own column.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter10/student-por.csv',
    sep=';',
)
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [4]:
# Number of missing values in each column.
df.isna().sum()

school        0
sex           1
age           1
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      1
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [5]:
# Show only the rows that contain at least one missing value.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11


In [6]:
# Lift the column-display limit so wide frames are rendered without '...' elision.
pd.set_option('display.max_columns', None)

In [7]:
# Same filter as before, now rendered with every column visible so all
# missing cells can be seen.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,,18.0,U,GT3,A,4,4,at_home,teacher,course,,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11


In [8]:
# Replace the missing age with the sentinel -999.0. The same value is later
# passed to XGBRegressor(missing=-999.0) when the model is built.
df['age'] = df['age'].fillna(-999.0)

In [9]:
# Impute the categorical gaps with each column's most frequent value.
# Series.mode() returns a Series (indexed 0..k-1), and fillna() aligns a
# Series argument on the index, so passing it directly would only fill NaNs
# whose row label happens to exist in that mode Series. Take the first mode
# as a scalar so every missing entry is filled regardless of its position.
df['sex'] = df['sex'].fillna(df['sex'].mode().iloc[0])
df['guardian'] = df['guardian'].fillna(df['guardian'].mode().iloc[0])

In [10]:
# Preview the frame after imputation: row 1 carries the -999.0 age sentinel
# and row 0 has its sex/guardian gaps filled.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18.0,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11
1,GP,F,-999.0,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11
2,GP,F,15.0,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12
3,GP,F,15.0,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14
4,GP,F,16.0,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13


In [12]:
#df.to_csv('student-por_cleaned.csv', index=False)

### one-hot encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Names of the object-dtype (string-valued) columns that need encoding.
categorical_columns = df.select_dtypes(include=[object]).columns.tolist()

# Fit the encoder on those columns; the result is a SciPy sparse matrix
# with one indicator column per category level.
ohe = OneHotEncoder()
hot = ohe.fit_transform(df[categorical_columns])

# Densify only to get a readable preview of the encoded block.
hot_df = pd.DataFrame(hot.toarray())
hot_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [14]:
# Printing the sparse matrix lists each stored entry as "(row, column) value".
print(hot)

  (0, 0)	1.0
  (0, 2)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 8)	1.0
  (0, 10)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 25)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 31)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 38)	1.0
  (0, 39)	1.0
  (0, 41)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 5)	1.0
  (1, 6)	1.0
  (1, 9)	1.0
  (1, 10)	1.0
  (1, 17)	1.0
  (1, 20)	1.0
  :	:
  (647, 27)	1.0
  (647, 29)	1.0
  (647, 31)	1.0
  (647, 33)	1.0
  (647, 35)	1.0
  (647, 38)	1.0
  (647, 40)	1.0
  (647, 41)	1.0
  (648, 1)	1.0
  (648, 3)	1.0
  (648, 4)	1.0
  (648, 7)	1.0
  (648, 9)	1.0
  (648, 13)	1.0
  (648, 17)	1.0
  (648, 20)	1.0
  (648, 25)	1.0
  (648, 27)	1.0
  (648, 29)	1.0
  (648, 31)	1.0
  (648, 33)	1.0
  (648, 35)	1.0
  (648, 38)	1.0
  (648, 40)	1.0
  (648, 41)	1.0


In [15]:
# The bare repr summarises the matrix: shape, dtype, number of stored
# elements and storage format.
hot

<649x43 sparse matrix of type '<class 'numpy.float64'>'
	with 11033 stored elements in Compressed Sparse Row format>

In [16]:
# Keep the columns that were NOT one-hot encoded, i.e. everything whose
# dtype is not object (the numeric attributes and the G1/G2/G3 grades).
cold_df = df.select_dtypes(exclude=[object])
cold_df.head(n=5)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,18.0,4,4,2,2,0,4,3,4,1,1,3,4,0,11,11
1,-999.0,1,1,1,2,0,5,3,3,1,1,3,2,9,11,11
2,15.0,1,1,1,2,0,4,3,2,2,3,3,6,12,13,12
3,15.0,4,2,1,3,0,3,2,2,1,1,5,0,14,14,14
4,16.0,3,3,1,2,0,4,3,2,1,2,5,0,11,13,13


In [17]:
from scipy.sparse import csr_matrix, hstack

# Convert the numeric columns to sparse form so they can be joined with the
# one-hot block, then place them to the right of the encoded columns.
cold = csr_matrix(cold_df)
final_sparse_matrix = hstack([hot, cold])

# Dense copy with positional integer column labels: one-hot columns first,
# followed by the numeric columns in their original order.
final_df = pd.DataFrame(final_sparse_matrix.toarray())
final_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,18.0,4.0,4.0,2.0,2.0,0.0,4.0,3.0,4.0,1.0,1.0,3.0,4.0,0.0,11.0,11.0
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,-999.0,1.0,1.0,1.0,2.0,0.0,5.0,3.0,3.0,1.0,1.0,3.0,2.0,9.0,11.0,11.0
2,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,15.0,1.0,1.0,1.0,2.0,0.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,15.0,4.0,2.0,1.0,3.0,0.0,3.0,2.0,2.0,1.0,1.0,5.0,0.0,14.0,14.0,14.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,16.0,3.0,3.0,1.0,2.0,0.0,4.0,3.0,2.0,1.0,2.0,5.0,0.0,11.0,13.0,13.0


In [18]:
### XGBoost regression

In [29]:
# Features: every column except the last three (G1, G2, G3), so the earlier
# period grades G1/G2 are not used as inputs.
# Target: the last column, which is G3.
X, y = final_df.iloc[:, :-3], final_df.iloc[:, -1]

In [30]:
from sklearn.model_selection import train_test_split
# Hold out a test set. No test_size is given, so scikit-learn's default
# proportion applies; random_state=2 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [31]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error as MSE
from xgboost import XGBRegressor

In [32]:
# Frequency of each target value in the training split.
y_train.value_counts()

11.0    82
10.0    75
13.0    58
12.0    53
14.0    42
15.0    36
9.0     29
16.0    27
8.0     26
17.0    24
18.0    14
0.0     10
7.0      7
5.0      1
19.0     1
6.0      1
Name: 58, dtype: int64

In [33]:
# Shuffled 5-fold splitter with a fixed seed, reused by cross_val() below.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [34]:
def cross_val(model):
    """Return the mean cross-validated RMSE of ``model`` on the training split.

    The model is scored with ``cross_val_score`` on the notebook-level
    ``X_train`` / ``y_train`` using the notebook-level ``kfold`` splitter.

    Parameters
    ----------
    model : estimator
        Object passed straight through to ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values (positive; lower is better).
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=kfold,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer reports negated RMSE, so flip the sign of the mean.
    return -fold_scores.mean()

In [35]:
# Baseline: default XGBRegressor scored by cross-validated RMSE.
# missing=-999.0 matches the sentinel written into `age` earlier, so XGBoost
# treats that placeholder as a missing value.
cross_val(XGBRegressor(missing=-999.0))

2.9004041754792746