# 1. Package Import

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor

# 2. Data Load

In [5]:
X_test = pd.read_csv("Student_Performance_X_test.csv")
X_train = pd.read_csv("Student_Performance_X_train.csv")
y_train = pd.read_csv("Student_Performance_y_train.csv")

In [8]:
X_train.head()

Unnamed: 0,StudentID,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
0,1714,GP,F,18,U,GT3,T,4,3,other,...,no,4,3,3,1,1,3,0,14,13
1,1254,GP,F,17,U,GT3,T,4,3,health,...,yes,4,4,3,1,3,4,0,13,15
2,1639,GP,F,16,R,GT3,T,4,4,health,...,no,2,4,4,2,3,4,6,10,11
3,1118,GP,M,16,U,GT3,T,4,4,services,...,no,5,3,3,1,3,5,0,15,13
4,1499,GP,M,19,U,GT3,T,3,2,services,...,yes,4,5,4,1,1,4,0,5,0


In [7]:
X_test.head()

Unnamed: 0,StudentID,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2
0,1000,GP,F,16,U,GT3,T,4,2,services,...,no,4,2,3,1,1,5,2,15,16
1,1008,GP,M,19,U,GT3,T,1,2,other,...,no,4,5,2,2,2,4,3,13,11
2,1013,GP,F,16,U,GT3,T,4,4,services,...,no,3,2,3,1,2,2,6,13,14
3,1014,GP,F,16,U,GT3,T,3,1,services,...,no,4,3,3,1,2,5,4,7,7
4,1017,GP,F,15,U,LE3,A,3,4,other,...,yes,5,3,2,1,1,1,0,10,11


In [9]:
y_train.head()

Unnamed: 0,StudentID,G3
0,1714,14
1,1254,15
2,1639,11
3,1118,13
4,1499,0


# 3. Data Preprocessing

## 3.1. EDA(Exploratory Data Analysis)

In [14]:
data = [X_train, X_test, y_train]
for datum in data:
    print(datum.describe())
    print()
    print(datum.info())
    print()
    print(datum.corr())
    print()
    print(datum.columns)
    print()
    print('*  ' * 40)

         StudentID         age        Medu        Fedu  traveltime  \
count   678.000000  678.000000  678.000000  678.000000   678.00000   
mean   1516.181416   16.719764    2.607670    2.371681     1.50590   
std     303.029891    1.239881    1.134682    1.096750     0.73675   
min    1001.000000   15.000000    0.000000    0.000000     1.00000   
25%    1253.250000   16.000000    2.000000    1.000000     1.00000   
50%    1508.000000   17.000000    3.000000    2.000000     1.00000   
75%    1780.750000   18.000000    4.000000    3.000000     2.00000   
max    2043.000000   22.000000    4.000000    4.000000     4.00000   

        studytime    failures      famrel    freetime       goout        Dalc  \
count  678.000000  678.000000  678.000000  678.000000  678.000000  678.000000   
mean     1.989676    0.261062    3.933628    3.233038    3.140118    1.480826   
std      0.848082    0.664340    0.952346    1.042083    1.165537    0.895789   
min      1.000000    0.000000    1.000000    

In [17]:
del_cols = ['StudentID']
num_cols = ['age', 'traveltime', 'studytime','failures', 'absences', 'G1', 'G2']
cat_cols = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup',  'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health']
y_col = ['G3']
X_cols = num_cols + cat_cols

## 3.2 Missing Values

In [18]:
for datum in data:
    print(datum.isnull().sum())
    print('*  ' * 40)
    print()

StudentID     0
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
dtype: int64
*  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  *  

StudentID     0
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
h

## 3.3 Encoding

In [19]:
X = pd.concat([X_train, X_test])
for col in cat_cols:
    print(X[col].unique())

le = LabelEncoder()

for col in cat_cols:
    le.fit(X[col])
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

['GP' 'MS']
['F' 'M']
['U' 'R']
['GT3' 'LE3']
['T' 'A']
[4 3 1 2 0]
[3 4 2 0 1]
['other' 'health' 'services' 'at_home' 'teacher']
['other' 'teacher' 'services' 'at_home' 'health']
['reputation' 'other' 'course' 'home']
['father' 'mother' 'other']
['no' 'yes']
['yes' 'no']
['yes' 'no']
['no' 'yes']
['yes' 'no']
['yes' 'no']
['yes' 'no']
['no' 'yes']
[4 2 5 3 1]
[3 4 5 2 1]
[3 4 2 5 1]
[1 2 3 5 4]
[1 3 2 4 5]
[3 4 5 1 2]


# 4. Modeling

## 4.1. Seperate Data Set

In [20]:
X = X_train[X_cols]
X_test_fin = X_test[X_cols]
y = y_train[y_col]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

## 4.2. Data Scaling

In [21]:
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols]) 
X_test_fin[num_cols] = scaler.transform(X_test_fin[num_cols]) 
X_val[num_cols] = scaler.transform(X_val[num_cols]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fin[num_cols] = scaler.transform(X_test_fin[num_cols])


## 4.3. Data Modeling

In [22]:
rf1 = RandomForestRegressor(max_depth=2, random_state=0)
rf1.fit(X_train, y_train.values.ravel())
rf1_pred = rf1.predict(X_val)

xgb1 = XGBRegressor(random_state=0)
xgb1.fit(X_train, y_train)
xgb1_pred = xgb1.predict(X_val)

## 4.4. Model Evaluation

In [23]:
rf1_score = r2_score(y_val, rf1_pred)
xgb1_score = r2_score(y_val, xgb1_pred)
print(f'튜닝 전 -> random forest score: {rf1_score}, xgboost: {xgb1_score}')

튜닝 전 -> random forest score: 0.8137288134470838, xgboost: 0.8065366883847467


## 4.5. Hyper-parameter Tuning

In [10]:
rf_parameters = {
    'max_depth':[2,4,8], 
    'min_samples_leaf':[1, 2, 3],
    'min_samples_split': [2,4,6],
    'n_estimators':[100,500,1000]
    }
rf_cv = GridSearchCV(rf1, rf_parameters, scoring = 'r2', cv=5)
rf_cv.fit(X_train, y_train.values.ravel())
print(rf_cv.best_params_)

xgb1_parameters = {
    'colsample_bytree':[0.6, 0.8, 1.0], 
    'learning_rate':[0.1, 0.2, 0.01],
    'max_depth': [3,6,9],
    'n_estimators':[100,500,1000]
    }
xgb_cv = GridSearchCV(xgb1, xgb1_parameters, scoring = 'r2', cv=5)
xgb_cv.fit(X_train, y_train)
print(xgb_cv.best_params_)

{'max_depth': 6, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 500}
{'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}


## 4.6. Model Selections

In [11]:
rf2 = RandomForestRegressor(max_depth=6, min_samples_leaf=3, min_samples_split=2, n_estimators=500, random_state=0)
rf2.fit(X_train, y_train.values.ravel())
rf2_pred = rf2.predict(X_val)

xgb2 = XGBRegressor(colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=500, random_state=0)
xgb2.fit(X_train, y_train)
xgb2_pred = xgb2.predict(X_val)

rf2_score = r2_score(y_val, rf2_pred)
xgb2_score = r2_score(y_val, xgb2_pred)
print(f'튜닝 후 -> random forest score: {rf2_score}, xgboost: {xgb2_score}')

튜닝 후 -> random forest score: 0.8571565086819269, xgboost: 0.8449625507322935


# 5. Conclusion

In [12]:
rf2_pred_X_test = rf2.predict(X_test_fin)
result = pd.DataFrame({'StudentID':X_test.StudentID, 'G3':rf2_pred_X_test})
print(result.head(10))
result_score = r2_score(y_test.iloc[:,1], rf2_pred_X_test)
print('R2 score: ',result_score.round(4))
result.to_csv('1234.csv',index=False)

   StudentID         G3
0       1000  16.186060
1       1008  11.218828
2       1013  14.249947
3       1014   7.028670
4       1017  10.648040
5       1018  14.298236
6       1021  17.765441
7       1024  12.978773
8       1025  13.043516
9       1031  12.650214
R2 score:  0.8249
