In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot
import missingno as msno

import tensorflow as tf
from catboost import CatBoostRegressor

# **Importing and reading csv file**

In [57]:
data = pd.read_csv('../input/predict-test-scores-of-students/test_scores.csv')

print(data.shape)
data.head()


(2133, 11)


Unnamed: 0,school,school_setting,school_type,classroom,teaching_method,n_student,student_id,gender,lunch,pretest,posttest
0,ANKYI,Urban,Non-public,6OL,Standard,20.0,2FHT3,Female,Does not qualify,62.0,72.0
1,ANKYI,Urban,Non-public,6OL,Standard,20.0,3JIVH,Female,Does not qualify,66.0,79.0
2,ANKYI,Urban,Non-public,6OL,Standard,20.0,3XOWE,Male,Does not qualify,64.0,76.0
3,ANKYI,Urban,Non-public,6OL,Standard,20.0,556O0,Female,Does not qualify,61.0,77.0
4,ANKYI,Urban,Non-public,6OL,Standard,20.0,74LOE,Male,Does not qualify,64.0,76.0


# **Cleaning dataset**

In [58]:
data.drop(["student_id","classroom"], axis= 1, inplace= True)

## Changing school type to dummy var

In [59]:
data['school_type'].replace(to_replace = 'Non-public', value = 1, inplace=True)
data['school_type'].replace(to_replace = 'Public', value = 0, inplace=True)

## Changing teaching method to dummy var

In [60]:
data['teaching_method'].replace(to_replace = 'Experimental', value = 1, inplace=True)
data['teaching_method'].replace(to_replace = 'Standard', value = 0, inplace=True)

## Changing gender to dummy var

In [61]:
data['gender'].replace(to_replace = 'Male', value = 1, inplace=True)
data['gender'].replace(to_replace = 'Female', value = 0, inplace=True)

## Changing Lunch to dummy var

In [62]:
data['lunch'].replace(to_replace = 'Qualifies for reduced/free lunch', value = 1, inplace=True)
data['lunch'].replace(to_replace = 'Does not qualify', value = 0, inplace=True)

In [63]:
data.head(20)

Unnamed: 0,school,school_setting,school_type,teaching_method,n_student,gender,lunch,pretest,posttest
0,ANKYI,Urban,1,0,20.0,0,0,62.0,72.0
1,ANKYI,Urban,1,0,20.0,0,0,66.0,79.0
2,ANKYI,Urban,1,0,20.0,1,0,64.0,76.0
3,ANKYI,Urban,1,0,20.0,0,0,61.0,77.0
4,ANKYI,Urban,1,0,20.0,1,0,64.0,76.0
5,ANKYI,Urban,1,0,20.0,0,0,66.0,74.0
6,ANKYI,Urban,1,0,20.0,1,0,63.0,75.0
7,ANKYI,Urban,1,0,20.0,0,0,63.0,72.0
8,ANKYI,Urban,1,0,20.0,1,0,64.0,77.0
9,ANKYI,Urban,1,0,20.0,0,0,61.0,72.0


In [64]:
data.info()
data.describe()
for d in data.columns:
    print("{0} : {1}".format(d, data[d].nunique()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2133 entries, 0 to 2132
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   school           2133 non-null   object 
 1   school_setting   2133 non-null   object 
 2   school_type      2133 non-null   int64  
 3   teaching_method  2133 non-null   int64  
 4   n_student        2133 non-null   float64
 5   gender           2133 non-null   int64  
 6   lunch            2133 non-null   int64  
 7   pretest          2133 non-null   float64
 8   posttest         2133 non-null   float64
dtypes: float64(3), int64(4), object(2)
memory usage: 150.1+ KB
school : 23
school_setting : 3
school_type : 2
teaching_method : 2
n_student : 18
gender : 2
lunch : 2
pretest : 69
posttest : 68


In [65]:
categories = ['school','school_setting', 'school_type', 'teaching_method', 'gender', 'lunch']
data = pd.get_dummies(data, columns=categories, drop_first=True)
data.head()

Unnamed: 0,n_student,pretest,posttest,school_CCAAW,school_CIMBB,school_CUQAM,school_DNQDD,school_FBUMG,school_GJJHK,school_GOKXL,...,school_VKWQH,school_VVTVA,school_ZMNYA,school_ZOWMK,school_setting_Suburban,school_setting_Urban,school_type_1,teaching_method_1,gender_1,lunch_1
0,20.0,62.0,72.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
1,20.0,66.0,79.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
2,20.0,64.0,76.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
3,20.0,61.0,77.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,20.0,64.0,76.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0


#### Shuffle the data

In [66]:
data = data.sample(frac=1).reset_index(drop=True)

In [67]:
data.head(20)

Unnamed: 0,n_student,pretest,posttest,school_CCAAW,school_CIMBB,school_CUQAM,school_DNQDD,school_FBUMG,school_GJJHK,school_GOKXL,...,school_VKWQH,school_VVTVA,school_ZMNYA,school_ZOWMK,school_setting_Suburban,school_setting_Urban,school_type_1,teaching_method_1,gender_1,lunch_1
0,16.0,69.0,87.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
1,26.0,36.0,59.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
2,30.0,34.0,53.0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,1,1
3,21.0,59.0,70.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21.0,59.0,71.0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,24.0,42.0,50.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
6,28.0,58.0,65.0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
7,17.0,68.0,85.0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
8,21.0,58.0,71.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,22.0,43.0,57.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Preparing for Training:
    Assigning X and Y

In [68]:
X = data.drop('posttest', axis=1)
y = data['posttest']

In [69]:
X

Unnamed: 0,n_student,pretest,school_CCAAW,school_CIMBB,school_CUQAM,school_DNQDD,school_FBUMG,school_GJJHK,school_GOKXL,school_GOOBU,...,school_VKWQH,school_VVTVA,school_ZMNYA,school_ZOWMK,school_setting_Suburban,school_setting_Urban,school_type_1,teaching_method_1,gender_1,lunch_1
0,16.0,69.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
1,26.0,36.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
2,30.0,34.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,1,1
3,21.0,59.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21.0,59.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2128,25.0,37.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
2129,24.0,40.0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,1,1
2130,22.0,80.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2131,21.0,79.0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [70]:
y

0       87.0
1       59.0
2       53.0
3       70.0
4       71.0
        ... 
2128    49.0
2129    49.0
2130    87.0
2131    89.0
2132    74.0
Name: posttest, Length: 2133, dtype: float64

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [74]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluation(y_true, y_pred):
    print("R2 score: ", r2_score(y_true, y_pred))
    print("Mean Absolute Error: ", mean_absolute_error(y_true, y_pred))
    print("Mean Squared Error: ", mean_squared_error(y_true, y_pred))

In [81]:
import lightgbm as lgbm


lgb_model = lgbm.LGBMRegressor(loss_function='RMSE', random_state=142)

lgb_model.fit(X_train, y_train, 
        eval_set=[(X_test, y_test)],  
        early_stopping_rounds=100, 
        verbose=20)
pred = lgb_model.predict(X_test)
print(np.sqrt(mean_squared_error(pred, y_test)))
evaluation(y_true=y_test, y_pred = lgb_model.predict(X_test))

Training until validation scores don't improve for 100 rounds
[20]	valid_0's l2: 12.3875
[40]	valid_0's l2: 9.35822
[60]	valid_0's l2: 9.46198
[80]	valid_0's l2: 9.4874
[100]	valid_0's l2: 9.5302
Did not meet early stopping. Best iteration is:
[41]	valid_0's l2: 9.35165
3.0580461539818895
R2 score:  0.9510414801498062
Mean Absolute Error:  2.354131822617555
Mean Squared Error:  9.351646279883427


In [75]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
evaluation(y_true=y_test, y_pred = lr.predict(X_test))

R2 score:  0.9466147702537123
Mean Absolute Error:  2.5136311486994742
Mean Squared Error:  10.197199316588758


In [83]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

ridge_cv = RidgeCV(alphas = np.logspace(-10, 6, 20))
ridge_cv.fit(X_train, y_train)
ridge = Ridge(alpha = ridge_cv.alpha_)
ridge.fit(X_train, y_train)
evaluation(y_true=y_test, y_pred = ridge.predict(X_test))

R2 score:  0.946586477824363
Mean Absolute Error:  2.5119134994496135
Mean Squared Error:  10.20260349940481
