In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'whitegrid')

import os
path = os.getcwd()
data_path = 'C:\\Users\\sunil\\Projects\\Dockship\\Exam Mark Prediction\\Dataset'
sub_path = 'C:\\Users\\sunil\\Projects\\Dockship\\Exam Mark Prediction\\Submissions'

#### Preprocessing ####
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, LeaveOneOut

le = LabelEncoder()
scaler = MinMaxScaler()

#### Models ####
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.svm import SVR

###
import lightgbm as lgbm

#### Evaluation ####
from sklearn.metrics import mean_squared_error

In [97]:
train = pd.read_csv(data_path+ '\\train.csv')
test = pd.read_csv(data_path+ '\\test.csv')
sample_sub = pd.read_csv(data_path+ '\\sample_submissions.csv')

In [98]:
train.drop('Unnamed: 0', axis = 1, inplace=True)
test.drop('Unnamed: 0', axis = 1, inplace=True)

---
# Preprocessing

In [99]:
df = pd.concat([train, test], axis = 0).reset_index(drop=True)

In [100]:
cat_cols = ['gender', 'ethnicity', 'parental level of education', 'lunch',
       'test preparation course', ]

num_cols = ['reading score', 'writing score']

In [101]:
#df[cat_cols] = df[cat_cols].apply(le.fit_transform)

df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

In [102]:
import re
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [103]:
train_proc, test_proc = df[:train.shape[0]], df[train.shape[0]:].reset_index(drop=True)

features = [col for col in test_proc.columns if col not in [target]]

In [104]:
features

['readingscore',
 'writingscore',
 'mathscore',
 'gender_male',
 'ethnicity_groupB',
 'ethnicity_groupC',
 'ethnicity_groupD',
 'ethnicity_groupE',
 'parentallevelofeducation_bachelorsdegree',
 'parentallevelofeducation_highschool',
 'parentallevelofeducation_mastersdegree',
 'parentallevelofeducation_somecollege',
 'parentallevelofeducation_somehighschool',
 'lunch_standard',
 'testpreparationcourse_none']

In [119]:
y = train_proc['mathscore']
kf = KFold(n_splits=5, shuffle=True, random_state=1)
oof = np.zeros(len(train))
score_list = []
fold = 1
test_preds = []
seed_list = [None,2,3]# Use more. Original list: [None,2,3,4,5]


for train_index, test_index in kf.split(train):
    X_train, X_val = train_proc.iloc[train_index], train_proc.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    
    

    #X_train = X_train.abs() # Taking aboslute was also a bit improving.

    y_pred_list = []
    for seed in seed_list:
        dtrain = lgbm.Dataset(X_train[features], y_train)
        dvalid = lgbm.Dataset(X_val[features], y_val)
        print(seed)
        params = {"objective": "regression",
              "metric": "rmse",
              "verbosity": -1,
              "boosting_type": "gbdt",
              "feature_fraction":0.5,
              "num_leaves": 250,
              #"lambda_l1":7,
              #"lambda_l2":2,
              "learning_rate":0.01,
              'min_child_samples': 35,
              "bagging_fraction":0.75,
              "bagging_freq":1,
             }
        params["seed"] = seed
        model = lgbm.train(params,
                        dtrain,
                        valid_sets=[dtrain, dvalid],
                        verbose_eval=100,
                        num_boost_round=100000,
                        early_stopping_rounds=50
                    )

        dtrain = lgbm.Dataset(X_train[features], y_train)
        dvalid = lgbm.Dataset(X_val[features], y_val)
        params = {"objective": "regression",
                  "metric": "rmse",
                  "verbosity": -1,
                  "boosting_type": "gbdt",
                  "feature_fraction":0.5,
                  "num_leaves": 350,
                  #"lambda_l1":7,
                  #"lambda_l2":1,
                  "learning_rate":0.003,
                  'min_child_samples': 35,
                  "bagging_fraction":0.8,
                  "bagging_freq":1,
                 }
        
        params["seed"] = seed
        model = lgbm.train(params,
                            dtrain,
                            valid_sets=[dtrain, dvalid],
                            verbose_eval=100,
                            num_boost_round=100000,
                            early_stopping_rounds=50,
                           init_model = model
                        )

    
    
        y_pred_list.append(model.predict(X_val[features]))
        print(np.sqrt(mean_squared_error(y_val,   np.mean(y_pred_list,axis=0)       )))
        test_preds.append(model.predict(test_proc[features]))
        
    
   
    
    oof[test_index] = np.mean(y_pred_list,axis=0)    
    score = np.sqrt(mean_squared_error(y_val, oof[test_index]))
    score_list.append(score)
    print(f"RMSE Fold-{fold} : {score}")
    fold+=1

np.mean(score_list)

None
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7.46697	valid_1's rmse: 7.4525
[200]	training's rmse: 4.33267	valid_1's rmse: 4.59473
[300]	training's rmse: 3.10659	valid_1's rmse: 3.45247
[400]	training's rmse: 2.66775	valid_1's rmse: 3.00665
[500]	training's rmse: 2.49275	valid_1's rmse: 2.81761
[600]	training's rmse: 2.38382	valid_1's rmse: 2.70347
[700]	training's rmse: 2.32302	valid_1's rmse: 2.64326
[800]	training's rmse: 2.26881	valid_1's rmse: 2.59157
[900]	training's rmse: 2.22248	valid_1's rmse: 2.55031
[1000]	training's rmse: 2.17849	valid_1's rmse: 2.50387
[1100]	training's rmse: 2.13822	valid_1's rmse: 2.46588
[1200]	training's rmse: 2.10477	valid_1's rmse: 2.43477
[1300]	training's rmse: 2.07389	valid_1's rmse: 2.40192
[1400]	training's rmse: 2.04646	valid_1's rmse: 2.38113
[1500]	training's rmse: 2.01903	valid_1's rmse: 2.3576
[1600]	training's rmse: 1.99741	valid_1's rmse: 2.34005
[1700]	training's rmse: 1.97432	valid_1's rmse: 2

Early stopping, best iteration is:
[3444]	training's rmse: 1.84523	valid_1's rmse: 2.20748
2.146763389858425
RMSE Fold-1 : 2.146763389858425
None
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 7.34865	valid_1's rmse: 8.03401
[200]	training's rmse: 4.2424	valid_1's rmse: 5.05871
[300]	training's rmse: 3.03871	valid_1's rmse: 3.87668
[400]	training's rmse: 2.6024	valid_1's rmse: 3.42776
[500]	training's rmse: 2.42737	valid_1's rmse: 3.24251
[600]	training's rmse: 2.32231	valid_1's rmse: 3.13908
[700]	training's rmse: 2.25443	valid_1's rmse: 3.07991
[800]	training's rmse: 2.19973	valid_1's rmse: 3.03752
[900]	training's rmse: 2.15112	valid_1's rmse: 2.99948
[1000]	training's rmse: 2.10965	valid_1's rmse: 2.96402
[1100]	training's rmse: 2.0708	valid_1's rmse: 2.93615
[1200]	training's rmse: 2.03963	valid_1's rmse: 2.91077
[1300]	training's rmse: 2.01126	valid_1's rmse: 2.88796
[1400]	training's rmse: 1.98434	valid_1's rmse: 2.86689
[1500]	training's rms

[5300]	training's rmse: 1.61195	valid_1's rmse: 2.53936
[5400]	training's rmse: 1.60955	valid_1's rmse: 2.53679
[5500]	training's rmse: 1.607	valid_1's rmse: 2.53482
[5600]	training's rmse: 1.60478	valid_1's rmse: 2.53294
[5700]	training's rmse: 1.60228	valid_1's rmse: 2.5309
[5800]	training's rmse: 1.59998	valid_1's rmse: 2.5294
[5900]	training's rmse: 1.59749	valid_1's rmse: 2.5286
[6000]	training's rmse: 1.59497	valid_1's rmse: 2.5265
[6100]	training's rmse: 1.5925	valid_1's rmse: 2.52472
[6200]	training's rmse: 1.58998	valid_1's rmse: 2.52295
[6300]	training's rmse: 1.58751	valid_1's rmse: 2.52095
[6400]	training's rmse: 1.58508	valid_1's rmse: 2.51897
[6500]	training's rmse: 1.58252	valid_1's rmse: 2.51675
[6600]	training's rmse: 1.58029	valid_1's rmse: 2.51488
[6700]	training's rmse: 1.57806	valid_1's rmse: 2.51284
[6800]	training's rmse: 1.57607	valid_1's rmse: 2.51111
Early stopping, best iteration is:
[6799]	training's rmse: 1.57608	valid_1's rmse: 2.51111
2.4972848703910446
3

[900]	training's rmse: 2.43649	valid_1's rmse: 2.35489
[1000]	training's rmse: 2.38478	valid_1's rmse: 2.32222
[1100]	training's rmse: 2.33816	valid_1's rmse: 2.29607
[1200]	training's rmse: 2.29433	valid_1's rmse: 2.25574
[1300]	training's rmse: 2.25737	valid_1's rmse: 2.22893
[1400]	training's rmse: 2.22074	valid_1's rmse: 2.20566
[1500]	training's rmse: 2.192	valid_1's rmse: 2.18852
[1600]	training's rmse: 2.16205	valid_1's rmse: 2.17614
[1700]	training's rmse: 2.13498	valid_1's rmse: 2.15756
[1800]	training's rmse: 2.11152	valid_1's rmse: 2.14668
[1900]	training's rmse: 2.0876	valid_1's rmse: 2.1273
[2000]	training's rmse: 2.06697	valid_1's rmse: 2.12451
Early stopping, best iteration is:
[1969]	training's rmse: 2.07291	valid_1's rmse: 2.12036
Training until validation scores don't improve for 50 rounds
[2000]	training's rmse: 2.07027	valid_1's rmse: 2.11858
[2100]	training's rmse: 2.06049	valid_1's rmse: 2.1129
[2200]	training's rmse: 2.05167	valid_1's rmse: 2.10705
[2300]	trainin

[2300]	training's rmse: 1.90561	valid_1's rmse: 1.98318
[2400]	training's rmse: 1.89047	valid_1's rmse: 1.97392
[2500]	training's rmse: 1.87612	valid_1's rmse: 1.9626
[2600]	training's rmse: 1.86241	valid_1's rmse: 1.9587
[2700]	training's rmse: 1.84889	valid_1's rmse: 1.95181
[2800]	training's rmse: 1.83698	valid_1's rmse: 1.94147
[2900]	training's rmse: 1.82457	valid_1's rmse: 1.92944
Early stopping, best iteration is:
[2940]	training's rmse: 1.81886	valid_1's rmse: 1.9271
Training until validation scores don't improve for 50 rounds
[3000]	training's rmse: 1.81633	valid_1's rmse: 1.92615
[3100]	training's rmse: 1.81113	valid_1's rmse: 1.92517
Early stopping, best iteration is:
[3074]	training's rmse: 1.81287	valid_1's rmse: 1.92451
1.9424033073648492
RMSE Fold-4 : 1.9424033073648492
None
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 6.97265	valid_1's rmse: 6.91192
[200]	training's rmse: 4.03992	valid_1's rmse: 3.80234
[300]	training's rmse: 3.078

2.175801200371677

In [111]:
print(score_list)
np.mean(score_list)

[2.146763389858425, 2.5020675803552264, 2.0570055183503952, 1.9424033073648492, 2.2307662059294873]


2.175801200371677

In [112]:
preds = np.mean(test_preds,axis=0)

In [113]:
sample_sub[target] = np.round(preds)
sample_sub.to_csv(sub_path+'\\base_cross_val.csv', index=False)

In [115]:
lgb = LGBMClassifier()

In [None]:
lgb.feature_importances_