In [1]:
import json
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge, Lasso, Ridge
from sklearn.svm import SVR

### Preprocessing:

In [2]:
# Read the pre-processed course dataset and preview its first rows.
dataset_path = '../datasets/processed_course_clustering_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Course Code,Course Title,Student Number,Department Code,Course Level,Letter Grade,Status,GPA,Standing,Completed Credits,...,C rate,C- rate,D+ rate,D rate,D- rate,F rate,Mean GPA - Students taken,Mean Grade - Students taken,STDEV GPA - Students taken,STDEV Grade - Students taken
0,MGT 511,Küresel Yönetim Düşüncesi,1182,MBA/NT,Graduate,A-,Successful,3.54,Freshman,15,...,0.0,0.0,0.0,0.0,0.0,0.018868,3.363396,3.479245,0.58324,0.712352
1,UNI 123,Textual Analysis and Effective Communication,134,EE,Undergraduate,C-,Successful,3.67,Freshman,18,...,0.045161,0.051613,0.03871,0.058065,0.032258,0.187097,2.41,2.17871,1.082217,1.378734
2,UNI 115,Turkish for International Students I,217,CTV,Undergraduate,A-,Successful,3.7,Freshman,3,...,0.0,0.0,0.0,0.0,0.0,0.04878,2.892927,3.360976,1.048528,0.917845
3,MGT 511,Küresel Yönetim Düşüncesi,1227,MBA/NT,Graduate,B,Successful,3.0,Freshman,15,...,0.0,0.0,0.0,0.0,0.0,0.018868,3.363396,3.479245,0.58324,0.712352
4,MGT 541,Uluslararası Pazarlama,1225,MBA/NT,Graduate,B+,Successful,3.33,Freshman,9,...,0.0,0.0,0.0,0.0,0.0,0.095238,3.285238,3.238095,0.526513,1.155195


In [3]:
# Drop the leftover CSV index column and the course identifiers by name.
# Selecting them by position (df.columns[0..2]) is not idempotent: running this
# cell a second time would silently remove the next three feature columns.
# With explicit names a re-run fails loudly with a KeyError instead.
df = df.drop(columns=['Unnamed: 0', 'Course Code', 'Course Title'])

In [4]:
# One-hot encode each categorical feature, append the indicator columns to the
# frame, then remove the original categorical columns.
categorical_columns = ['Course Year', 'Subject', 'Department Code', 'Course Level', 'Standing', 'Status']
dummy_frames = [pd.get_dummies(df[name], prefix=name) for name in categorical_columns]
df = pd.concat([df, *dummy_frames], axis=1)
df = df.drop(columns=categorical_columns)

In [5]:
# Fit a label encoder on the full set of letter grades.
# NOTE(review): LabelEncoder keeps its classes in sorted order, so the integer
# codes follow lexicographic order ('A' < 'A+' < 'A-'), not the order listed here.
letter_grades = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']
le = LabelEncoder().fit(letter_grades)
le

LabelEncoder()

In [7]:
# Remove the 'Semester' column from the feature frame.
df = df.drop(columns=['Semester'])

In [9]:
# Encode the target as an ordinal rank: 0 for the best grade ('A+') up to 12 for 'F'.
# LabelEncoder stores its classes in sorted order, so `le.transform` yields
# 'A' -> 0, 'A+' -> 1, 'A-' -> 2, ... which is not monotonic in grade quality and
# distorts a regression target. An explicit mapping keeps the intended order.
grade_order = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']
grade_to_rank = {grade: rank for rank, grade in enumerate(grade_order)}

# Validate before popping so that a failure leaves `df` untouched.
unexpected_grades = sorted(set(df['Letter Grade'].unique()) - set(grade_order), key=str)
if unexpected_grades:
    raise ValueError(f'Unexpected letter grades in dataset: {unexpected_grades}')

# Remove the target column from the features and convert it to an integer array.
y = df.pop('Letter Grade').map(grade_to_rank).to_numpy()

In [12]:
# Standardise every feature column and keep the column labels on the result.
# NOTE(review): the scaler is fitted on the whole dataset before the grid
# searches below run their cross-validation, so validation folds influence the
# scaling statistics; a Pipeline would avoid that.
columns = df.columns
sc = StandardScaler()
scaled_values = sc.fit_transform(df)
X = pd.DataFrame(data=scaled_values, columns=columns)

### GridSearchCV

In [46]:
# Collects the best hyperparameters per estimator, keyed by estimator class name.
tuned_params = dict()

In [None]:
# Exhaustive grid search over GradientBoostingRegressor hyperparameters.
# The loss names 'ls' and 'lad' were deprecated in scikit-learn 1.0 and removed
# in 1.2; 'squared_error' and 'absolute_error' are their replacements.
parameters = {'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
              'n_estimators': [100, 200, 400, 800, 1600],
              'learning_rate': [0.1, 0.01, 0.001, 0.0001],
              'max_depth': [3, 9, 27, 81, 243]}

gridsearch = GridSearchCV(GradientBoostingRegressor(), parameters)
gridsearch.fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['GradientBoostingRegressor'] = gridsearch.best_params_

In [None]:
# Grid search over the number of trees for RandomForestRegressor:
# 100, 200, 400, 800, 1600 (doubling at each step).
parameters = dict(n_estimators=[100 * 2 ** step for step in range(5)])
gridsearch = GridSearchCV(estimator=RandomForestRegressor(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['RandomForestRegressor'] = gridsearch.best_params_

In [None]:
# Grid search over ensemble size and learning rate for AdaBoostRegressor.
# n_estimators doubles from 50 up to 1600.
parameters = dict(
    n_estimators=[50 * 2 ** step for step in range(6)],
    learning_rate=[1, 0.1, 0.01, 0.001, 0.0001],
)
gridsearch = GridSearchCV(estimator=AdaBoostRegressor(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['AdaBoostRegressor'] = gridsearch.best_params_

In [None]:
# Grid search over the number of base estimators for BaggingRegressor:
# 10, 20, 40, 80, 160, 320 (doubling at each step).
parameters = dict(n_estimators=[10 * 2 ** step for step in range(6)])
gridsearch = GridSearchCV(estimator=BaggingRegressor(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['BaggingRegressor'] = gridsearch.best_params_

In [None]:
# Grid search over the regularisation strength `alpha` for Ridge regression.
parameters = dict(alpha=[1.0, 2.0, 4.0, 8.0])
gridsearch = GridSearchCV(estimator=Ridge(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['Ridge'] = gridsearch.best_params_

In [None]:
# Grid search over the regularisation strength `alpha` for Lasso regression.
parameters = dict(alpha=[1.0, 2.0, 4.0, 8.0])
gridsearch = GridSearchCV(estimator=Lasso(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['Lasso'] = gridsearch.best_params_

In [None]:
# Grid search over the four prior hyperparameters of BayesianRidge.
# Every hyperparameter is searched over the same five candidate values.
prior_candidates = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
parameters = {name: list(prior_candidates)
              for name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2')}
gridsearch = GridSearchCV(estimator=BayesianRidge(), param_grid=parameters).fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['BayesianRidge'] = gridsearch.best_params_

In [None]:
# Grid search over kernel, penalty C and epsilon-tube width for SVR.
# The parameter is spelled `epsilon`; the previous key 'epsilom' is not an SVR
# parameter, so GridSearchCV rejected the grid instead of tuning it.
parameters = {'kernel': ['rbf', 'linear'],
              'C': [1.0, 2.0, 4.0, 8.0],
              'epsilon': [1e-1, 1e-2, 1e-3, 1e-4]}

gridsearch = GridSearchCV(SVR(), parameters)
gridsearch.fit(X, y)

# Store the best parameter combination under the estimator's class name.
tuned_params['SVR'] = gridsearch.best_params_

In [None]:
# Write the collected best parameters to a JSON file in the working directory.
with open('course_based_tuned_hyperparams.json', mode='w') as fw:
    fw.write(json.dumps(tuned_params))