In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
import hashlib
import os
import pickle
import json
import joblib

from sklearn.feature_selection import mutual_info_classif
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,classification_report,auc,precision_score,recall_score,precision_recall_curve,median_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb


### Data loading

In [2]:
# Locations of the competition CSVs, relative to the notebook's working directory.
train_path = os.path.join("data", "train.csv")
test_path = os.path.join("data", "test.csv")

# Labelled training data, indexed by the 'id' column.
with open(train_path) as train_file:
    df = pd.read_csv(train_file, index_col='id')

# Test data read the same way; the visible cells do not select a target column from it.
with open(test_path) as test_file:
    X_test = pd.read_csv(test_file, index_col='id')

# Split the training frame into the target ('Hardness') and the remaining feature columns.
y = df['Hardness']
X = df.drop(columns=['Hardness'])

### Baseline Model

In [3]:
# Folder holding the serialized baseline artifacts.
baseline_dir = 'pickles/baseline'

# NOTE: pickle.load / joblib.load can execute arbitrary code while
# deserializing, so only load artifacts that come from a trusted source.
# NOTE(review): `columns` and `dtypes` are loaded but not used by any cell
# visible here (`features` is taken from X.columns instead) -- confirm intent.
with open(os.path.join(baseline_dir, 'columns.json')) as fh:
    columns = json.load(fh)

with open(os.path.join(baseline_dir, 'dtypes.pickle'), 'rb') as fh:
    dtypes = pickle.load(fh)

with open(os.path.join(baseline_dir, 'pipeline.pickle'), 'rb') as fh:
    pipeline_base = joblib.load(fh)

# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# The baseline is trained on every available feature column.
features = X.columns
X_train_baseline = X_train[features]

pipeline_base.fit(X_train_baseline, y_train)

In [4]:
# Score the fitted baseline on the held-out validation split.
y_val_pred = pipeline_base.predict(X_val[features])
baseline_medae = median_absolute_error(y_val, y_val_pred)

print('Median Absolute Error score of {}: {}'.format('baseline', baseline_medae))

Median Absolute Error score of baseline: 0.9555663913173742


### Model improvement

In [5]:
# Candidate models, each wrapped in a scale-then-regress pipeline.
# Every pipeline gets its OWN MinMaxScaler: Pipeline.fit fits its steps in
# place (no cloning), so one shared scaler instance would be refit by each
# pipeline and silently shared between them.
#
# NOTE(review): LightGBM treats any max_depth <= 0 as "no depth limit", so the
# previous max_depth=-15 did NOT cap the depth at 15 like the two models below.
# -1 is the documented spelling of "no limit" and leaves the fitted model (and
# the reported scores) unchanged; switch to 15 if a depth cap was intended.
pipeline_light = Pipeline(steps=[
    ('processing', preprocessing.MinMaxScaler()),
    ('regressor', lgb.LGBMRegressor(learning_rate=0.09, max_depth=-1, random_state=42)),
])

pipeline_rf = Pipeline(steps=[
    ('processing', preprocessing.MinMaxScaler()),
    ('regressor', RandomForestRegressor(max_depth=15, random_state=42)),
])

pipeline_dt = Pipeline(steps=[
    ('processing', preprocessing.MinMaxScaler()),
    ('regressor', DecisionTreeRegressor(max_depth=15, random_state=42)),
])

# `pipelines` and `pipe_dict` are parallel: pipe_dict[i] is the display name
# of pipelines[i].
pipelines = [pipeline_base, pipeline_light, pipeline_rf, pipeline_dt]
pipe_dict = {0: 'Baseline', 1: 'LightGBM', 2: 'Random Forest', 3: 'Decision Tree'}

# (Re)fit every candidate on the same training split so the comparison is fair;
# this includes the baseline pipeline.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Validation median absolute error per model, keyed by display name.
prediction_dict = {}
for i, model in enumerate(pipelines):
    y_val_pred = model.predict(X_val)
    prediction_dict[pipe_dict[i]] = median_absolute_error(y_val, y_val_pred)

# Pick the lowest MedAE. Using min() over the collected scores (instead of a
# running "best so far" seeded with a magic threshold) guarantees that
# best_regressor / best_pipeline are always bound, whatever the scores are.
# Ties resolve to the earliest pipeline, matching a strict `<` comparison.
best_regressor = min(pipe_dict, key=lambda idx: prediction_dict[pipe_dict[idx]])
best_pipeline = pipelines[best_regressor]
best_medae = prediction_dict[pipe_dict[best_regressor]]

print('Best Median Absolute Error with {} | MedAE: {}'.format(pipe_dict[best_regressor], best_medae))

Best Median Absolute Error with LightGBM | MedAE: 0.6452822745961022


In [6]:
# Display the validation MedAE collected for each model (lower is better).
prediction_dict

{'Baseline': 0.9555663913173742,
 'LightGBM': 0.6452822745961022,
 'Random Forest': 0.645478039694277,
 'Decision Tree': 0.7000000000000002}

### Preparing submission file

In [8]:
# Submission export -- left commented out, so running the notebook does not
# write any file. Uncomment the lines below to produce a timestamped CSV.
# NOTE(review): the output column was renamed from 'emission' to 'Hardness' to
# match the target used in this notebook (`y = df['Hardness']`); confirm the
# expected header against the competition's sample submission.
# NOTE(review): `pipeline_light` is hard-coded; `best_pipeline` would follow
# whichever model wins the comparison.
# NOTE(review): the "submissions/" directory has to exist before to_csv runs.

# y_test_pred = pipeline_light.predict(X_test)

# output = pd.DataFrame({'id': X_test.index,
#                        'Hardness': y_test_pred})

# from datetime import date, datetime
# now = datetime.now()
# today_str = now.strftime("%d%m%Y_%H%M")

# subm_file_name = "submissions/submission_"+today_str+".csv"

# output.to_csv(subm_file_name, index=False)