# Standard imports

In [49]:
# Regular EDA and plotting libraries
import numpy as np # np is short for numpy
import pandas as pd # pandas is so commonly used, it's shortened to pd
import matplotlib.pyplot as plt
import seaborn as sns # seaborn gets shortened to sns

# We want our plots to appear in the notebook
%matplotlib inline 

#Models
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

In [50]:
# Load the training split and discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved -- confirm).
df = pd.read_csv("data/df_train_time_series.csv").drop(columns="Unnamed: 0")
df

Unnamed: 0,Marka,Arac Tip Grubu,Arac Tip,Model Yıl,Yakıt Turu,Vites,CCM,Beygir Gucu,Renk,Kasa Tipi,...,Arac Tip Grubu_is_missing,Arac Tip_is_missing,Yakıt Turu_is_missing,Vites_is_missing,CCM_is_missing,Beygir Gucu_is_missing,Renk_is_missing,Kasa Tipi_is_missing,Kimden_is_missing,Durum_is_missing
0,14,308,922,2005.0,3,2,1801.0,100.0,8,3,...,False,False,False,False,False,False,False,False,False,False
1,33,233,751,2011.0,3,1,1301.0,100.0,4,6,...,False,False,False,False,False,False,False,False,False,False
2,19,321,604,2012.0,1,1,1801.0,176.0,4,8,...,False,False,False,False,False,False,False,False,False,False
3,14,206,276,2019.0,1,2,1301.0,76.0,4,6,...,False,False,False,False,False,False,False,False,False,False
4,14,196,377,2011.0,3,1,1301.0,76.0,20,6,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5945,24,72,889,2014.0,1,2,1301.0,101.0,4,6,...,False,False,False,False,False,False,False,False,False,False
5946,27,279,460,2013.0,3,1,1301.0,126.0,4,4,...,False,False,False,False,False,False,False,False,False,False
5947,27,279,460,2014.0,3,1,1301.0,101.0,20,4,...,False,False,False,False,False,False,False,False,False,False
5948,28,148,101,2011.0,1,2,1300.0,76.0,10,6,...,False,False,False,False,False,False,False,False,False,False


In [51]:
# Load the test split and discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved -- confirm).
df_test = pd.read_csv("data/df_test_time_series.csv").drop(columns="Unnamed: 0")
df_test

Unnamed: 0,Marka,Arac Tip Grubu,Arac Tip,Model Yıl,Yakıt Turu,Vites,CCM,Beygir Gucu,Renk,Kasa Tipi,...,Arac Tip Grubu_is_missing,Arac Tip_is_missing,Yakıt Turu_is_missing,Vites_is_missing,CCM_is_missing,Beygir Gucu_is_missing,Renk_is_missing,Kasa Tipi_is_missing,Kimden_is_missing,Durum_is_missing
0,22,179,436,2008.0,2,1,1301.0,101.0,22,10,...,False,False,False,False,False,False,False,False,False,False
1,23,55,59,2008.0,3,1,1601.0,100.0,4,10,...,False,False,False,False,False,False,False,False,False,False
2,23,216,510,1993.0,2,1,1801.0,76.0,9,10,...,False,False,False,False,False,False,False,False,False,False
3,23,130,406,2010.0,2,1,1301.0,176.0,17,10,...,False,False,False,False,False,False,False,False,False,False
4,24,31,488,1992.0,2,1,1801.0,76.0,4,10,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,23,55,272,1993.0,2,1,1301.0,76.0,4,10,...,False,False,False,False,False,False,False,False,False,False
1496,23,55,52,2011.0,3,1,1301.0,76.0,9,6,...,False,False,False,False,False,False,False,False,False,False
1497,23,55,54,2014.0,3,1,1301.0,100.0,13,10,...,False,False,False,False,False,False,False,False,False,False
1498,9,112,418,2014.0,3,1,1301.0,76.0,4,10,...,False,False,False,False,False,False,False,False,False,False


In [52]:
# Inspect the training set transposed, so each column is shown as a row
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5940,5941,5942,5943,5944,5945,5946,5947,5948,5949
Marka,14,33,19,14,14,14,29,19,33,1,...,28,28,28,24,24,24,27,27,28,29
Arac Tip Grubu,308,233,321,206,196,206,43,318,210,119,...,148,92,92,120,163,72,279,279,148,25
Arac Tip,922,751,604,276,377,0,500,1024,330,1405,...,127,558,124,1046,1420,889,460,460,101,262
Model Yıl,2005.0,2011.0,2012.0,2019.0,2011.0,2018.0,2015.0,2006.0,2013.0,2004.0,...,2012.0,2006.0,2008.0,2013.0,1993.0,2014.0,2013.0,2014.0,2011.0,2006.0
Yakıt Turu,3,3,1,1,3,3,3,3,1,2,...,3,2,3,1,2,1,3,3,1,3
Vites,2,1,1,2,1,1,2,2,2,1,...,1,1,1,2,1,2,1,1,2,1
CCM,1801.0,1301.0,1801.0,1301.0,1301.0,1301.0,1301.0,2001.0,1301.0,1301.0,...,1301.0,1301.0,1300.0,1301.0,2001.0,1301.0,1301.0,1301.0,1300.0,1301.0
Beygir Gucu,100.0,100.0,176.0,76.0,76.0,100.0,101.0,101.0,176.0,100.0,...,76.0,101.0,76.0,151.0,151.0,101.0,126.0,101.0,76.0,51.0
Renk,8,4,4,4,20,4,4,20,15,8,...,4,4,17,4,17,4,4,20,10,4
Kasa Tipi,3,6,8,6,6,6,3,1,5,10,...,6,6,10,10,10,6,4,4,6,10


In [53]:
# Inspect the test set transposed, so each column is shown as a row
df_test.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1490,1491,1492,1493,1494,1495,1496,1497,1498,1499
Marka,22,23,23,23,24,23,24,24,23,23,...,23,9,9,9,23,23,23,23,9,8
Arac Tip Grubu,179,55,216,130,31,216,15,27,55,55,...,55,110,112,63,94,55,55,55,112,174
Arac Tip,436,59,510,406,488,485,160,450,177,299,...,334,47,318,416,24,272,52,54,418,75
Model Yıl,2008.0,2008.0,1993.0,2010.0,1992.0,2006.0,2003.0,2002.0,2015.0,2018.0,...,2005.0,2015.0,2011.0,2013.0,2016.0,1993.0,2011.0,2014.0,2014.0,2009.0
Yakıt Turu,2,3,2,2,2,3,3,1,2,3,...,2,1,1,3,1,2,3,3,3,3
Vites,1,1,1,1,1,2,1,2,1,2,...,1,1,1,1,1,1,1,1,1,1
CCM,1301.0,1601.0,1801.0,1301.0,1801.0,1801.0,1301.0,1301.0,1301.0,1301.0,...,1301.0,1300.0,1601.0,1301.0,1300.0,1301.0,1301.0,1301.0,1301.0,1300.0
Beygir Gucu,101.0,100.0,76.0,176.0,76.0,126.0,76.0,100.0,100.0,126.0,...,101.0,76.0,101.0,101.0,100.0,76.0,76.0,100.0,76.0,76.0
Renk,22,4,9,17,4,22,9,22,9,4,...,26,4,4,4,4,4,9,13,4,5
Kasa Tipi,10,10,10,10,10,10,12,6,10,10,...,6,6,10,7,6,10,6,10,10,6


In [54]:
# Separate the training target ("Fiyat") from the remaining feature columns
y_train = df["Fiyat"]
X_train = df.drop(columns="Fiyat")

In [55]:
# Separate the test target ("Fiyat") from the remaining feature columns
y_test = df_test["Fiyat"]
X_test = df_test.drop(columns="Fiyat")

In [56]:
# Evaluation metric: root mean squared log error (RMSLE)
def rmsle(y_test, y_preds):
    """Return the root mean squared log error between targets and predictions.

    Args:
        y_test: True target values.
        y_preds: Predicted values, in the same order as ``y_test``.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_preds)``.
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Helper that scores a fitted model on both data splits
def show_scores(model):
    """Score a fitted model on the notebook's training and test splits.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` rather than taking them as arguments, so those must be
    defined before this function is called.

    Args:
        model: A fitted estimator exposing ``predict(X)`` and ``score(X, y)``.

    Returns:
        dict mapping metric labels to values: MAE, RMSLE and the result of
        ``model.score`` (labelled "R^2") for each split. The "Valid ..."
        entries are computed on ``X_test`` / ``y_test``.
    """
    preds_on_train = model.predict(X_train)
    preds_on_test = model.predict(X_test)

    scores = {}
    scores["Training MAE"] = mean_absolute_error(y_train, preds_on_train)
    scores["Valid MAE"] = mean_absolute_error(y_test, preds_on_test)
    scores["Training RMSLE"] = rmsle(y_train, preds_on_train)
    scores["Valid RMSLE"] = rmsle(y_test, preds_on_test)
    # model.score runs its own prediction pass on each split
    scores["Training R^2"] = model.score(X_train, y_train)
    scores["Valid R^2"] = model.score(X_test, y_test)
    return scores

In [57]:
# Baseline: an untuned RandomForestRegressor for a quick first look before
# hyperparameter tuning. random_state is pinned so the fitted forest (and the
# scores reported below) are reproducible across kernel restarts.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [58]:
%%time
# Fit the model and calculate how much time it is going to spend
model.fit(X_train, y_train)

CPU times: user 4.73 s, sys: 48.6 ms, total: 4.77 s
Wall time: 1.28 s


In [59]:
# Report MAE, RMSLE and R^2 for the baseline model on the training and test splits
show_scores(model)

{'Training MAE': 8894.8447707483,
 'Valid MAE': 40385.402589841266,
 'Training RMSLE': 0.09867560591908578,
 'Valid RMSLE': 0.38919270084525265,
 'Training R^2': 0.986345911552408,
 'Valid R^2': 0.6210336456845311}

# Everything below this point is still to be updated


>