In [1]:

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from ml_classes import *


In [4]:
# Columns taken out of the feature list used for the numeric design matrix
# (X_num): the nine raw categorical columns plus one dummy level for each of
# those features.
# NOTE(review): the dropped dummy levels look like reference categories chosen
# to avoid perfect collinearity -- confirm.
int_remove_cols = [
    "carmaker",
    "transmission",
    "drive",
    "car_style",
    "emission_class",
    "door_count",
    "color",
    "carmaker_volkswagen",
    "transmission_manual",
    "drive_4x2",
    "car_style_sedan",
    "door_count_4_5",
    "color_s",
    "country_origin_Germany",
    "emission_class_old",
    "fuel_type",
    "fuel_type_petrol",
    "country_origin",
]

# Columns taken out of the feature list used for the categorical design matrix
# (X_cat): every dummy-encoded level, listed feature by feature.
cat_remove_cols = [
    # country_origin_*
    "country_origin_Austria",
    "country_origin_Belgium",
    "country_origin_Czech Republic",
    "country_origin_France",
    "country_origin_Germany",
    "country_origin_Italy",
    "country_origin_Luxembourg",
    "country_origin_Netherlands",
    "country_origin_Poland",
    "country_origin_Romania",
    "country_origin_Slovakia",
    "country_origin_Spain",
    "country_origin_Sweden",
    # color_*
    "color_n",
    "color_s",
    # door_count_*
    "door_count_2_3",
    "door_count_4_5",
    # emission_class_*
    "emission_class_new",
    "emission_class_old",
    # car_style_*
    "car_style_large",
    "car_style_offroad",
    "car_style_sedan",
    "car_style_sport",
    "car_style_station_wagon",
    # drive_*
    "drive_4x2",
    "drive_4x4",
    # transmission_*
    "transmission_automatic",
    "transmission_manual",
    # fuel_type_*
    "fuel_type_diesel",
    "fuel_type_petrol",
    # carmaker_*
    "carmaker_audi",
    "carmaker_bmw",
    "carmaker_citroen",
    "carmaker_dacia",
    "carmaker_fiat",
    "carmaker_ford",
    "carmaker_hyundai",
    "carmaker_jeep",
    "carmaker_kia",
    "carmaker_land_rover",
    "carmaker_mazda",
    "carmaker_mercedes_benz",
    "carmaker_mini",
    "carmaker_nissan",
    "carmaker_opel",
    "carmaker_other_brand",
    "carmaker_peugeot",
    "carmaker_porsche",
    "carmaker_renault",
    "carmaker_seat",
    "carmaker_skoda",
    "carmaker_toyota",
    "carmaker_volkswagen",
    "carmaker_volvo",
]

# Continuous columns that are standardised with the scaler before estimation.
int_cont_cols = [
    "power",
    "cubic_capacity",
    "fuel_consumption_combined",
    "co2_emission",
    "vehicle_age",
]

In [5]:
# Empty comparison table: a method label followed by the MSE / MAD / R2
# columns for the train and test sets.
df_comparison = pd.DataFrame(
    columns=[
        "method",
        "MSE_train",
        "MAD_train",
        "R2_train",
        "MSE_test",
        "MAD_test",
        "R2_test",
    ]
)

In [13]:
# Load the prediction dataset from the project's data directory.
data = pd.read_csv("../data/data_pred.csv")

# Scaler instance used further down to standardise the continuous features.
scaler = StandardScaler()

# Preview the first rows of the loaded frame.
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])


In [None]:
# Prepare the feature matrices and the target for estimation.
columns = data.columns
x_columns = list(data.columns)
x_columns.remove("price")  # "price" is the target, not a feature
x_columns_int = x_columns.copy()
x_columns_cat = x_columns.copy()

# Cast boolean columns to 0/1 integers (this modifies `data` itself).
for column in data.select_dtypes(include=['bool']):
    data[column] = data[column].astype(int)

# Build the two feature lists. list.remove raises ValueError if a listed
# column is not present, so a mismatch with the CSV schema fails loudly here.
for col in int_remove_cols:
    x_columns_int.remove(col)
for col in cat_remove_cols:
    x_columns_cat.remove(col)

# Take an explicit copy so the scaled values are written to an independent
# frame; assigning into the bare `data[x_columns]` selection triggers pandas'
# SettingWithCopyWarning.
X = data[x_columns].copy()

# Scale continuous variables.
# NOTE(review): the scaler is fitted on all rows here, while the train/test
# split appears to happen later inside PredictionSummary.load_data -- that
# would leak test-set statistics into training; confirm and consider fitting
# on the training rows only.
X[int_cont_cols] = scaler.fit_transform(X[int_cont_cols])

# Target variable - price
y = data["price"]

# Data for OLS, K-Neighbors, ...
X_num = X[x_columns_int]

# Data for decision trees etc.
X_cat = X[x_columns_cat]

In [10]:
# Container used for storing the results of several different ML algorithms
# (PredictionSummary is presumably provided by the `ml_classes` star import --
# confirm).
predict_num = PredictionSummary()
# Hand over the numeric feature matrix and the target; per the original note,
# the train/test data sets are created inside the class.
predict_num.load_data(X=X_num, y=y)

## OLS

In [8]:
# Search grid for OLS: a single candidate, so the "search" fits one model.
param_grid = {
    'fit_intercept': [True],  # List of possible values for fit_intercept
}
grid_search = predict_num.find_best_model(method="ols", param_grid=param_grid)
# NOTE(review): the parameters passed here are hard-coded rather than read
# from the search result; presumably estimate_test evaluates them on the
# held-out test set and records a row in summary_df -- confirm in ml_classes.
predict_num.estimate_test(grid_search, {'fit_intercept': True})
# Last expression of the cell: display the results table.
predict_num.summary_df

{'fit_intercept': True}
{'method': 'ols', 'MSE_train': -110835187849.13313, 'MSE_valid': -110950063750.01665, 'MAD_train': -167494.2588491664, 'MAD_valid': -167549.0298145045, 'R2_valid': 0.7299018414232207, 'R2_train': 0.7301729042022069, 'params': '{"fit_intercept": true}', 'iteration': 1, 'cv_n': 3, 'test_size': 0.09999999999999998}
Empty DataFrame
Columns: [method, MSE_train, MAD_train, R2_train, MSE_valid, MAD_valid, R2_valid, MSE_test, MAD_test, R2_test, params, test_size, cv_n, iteration]
Index: []


  self.summary_df = pd.concat([self.summary_df, pd.DataFrame(stat_row, index=[self.df_index])])


Unnamed: 0,method,MSE_train,MAD_train,R2_train,MSE_valid,MAD_valid,R2_valid,MSE_test,MAD_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110835200000.0,-167494.258849,0.730173,-110950100000.0,-167549.029815,0.729902,114249315327.1947,168154.31387,0.729155,"{""fit_intercept"": true}",0.1,3,1


## K-nearest Neighbors

In [9]:
# Search grid for the K-nearest-neighbours regressor.
param_grid = {
    'n_neighbors': [3, 7, 11],  # List of possible values for n_neighbors
}
grid_search_knreg = predict_num.find_best_model(
    method="knreg", param_grid=param_grid, n_jobs=5
)
# NOTE(review): the parameters passed here are hard-coded rather than read
# from the search result; if the grid changes, this dict must be updated by
# hand -- confirm whether the search object exposes the selected parameters.
predict_num.estimate_test(grid_search_knreg, {"n_neighbors": 3})
# Last expression of the cell: display the results table. (A bare
# `predict_num.summary_df` in the middle of a cell is not displayed, so the
# table is shown only once, after the test-set estimation.)
predict_num.summary_df

{'n_neighbors': 3}
{'method': 'knreg', 'MSE_train': -18746012575.120937, 'MSE_valid': -39978290516.27935, 'MAD_train': -67738.78998082389, 'MAD_valid': -99507.14618756752, 'R2_valid': 0.9026563348313258, 'R2_train': 0.9543592446580654, 'params': '{"n_neighbors": 3}', 'iteration': 1, 'cv_n': 3, 'test_size': 0.09999999999999998}
  method     MSE_train      MAD_train  R2_train     MSE_valid      MAD_valid  \
0    ols -1.108352e+11 -167494.258849  0.730173 -1.109501e+11 -167549.029815   

   R2_valid             MSE_test      MAD_test   R2_test  \
0  0.729902  114249315327.194702  168154.31387  0.729155   

                    params  test_size cv_n iteration  
0  {"fit_intercept": true}        0.1    3         1  


Unnamed: 0,method,MSE_train,MAD_train,R2_train,MSE_valid,MAD_valid,R2_valid,MSE_test,MAD_test,R2_test,params,test_size,cv_n,iteration
0,ols,-110835200000.0,-167494.258849,0.730173,-110950100000.0,-167549.029815,0.729902,114249315327.1947,168154.31387,0.729155,"{""fit_intercept"": true}",0.1,3,1
1,knreg,-18746010000.0,-67738.789981,0.954359,-39978290000.0,-99507.146188,0.902656,35755528274.0893,94426.014204,0.915236,"{""n_neighbors"": 3}",0.1,3,1


## Summary Results

In [12]:
# Display the accumulated results table.
# NOTE(review): the saved output of this cell is an empty table, and the cell
# that creates predict_num carries a later execution count (In [10]) than the
# model cells (In [8], In [9]) -- it looks like predict_num was re-created
# before this cell ran. Restart the kernel and run all cells in order to
# confirm the table is populated.
predict_num.summary_df

Unnamed: 0,method,MSE_train,MAD_train,R2_train,MSE_valid,MAD_valid,R2_valid,MSE_test,MAD_test,R2_test,params,test_size,cv_n,iteration
