In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import pandas_profiling as pp

# models
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV, LogisticRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, VotingRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
import sklearn.model_selection
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# model tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")


In [38]:
valid_part = 0.3
pd.set_option('max_columns',100)

In [39]:
train0 = pd.read_csv('vehicles.csv')
train0.head()

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,vin,drive,size,type,paint_color,image_url,description,county,state,lat,long
0,7119256118,https://mohave.craigslist.org/ctd/d/lake-havas...,mohave county,https://mohave.craigslist.org,3495,2012.0,jeep,patriot,like new,4 cylinders,gas,,clean,automatic,,,,,silver,https://images.craigslist.org/00B0B_k2AXIJ21ok...,"THIS 2012 JEEP PATRIOT IS A 4CYL. AC, STEREO, ...",,az,34.4554,-114.269
1,7120880186,https://oregoncoast.craigslist.org/cto/d/warre...,oregon coast,https://oregoncoast.craigslist.org,13750,2014.0,bmw,328i m-sport,good,,gas,76237.0,clean,automatic,,rwd,,sedan,grey,https://images.craigslist.org/00U0U_3cLk0WGOJ8...,Selling my 2014 BMW 328i with the following be...,,or,46.1837,-123.824
2,7115048251,https://greenville.craigslist.org/cto/d/sparta...,greenville / upstate,https://greenville.craigslist.org,2300,2001.0,dodge,caravan,excellent,6 cylinders,gas,199000.0,clean,automatic,,,,,,https://images.craigslist.org/00k0k_t4WqYn5nDC...,"01 DODGE CARAVAN,3.3 ENGINE,AUT TRANS,199000 M...",,sc,34.9352,-81.9654
3,7119250502,https://mohave.craigslist.org/cto/d/lake-havas...,mohave county,https://mohave.craigslist.org,9000,2004.0,chevrolet,colorado ls,excellent,5 cylinders,gas,54000.0,clean,automatic,1GCCS196448191644,rwd,mid-size,pickup,red,https://images.craigslist.org/00J0J_lJEzfeVLHI...,"2004 Chevy Colorado LS, ONLY 54000 ORIGINAL MI...",,az,34.4783,-114.271
4,7120433904,https://maine.craigslist.org/ctd/d/searsport-t...,maine,https://maine.craigslist.org,0,2021.0,,Honda-Nissan-Kia-Ford-Hyundai-VW,,,other,,clean,other,,,,,,https://images.craigslist.org/01010_j0IW34mCsm...,CALL: 207.548.6500 TEXT: 207.407.5598 **WE FI...,,me,44.4699,-68.8963


In [40]:
drop_columns = ['id','url', 'region', 'region_url', 'model', 'title_status', 'vin', 'size', 'image_url', 'description', 'lat','long','county']
train0 = train0.drop(columns = drop_columns)

In [41]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435849 entries, 0 to 435848
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         435849 non-null  int64  
 1   year          434732 non-null  float64
 2   manufacturer  415102 non-null  object 
 3   condition     249043 non-null  object 
 4   cylinders     269465 non-null  object 
 5   fuel          432858 non-null  object 
 6   odometer      360701 non-null  float64
 7   transmission  433703 non-null  object 
 8   drive         313838 non-null  object 
 9   type          318741 non-null  object 
 10  paint_color   300602 non-null  object 
 11  state         435849 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 39.9+ MB


In [42]:
train0 = train0.dropna()
train0.head()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
3,9000,2004.0,chevrolet,excellent,5 cylinders,gas,54000.0,automatic,rwd,pickup,red,az
7,8500,2005.0,ford,excellent,6 cylinders,gas,62800.0,automatic,rwd,convertible,silver,me
12,2750,2006.0,chevrolet,good,8 cylinders,gas,165000.0,automatic,4wd,truck,white,me
19,24930,2017.0,subaru,excellent,4 cylinders,gas,32989.0,automatic,4wd,SUV,grey,mt
26,3200,1998.0,volkswagen,good,4 cylinders,gas,98186.0,manual,rwd,coupe,blue,mt


In [43]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = []
features = train0.columns.values.tolist()
for col in features:
    if train0[col].dtype in numerics: continue
    categorical_columns.append(col)
# Encoding categorical features
for col in categorical_columns:
    if col in train0.columns:
        le = LabelEncoder()
        le.fit(list(train0[col].astype(str).values))
        train0[col] = le.transform(list(train0[col].astype(str).values))



In [44]:
train0['year'] = (train0['year']-1900).astype(int)
train0['odometer'] = train0['odometer'].astype(int)

In [45]:
train0.head()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
3,9000,104,7,0,4,2,54000,0,2,8,8,3
7,8500,105,13,0,5,2,62800,0,2,2,9,21
12,2750,106,7,2,6,2,165000,0,0,10,10,21
19,24930,117,37,0,3,2,32989,0,0,0,5,26
26,3200,98,40,2,3,2,98186,1,2,3,1,26


In [46]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124793 entries, 3 to 435846
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   price         124793 non-null  int64
 1   year          124793 non-null  int32
 2   manufacturer  124793 non-null  int64
 3   condition     124793 non-null  int64
 4   cylinders     124793 non-null  int64
 5   fuel          124793 non-null  int64
 6   odometer      124793 non-null  int32
 7   transmission  124793 non-null  int64
 8   drive         124793 non-null  int64
 9   type          124793 non-null  int64
 10  paint_color   124793 non-null  int64
 11  state         124793 non-null  int64
dtypes: int32(2), int64(10)
memory usage: 11.4 MB


In [47]:
train0['price'].value_counts()

0        4552
4500     1403
5995     1358
3500     1322
6995     1316
         ... 
5472        1
19755       1
7457        1
7393        1
19734       1
Name: price, Length: 5547, dtype: int64

In [48]:
train0 = train0[train0['price'] > 1000]
train0 = train0[train0['price'] < 40000]
# Rounded ['odometer'] to 5000
train0['odometer'] = train0['odometer'] // 5000
train0 = train0[train0['year'] > 110]

In [49]:
train0.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60870 entries, 19 to 435840
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   price         60870 non-null  int64
 1   year          60870 non-null  int32
 2   manufacturer  60870 non-null  int64
 3   condition     60870 non-null  int64
 4   cylinders     60870 non-null  int64
 5   fuel          60870 non-null  int64
 6   odometer      60870 non-null  int32
 7   transmission  60870 non-null  int64
 8   drive         60870 non-null  int64
 9   type          60870 non-null  int64
 10  paint_color   60870 non-null  int64
 11  state         60870 non-null  int64
dtypes: int32(2), int64(10)
memory usage: 5.6 MB


In [50]:
train0.corr()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
price,1.0,0.451994,-0.028563,0.11513,0.500969,-0.181776,-0.242929,0.182608,-0.245597,0.034775,0.042705,0.004828
year,0.451994,1.0,0.039877,0.133841,-0.096874,0.065522,-0.362668,0.093475,-0.020978,0.012367,0.042515,-0.014672
manufacturer,-0.028563,0.039877,1.0,-0.00877,-0.200279,-0.056015,-0.030149,0.042601,-0.099195,0.050371,-0.018589,-0.006306
condition,0.11513,0.133841,-0.00877,1.0,0.053672,0.030514,-0.071651,0.14354,0.033947,0.027081,0.002252,-0.000423
cylinders,0.500969,-0.096874,-0.200279,0.053672,1.0,-0.098038,0.071667,0.102699,-0.141777,0.072877,0.042781,0.021634
fuel,-0.181776,0.065522,-0.056015,0.030514,-0.098038,1.0,-0.10445,0.08776,0.081593,-0.119522,-0.049588,-0.03295
odometer,-0.242929,-0.362668,-0.030149,-0.071651,0.071667,-0.10445,1.0,-0.114799,-0.031043,0.034483,0.020085,0.014536
transmission,0.182608,0.093475,0.042601,0.14354,0.102699,0.08776,-0.114799,1.0,0.045928,0.002393,-0.019678,-0.024077
drive,-0.245597,-0.020978,-0.099195,0.033947,-0.141777,0.081593,-0.031043,0.045928,1.0,0.124981,0.075445,-0.087186
type,0.034775,0.012367,0.050371,0.027081,0.072877,-0.119522,0.034483,0.002393,0.124981,1.0,0.087766,-0.002102


In [51]:
train0.describe()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
count,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0
mean,15983.185313,114.283177,18.993264,1.152012,4.398899,1.927567,16.134483,0.159619,0.720831,6.113603,5.709479,24.10023
std,8397.447147,2.304149,11.495406,1.234212,1.276746,0.530551,15.60543,0.502718,0.731753,4.154347,4.06267,15.019661
min,1061.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9000.0,112.0,10.0,0.0,3.0,2.0,8.0,0.0,0.0,0.0,1.0,9.0
50%,14250.0,114.0,14.0,0.0,5.0,2.0,15.0,0.0,1.0,8.0,8.0,23.0
75%,21395.0,116.0,31.0,2.0,5.0,2.0,22.0,0.0,1.0,9.0,10.0,37.0
max,39999.0,121.0,41.0,5.0,7.0,4.0,1629.0,2.0,2.0,12.0,11.0,50.0


In [52]:
#pp.ProfileReport(train0)

In [53]:
target_name = 'price'
train_target0 = train0[target_name]
train0 = train0.drop([target_name], axis=1)

In [54]:
#train0, test0, train_target0, test_target0 = train_test_split(train0, train_target0, test_size=0.2, random_state=0)

In [55]:
# For boosting model
#train0b = train0
#train_target0b = train_target0
# Synthesis valid as test for selection models
#trainb, testb, targetb, target_testb = train_test_split(train0b, train_target0b, test_size=valid_part, random_state=0)

In [56]:
train0.describe()

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
count,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0,60870.0
mean,114.283177,18.993264,1.152012,4.398899,1.927567,16.134483,0.159619,0.720831,6.113603,5.709479,24.10023
std,2.304149,11.495406,1.234212,1.276746,0.530551,15.60543,0.502718,0.731753,4.154347,4.06267,15.019661
min,111.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,112.0,10.0,0.0,3.0,2.0,8.0,0.0,0.0,0.0,1.0,9.0
50%,114.0,14.0,0.0,5.0,2.0,15.0,0.0,1.0,8.0,8.0,23.0
75%,116.0,31.0,2.0,5.0,2.0,22.0,0.0,1.0,9.0,10.0,37.0
max,121.0,41.0,5.0,7.0,4.0,1629.0,2.0,2.0,12.0,11.0,50.0


In [57]:
scaler = StandardScaler()
train0 = pd.DataFrame(scaler.fit_transform(train0), columns = train0.columns)

In [58]:
import pickle
pickle.dump(scaler,open('preprocess_scale','wb'))

In [59]:
train0.head()

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
0,1.17911,1.566442,-0.933407,-1.095685,0.136525,-0.649426,-0.317514,-0.985082,-1.471628,-0.174635,0.126487
1,-0.556903,1.04449,-0.933407,-1.095685,0.136525,-0.457183,-0.317514,-0.985082,-1.471628,0.809947,0.126487
2,-0.990906,-1.652262,0.687074,0.470811,0.136525,0.63219,-0.317514,-0.985082,-1.471628,-1.405363,0.858866
3,-0.556903,-0.521366,0.687074,0.470811,0.136525,0.568109,-0.317514,-0.985082,0.935509,-0.174635,0.858866
4,-0.990906,1.305466,0.687074,1.254058,0.136525,0.247705,-0.317514,-0.985082,0.935509,-1.405363,0.858866


In [60]:
train, test, target, target_test = train_test_split(train0, train_target0, test_size=valid_part, random_state=0)

In [61]:
train.head()

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
31884,1.17911,-1.043318,0.687074,1.254058,0.136525,0.247705,-0.317514,-0.985082,0.213368,1.056093,-1.338272
10469,-0.990906,-1.13031,0.687074,1.254058,0.136525,0.888513,-0.317514,1.748102,-1.471628,-1.405363,-1.005372
34611,-0.1229,-1.043318,-0.933407,1.254058,0.136525,1.144836,-0.317514,1.748102,0.454082,1.056093,-1.338272
55300,0.745107,-1.304294,-0.933407,-1.095685,0.136525,-0.777587,-0.317514,-0.985082,0.694795,0.563802,-0.539313
14261,1.613113,0.522538,-0.933407,-1.095685,0.136525,-0.585345,-0.317514,-0.985082,-0.508773,-1.405363,-0.739053


In [62]:
test.head()

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
15504,2.481119,-0.173398,1.497314,-1.095685,0.136525,-1.03391,-0.317514,0.38151,0.694795,1.056093,0.725706
50278,-0.1229,-1.217302,1.497314,0.470811,0.136525,-0.072699,-0.317514,0.38151,-1.471628,0.809947,-1.005372
57733,-1.424909,-1.304294,1.497314,0.470811,0.136525,0.824432,3.660892,-0.985082,0.694795,-1.405363,1.391506
12605,-0.1229,-0.521366,0.687074,1.254058,0.136525,1.144836,-0.317514,-0.985082,-1.471628,1.056093,-0.539313
7518,1.17911,1.566442,-0.933407,-1.095685,0.136525,-0.777587,-0.317514,-0.985082,1.416937,-1.159217,-0.339573


In [63]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42609 entries, 31884 to 2732
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          42609 non-null  float64
 1   manufacturer  42609 non-null  float64
 2   condition     42609 non-null  float64
 3   cylinders     42609 non-null  float64
 4   fuel          42609 non-null  float64
 5   odometer      42609 non-null  float64
 6   transmission  42609 non-null  float64
 7   drive         42609 non-null  float64
 8   type          42609 non-null  float64
 9   paint_color   42609 non-null  float64
 10  state         42609 non-null  float64
dtypes: float64(11)
memory usage: 3.9 MB


In [64]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18261 entries, 15504 to 20226
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          18261 non-null  float64
 1   manufacturer  18261 non-null  float64
 2   condition     18261 non-null  float64
 3   cylinders     18261 non-null  float64
 4   fuel          18261 non-null  float64
 5   odometer      18261 non-null  float64
 6   transmission  18261 non-null  float64
 7   drive         18261 non-null  float64
 8   type          18261 non-null  float64
 9   paint_color   18261 non-null  float64
 10  state         18261 non-null  float64
dtypes: float64(11)
memory usage: 1.7 MB


In [65]:
def acc_model(num,model,train,test):
    # Calculation of accuracy of model акщь Sklearn by different metrics   
  
    global acc_train_r2, acc_test_r2, acc_train_d, acc_test_d, acc_train_rmse, acc_test_rmse
    
    ytrain = model.predict(train)  
    ytest = model.predict(test)

    print('target = ', target[:5].values)
    print('ytrain = ', ytrain[:5])

    acc_train_r2_num = round(r2_score(target, ytrain) * 100, 2)
    print('acc(r2_score) for train =', acc_train_r2_num)   

    acc_train_d_num = round(acc_d(target, ytrain) * 100, 2)
    print('acc(relative error) for train =', acc_train_d_num)   

    acc_train_rmse_num = round(acc_rmse(target, ytrain) * 100, 2)
    print('acc(rmse) for train =', acc_train_rmse_num)   

    print('target_test =', target_test[:5].values)
    print('ytest =', ytest[:5])
    
    acc_test_r2_num = round(r2_score(target_test, ytest) * 100, 2)
    print('acc(r2_score) for test =', acc_test_r2_num)
    
    acc_test_d_num = round(acc_d(target_test, ytest) * 100, 2)
    print('acc(relative error) for test =', acc_test_d_num)
    
    acc_test_rmse_num = round(acc_rmse(target_test, ytest) * 100, 2)
    print('acc(rmse) for test =', acc_test_rmse_num)
def acc_d(y_meas, y_pred):
    # Relative error between predicted y_pred and measured y_meas values
    return mean_absolute_error(y_meas, y_pred)*len(y_meas)/sum(abs(y_meas))

def acc_rmse(y_meas, y_pred):
    # RMSE between predicted y_pred and measured y_meas values
    return (mean_squared_error(y_meas, y_pred))**0.5

In [66]:
linear_svr = LinearSVR()
linear_svr.fit(train, target)
acc_model(2,linear_svr,train,test)

target =  [27900 15995 12500 19900 18000]
ytrain =  [21182.66829678 10665.06169264 12344.25643979 14112.71788487
 15922.63044157]
acc(r2_score) for train = 46.4
acc(relative error) for train = 27.06
acc(rmse) for train = 615790.61
target_test = [18499 16995  9830 10295 21495]
ytest = [17667.70271832 14306.64451494 12919.37265473 15923.22992929
 15275.25506238]
acc(r2_score) for test = 40.61
acc(relative error) for test = 26.91
acc(rmse) for test = 644708.1


In [67]:
pickle.dump(linear_svr,open('linear_svr','wb'))