In [2]:
import pandas as pd
import warnings
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import pickle

warnings.filterwarnings('ignore')

np.random.seed(123)

data = pd.read_csv('/Users/joelfoo/Documents/carouhack_car_price/data/sgcm-ads.csv')
data.columns

Index(['make model', 'rid', 'source_url', 'id', 'dl', 'make', 'model',
       'vehicle_type', 'car_registration_date', 'listing_price',
       'depreciation', 'category_1', 'engine_cap', 'transmission', 'mileage',
       'no_of_owners', 'coe', 'omv', 'arf', 'availability', 'posted_on',
       'last_updated_on', 'company_name', 'road_tax', 'category_2',
       'category_3'],
      dtype='object')

In [3]:
df = data[['make model', 'car_registration_date', 'coe', 'no_of_owners', 'mileage', 'arf', 'depreciation']]

In [4]:
# feature engineering

df['car_registration_date'] = pd.to_datetime(df['car_registration_date']) 
df['days_since_reg'] = (pd.to_datetime('today') - df['car_registration_date']).dt.days

df['mileage_per_year'] = df['mileage']/(df['days_since_reg']/365)
df['good_mileage'] = df['mileage_per_year']/1000 <= 15

In [56]:
# limit dataset to top 10 cars
top_10 = ['Honda Vezel 1.5A X',
          'Toyota Corolla Altis 1.6A',
          'Volkswagen Golf 1.4A TSI',
          'Toyota Wish 1.8A', 
          'BMW 5 Series 520i',
          'Mazda 5 2.0A Sunroof',
          'Volkswagen Jetta 1.4A TSI',
          'Volkswagen Scirocco 1.4A TSI',
          'Audi A4 1.8A TFSI MU',
          'Mercedes-Benz C-Class C180 Avantgarde'
         ]


df = df[(df['make model'].isin(top_10))].dropna()

model_dict = {}
for m in top_10:
    model_dict[m] = {"lower": None,
                     "upper": None,
                    "predicted": None}

55094.5 59177.0


In [68]:
state = np.random.seed(123)

for make_model in top_10:
    df_m = df[df['make model'] == make_model]
    
    mileage_processed = []
    for i in range(len(df_m['mileage'])):
        mileage = df_m['mileage'].iloc[i]
        if  mileage == 0:
            mileage_processed.append(df_m['days_since_reg'].iloc[i] / 365 * 15000)
        else:
            mileage_processed.append(mileage)

    df_m['mileage'] = mileage_processed
    
    train, test = train_test_split(df_m, test_size=0.3)

    coeff_list = ['coe', 'no_of_owners', 'arf', 'mileage', 'good_mileage', 'days_since_reg']

    x_train = train[coeff_list]
    y_train = train[['depreciation']]

    x_test = test[coeff_list]
    y_test= test[['depreciation']]
    
    #print(make_model)
    #print(y_train)
    
    # GradientBoost
    r0 = GradientBoostingRegressor(loss='quantile', alpha=0.95,
                                   n_estimators= 10,
                                   max_features = 'auto',
                                   random_state=state)
    r0.fit(x_train, y_train)
    #model_dict[make_model]["upper"] = pickle.dumps(r0)
    y_upper = r0.predict(x_test)

    r0.set_params(alpha=1-0.95)
    r0.fit(x_train,y_train)
    #model_dict[make_model]["lower"] = pickle.dumps(r0)
    y_lower = r0.predict(x_test)

    r0.set_params(loss='ls')
    r0.fit(x_train, y_train)
    #model_dict[make_model]["predicted"] = pickle.dumps(r0)
    y_pred0 = r0.predict(x_test)


    y_test['lower'] = y_lower
    y_test['pred'] = y_pred0
    y_test['upper'] = y_upper
    
    y_test

In [7]:
f = open('/Users/joelfoo/Documents/carouhack_car_price/data/model.pkl', 'wb')
pickle.dump(model_dict, f)
f.close()

In [8]:
model_dict = pd.read_pickle('/Users/joelfoo/Documents/carouhack_car_price/data/model.pkl')

In [10]:
x_train.iloc[0:1]

Unnamed: 0,coe,no_of_owners,arf,mileage,good_mileage,days_since_reg
3254,49802,1,44135,23280,False,476


In [45]:
x_new = pd.DataFrame(np.array([123,
                               5, 2, 3, True,
                                 5]).reshape(1,6), index=np.array(range(1, 2)), columns = coeff_list)

In [49]:
x_new = pd.DataFrame(np.array([123,
                                 2, 231, 15125, True,
                                 1123]).reshape(1, 6),
                                 index=np.array(range(1, 2)),
                                 columns=coeff_list)

In [50]:
x_new

Unnamed: 0,coe,no_of_owners,arf,mileage,good_mileage,days_since_reg
1,123,2,231,15125,1,1123


In [54]:
print(pickle.loads(model_dict["Toyota Wish 1.8A"]["lower"]).predict(x_new))
print(pickle.loads(model_dict["Toyota Wish 1.8A"]["predicted"]).predict(x_new))
print(pickle.loads(model_dict["Toyota Wish 1.8A"]["upper"]).predict(x_new))

[8483.52963603]
[8559.35888768]
[11262.21448834]


In [67]:
df_m = df[df['make model'] == 'Toyota Corolla Altis 1.6A']


    
#Mask 0 mileage with 15k * num years & coe with median

df_m

Unnamed: 0,make model,car_registration_date,coe,no_of_owners,mileage,arf,depreciation,days_since_reg,mileage_per_year,good_mileage
5050,Toyota Corolla Altis 1.6A,2017-05-31,51106,1,32500,19990,9280.0,547,21686.471664,False
5051,Toyota Corolla Altis 1.6A,2017-04-13,51765,1,36000,19990,8710.0,595,22084.033613,False
5052,Toyota Corolla Altis 1.6A,2017-03-13,48000,1,19800,17982,8190.0,626,11544.728435,True
5053,Toyota Corolla Altis 1.6A,2017-02-16,48401,1,10000,19990,9220.0,651,5606.758833,True
5054,Toyota Corolla Altis 1.6A,2017-02-06,50991,2,21000,16800,8310.0,661,11596.066566,True
5055,Toyota Corolla Altis 1.6A,2016-12-16,48000,1,39888,17982,7430.0,713,20419.523142,False
5056,Toyota Corolla Altis 1.6A,2016-12-16,48000,1,56000,19990,9040.0,713,28667.601683,False
5057,Toyota Corolla Altis 1.6A,2016-12-15,48000,1,20000,19990,9040.0,714,10224.089636,True
5058,Toyota Corolla Altis 1.6A,2016-12-13,52668,1,21000,19990,8330.0,716,10705.307263,True
5059,Toyota Corolla Altis 1.6A,2016-08-02,49000,2,17000,16800,7990.0,849,7308.598351,True


In [61]:
len(df_m['mileage'])

46