In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score

In [None]:
def load_data(train_path='train.csv', test_path='test.csv'):
    """Read the training and test sets from CSV files.

    Parameters
    ----------
    train_path : str or path-like, default 'train.csv'
        Location of the training CSV.
    test_path : str or path-like, default 'test.csv'
        Location of the test CSV.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` exactly as parsed by ``pd.read_csv``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    return train_df, test_df

In [None]:
# Load the raw training and test frames via load_data().
train_df, test_df = load_data()

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
#BRAND_SCORE
# Standardised mean price per brand, computed from the training frame.
# 'price' is selected *before* aggregating: averaging every column first
# raises TypeError on pandas >= 2.0 when non-numeric columns are present,
# and it also made the notebook run the same group-by twice.
brand_mean_price = train_df.groupby('brand')['price'].mean()
scaler = StandardScaler()
scaled_score = scaler.fit_transform(brand_mean_price.values.reshape(-1, 1)).ravel()
brand_score = pd.DataFrame()
brand_score['brand'] = brand_mean_price.index
brand_score['score'] = scaled_score
brand_score.head()

In [None]:
#MODEL_SCORE
# Standardised mean price per car model, computed from the training frame.
# 'price' is selected *before* aggregating: averaging every column first
# raises TypeError on pandas >= 2.0 when non-numeric columns are present,
# and it also made the notebook run the same group-by twice.
model_mean_price = train_df.groupby('model')['price'].mean()
scaler = StandardScaler()
scaled_score = scaler.fit_transform(model_mean_price.values.reshape(-1, 1)).ravel()
model_score = pd.DataFrame()
model_score['model'] = model_mean_price.index
model_score['score2'] = scaled_score
model_score.head()

In [None]:
#LOCATION_SCORE
# NOTE(review): this cell looks unfinished - it only instantiates a scaler
# and never builds a location score table.
scaler = StandardScaler()

# Data Cleaning (missing values, outliers)

In [None]:
def clean_data(df):
    """Return a cleaned copy of ``df`` with missing values filled in.

    Steps performed (the input frame is never modified):

    * ``mileage``: each value is converted with ``int()``; anything that
      cannot be converted becomes NaN, which is then replaced by the most
      frequent mileage.
    * ``desc``: missing values become the empty string.
    * ``transmission`` / ``color``: missing values are replaced by the most
      frequent value of the respective column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``mileage``, ``transmission``,
        ``color`` and ``desc``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    clean = df.copy()

    # CLEAN MILEAGE
    def fix_(x):
        # Only conversion failures are mapped to NaN; unrelated errors propagate.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return np.nan
    clean['mileage'] = clean['mileage'].apply(fix_)

    # FILL MISSING VALUES
    clean['desc'] = clean['desc'].fillna('')
    for column in ('transmission', 'color', 'mileage'):
        modes = clean[column].mode()
        # mode() returns a Series. Passing that Series to fillna() aligns on
        # the index and would fill row 0 at most, so use the scalar instead.
        # An all-NaN column has no mode; it is left untouched.
        if not modes.empty:
            clean[column] = clean[column].fillna(modes.iloc[0])
    return clean

In [None]:
# Apply the same cleaning routine to both the training and the test frame.
train_data = clean_data(train_df)
test_data = clean_data(test_df)

In [None]:
# Preview the cleaned training frame.
train_data.head()

# Feature Engineering

In [None]:
def extract_feature(df):
    """Derive model features from a cleaned frame and drop implausible rows.

    Added / rewritten columns:

    * ``timestamp``: converted with ``datetime.datetime.fromtimestamp``
      (local time zone of the machine running the notebook).
    * ``timestamp_year``: year of the converted timestamp.
    * ``age``: ``timestamp_year - year``.
    * ``length_desc``: length of ``str(desc)``.
    * ``mileage_per_year``: ``mileage / age``; non-finite results
      (division by an age of 0, missing mileage) become 0.
    * ``location``: every value other than 'กรุงเทพมหานคร' is replaced by
      'Other City'.

    Rows are removed when ``age`` is negative or when ``mileage`` falls
    outside the 1.5 * IQR fences. Rows with missing mileage are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``timestamp``, ``year``, ``mileage``,
        ``desc`` and ``location``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows and the derived columns.
    """
    feat = df.copy()

    # TIMESTAMP
    feat['timestamp'] = feat['timestamp'].apply(lambda x: datetime.datetime.fromtimestamp(x))
    # NEW TIMESTAMP_YEAR COLUMN
    feat['timestamp_year'] = feat['timestamp'].apply(lambda x: x.year)
    # CAR AGE COLUMN
    feat['age'] = feat['timestamp_year'] - feat['year']
    feat = feat[feat['age'] >= 0]

    # CLEAR MILEAGE OUTLIERS (1.5 * IQR rule)
    q1 = feat['mileage'].quantile(0.25)
    q3 = feat['mileage'].quantile(0.75)
    IQR = q3 - q1
    upper_limit = q3 + 1.5 * IQR
    lower_limit = q1 - 1.5 * IQR
    outliers = (feat['mileage'] > upper_limit) | (feat['mileage'] < lower_limit)
    # Copy after filtering so the column assignments below write to an
    # independent frame rather than to a slice of the previous one.
    feat = feat[~outliers].copy()

    # DESCRIPTION LENGTH
    feat['length_desc'] = feat['desc'].apply(lambda x: len(str(x)))

    # MILEAGE PER YEAR
    # An age of 0 yields +inf, -inf (negative mileage) or NaN (0 / 0).
    # All of them are mapped to 0, not just +inf.
    per_year = feat['mileage'] / feat['age']
    feat['mileage_per_year'] = per_year.replace([np.inf, -np.inf], np.nan).fillna(0)

    # LOCATION
    def sort_location(x):
        if x not in ['กรุงเทพมหานคร']:
            x = 'Other City'
        return x
    feat['location'] = feat['location'].apply(sort_location)

    return feat

In [None]:
# Build the engineered features for both frames.
# Note: extract_feature also drops rows (negative age, mileage outliers).
train_feature = extract_feature(train_data)
test_feature = extract_feature(test_data)

In [None]:
# Mileage distribution of the training rows that survived extract_feature.
train_feature['mileage'].hist()

In [None]:
# BRAND SCORE (BE AWARE OF DATA LEAKAGE !!)
# Inner join on the columns shared with brand_score: rows whose key has no
# entry in brand_score are dropped from the result.
train_feature = train_feature.merge(brand_score, how='inner')
test_feature = test_feature.merge(brand_score, how='inner')

In [None]:
# # MODEL SCORE (BE AWARE OF DATA LEAKAGE !!)
# train_feature = pd.merge(train_feature, model_score, how='inner')
# test_feature = pd.merge(test_feature, model_score, how='inner')

In [None]:
# NORMALIZE DATA
# The scaler learns its min/max from the training frame only and is then
# applied unchanged to the test frame.
scaled_cols = ['mileage','age','mileage_per_year']
scaler = MinMaxScaler().fit(train_feature[scaled_cols])
train_feature[scaled_cols] = scaler.transform(train_feature[scaled_cols])
test_feature[scaled_cols] = scaler.transform(test_feature[scaled_cols])

In [None]:
# Spot-check two rows after the brand-score merge and the scaling step.
train_feature.head(2)

In [None]:
def get_feature(df, categorical_data, numerical_data):
    """Select the requested columns and one-hot encode the categorical ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is left untouched.
    categorical_data : list of str
        Columns to expand with ``pd.get_dummies``.
    numerical_data : list of str
        Columns kept as they are; they come first in the result.

    Returns
    -------
    pandas.DataFrame
        Numerical columns followed by the generated dummy columns.
    """
    wanted_columns = numerical_data + categorical_data
    selected = df.copy()[wanted_columns]
    # CATEGORICAL DATA
    return pd.get_dummies(selected, columns=categorical_data)

In [None]:
categorical_data = ['location','car_type']
numerical_data = ['price','mileage','timestamp_year','mileage_per_year','score','age']
train_feature_ = get_feature(train_feature, categorical_data, numerical_data)
test_feature_ = get_feature(test_feature, categorical_data, numerical_data)
# Each frame is one-hot encoded on its own, so a category that occurs in only
# one of them produces different columns. Align the test frame to the training
# layout: dummy columns missing from test are added as 0, columns unknown to
# train are dropped, and the column order is made identical.
test_feature_ = test_feature_.reindex(columns=train_feature_.columns, fill_value=0)

In [None]:
# Inspect the one-hot encoded training frame.
train_feature_

# Train Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression

In [None]:
# Separate the target from the predictors, then hold out 20% of the rows
# for validation (fixed seed for a repeatable split).
y = train_feature_['price']
X = train_feature_.drop(columns='price')
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Same target/predictor separation for the test frame.
y_test = test_feature_['price']
X_test = test_feature_.drop(columns='price')

In [None]:
def train_model(X_train, y_train, learning_rate, n_estimators, max_depth):
    """Fit a GradientBoostingRegressor (random_state=0) and return it.

    Parameters
    ----------
    X_train, y_train
        Training predictors and target, passed straight to ``fit``.
    learning_rate, n_estimators, max_depth
        Forwarded unchanged to ``GradientBoostingRegressor``.

    Returns
    -------
    GradientBoostingRegressor
        The fitted estimator.
    """
    hyperparams = dict(
        random_state=0,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    regressor = GradientBoostingRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    return regressor

In [None]:
# Baseline fit: learning_rate=0.1, n_estimators=100, max_depth=7.
model = train_model(X_train, y_train,0.1,100,7)

In [None]:
# Predict prices for the held-out validation rows.
prediction = model.predict(X_val)

In [None]:
def eval_acc(y_val, prediction):
    """Score predictions against the true values.

    Returns
    -------
    tuple
        ``(mean_squared_error, r2_score)`` - note that the first element is
        the *squared* error; callers take the square root to report RMSE.
    """
    mse = mean_squared_error(y_val, prediction)
    r_squared = r2_score(y_val, prediction)
    return mse, r_squared

In [None]:
def pipeline(learning_rate=0.1, n_estimators=100, max_depth=7):
    """Train and score a model on the notebook-level data splits.

    Uses the globals ``X_train``/``y_train``/``X_val``/``y_val`` for the
    validation score and ``X``/``y``/``X_test``/``y_test`` for the test score.

    Returns
    -------
    tuple
        ``(validation_scores, test_scores, model)`` where both score entries
        are the ``(MSE, R-squared)`` pairs produced by ``eval_acc`` and
        ``model`` is the estimator refitted on the full training data.
    """
    params = (learning_rate, n_estimators, max_depth)

    # 1) fit on the training split, score on the validation split
    val_model = train_model(X_train, y_train, *params)
    val_scores = eval_acc(y_val, val_model.predict(X_val))

    # 2) refit on all training rows, score on the test set
    final_model = train_model(X, y, *params)
    test_scores = eval_acc(y_test, final_model.predict(X_test))

    return val_scores, test_scores, final_model

In [None]:
# Run the pipeline once and report RMSE / R-squared for both evaluations.
acc, acc2, model = pipeline(learning_rate=0.1, n_estimators=50, max_depth=4)
report = [('Train dataset Validation', acc), ('Test dataset Validation', acc2)]
for position, (heading, (mse, r_squared)) in enumerate(report):
    if position:
        # separator between the two sections only
        print('-----------------------------------------------')
    print(heading)
    print('RMSE: ', np.sqrt(mse))
    print('R-Squared: ', r_squared)

In [None]:
# Feature importances of the fitted model, most important first.
# Built column-wise so 'importance' keeps a numeric dtype; stacking the two
# sequences as rows and transposing produced object-dtype columns.
feat_imp = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
feat_imp.sort_values(by='importance', ascending=False)

In [None]:
# Validation R-squared as a function of tree depth (1 to 15).
depths = range(1, 16)
acc_array = []
for depth in depths:
    acc, acc2, model = pipeline(learning_rate=0.1, n_estimators=50, max_depth=depth)
    acc_array.append(acc[1])
plt.figure(figsize=(10,6))
plt.plot(depths, acc_array)
plt.xlabel('Max Depth')
plt.ylabel('R-Squared')

In [None]:
# Validation R-squared as a function of the number of boosting stages.
estimator_counts = [10,50,100,250,500,1000]
acc_array = []
for count in estimator_counts:
    acc, acc2, model = pipeline(learning_rate=0.1, n_estimators=count, max_depth=4)
    acc_array.append(acc[1])
plt.figure(figsize=(10,6))
plt.plot(estimator_counts, acc_array)
plt.xlabel('N_estimators')
plt.ylabel('R-Squared')

# Recursive Feature Elimination with Cross-Validation

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
# Recursive feature elimination: one feature is removed per step and each
# subset is scored by cross-validated negative MSE.
est = GradientBoostingRegressor(
    random_state=0,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=7,
)
rfecv = RFECV(
    estimator=est,
    step=1,
    scoring='neg_mean_squared_error',
)
rfecv.fit(X_train, y_train)

In [None]:
# Number of features retained by the fitted RFECV selector.
print('Optimal Number of Features: ', rfecv.n_features_)

In [None]:
# Plot number of features VS. cross-validation scores
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed later; newer
# releases expose the mean score per feature subset through `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
# The selector is scored with 'neg_mean_squared_error' (regression), so the
# axis shows negative MSE rather than a classification count.
plt.ylabel("Cross validation score (negative mean squared error)")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
# Ranking assigned by RFECV to every feature (lowest rank first), top five.
rfecv_df = pd.DataFrame({
    'feature': X_train.columns,
    'ranking': rfecv.ranking_,
})
rfecv_df.sort_values(by='ranking').head(5)