In [1]:
import numpy as np
import pandas as pd

from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, IsolationForest
from sklearn.metrics import r2_score, f1_score, mean_squared_error
from sklearn.neighbors import LocalOutlierFactor
# from xgboost import XGBRegressor

In [None]:

# Load the competition files: training features, training labels and test features.
X_train = pd.read_csv('../input/used-car-price-dataset-competition-format/X_train.csv')
y_train = pd.read_csv('../input/used-car-price-dataset-competition-format/y_train.csv')
X_test = pd.read_csv('../input/used-car-price-dataset-competition-format/X_test.csv')

# Build the modelling frames without the carID column. The raw frames are
# kept untouched because X_test['carID'] is needed later for the submission.
X = X_train.drop(columns=['carID'])
X_TEST = X_test.drop(columns=['carID'])

# Regression target.
Y = y_train['price']

# Columns holding categorical (object-dtype) values that must be turned into
# integer codes before they can be fed to the estimators.
object_cols = X_TEST.select_dtypes(include='object').columns

for col in object_cols:
    # Fit ONE encoder per column on the union of train and test values so a
    # given category maps to the same integer in both frames. Fitting two
    # independent encoders (one per frame) can assign different codes to the
    # same category whenever the sets of observed values differ.
    encoder = LabelEncoder().fit(pd.concat([X[col], X_TEST[col]], ignore_index=True))
    X[col] = encoder.transform(X[col])
    X_TEST[col] = encoder.transform(X_TEST[col])

    
# Columns that were numeric in the raw data. This must be derived from
# `object_cols`: by this point the object columns have already been
# label-encoded to integers, so `select_dtypes(exclude='object')` would match
# every column and let arbitrary category codes influence the distance-based
# outlier detection and the skew statistics below.
not_object_cols = X.columns[~X.columns.isin(object_cols)]

# Outlier removal with Local Outlier Factor.
# fit_predict labels inliers as 1 and outliers as -1.
clf = LocalOutlierFactor(n_neighbors=5)
outlier = clf.fit_predict(X[not_object_cols])

# Keep only inlier rows, applying the same mask to features and target so
# they stay aligned. `.copy()` makes X an independent frame so that later
# column assignments do not operate on a slice of the original.
inlier_mask = outlier > 0
X = X.loc[inlier_mask].copy()
Y = Y.loc[inlier_mask]

# Columns whose sample skewness exceeds 1 in the training and in the test frame.
skew_features_x = {col for col, value in X[not_object_cols].apply(skew).items() if value > 1}
skew_features_x_test = {col for col, value in X_TEST[not_object_cols].apply(skew).items() if value > 1}

# Only columns that are skewed in both frames are rescaled.
skew_features = skew_features_x & skew_features_x_test

# NOTE(review): standardisation shifts and rescales a column but does not
# change its skewness; if reducing skew is the goal, a transform such as
# np.log1p may be what was intended - confirm.
for col in sorted(skew_features):
    # Fit the scaler on the training data only and reuse it for the test data,
    # so both frames are expressed on the same scale. Fitting a second scaler
    # on the test set would map identical raw values to different scaled
    # values in train and test.
    scaler = StandardScaler().fit(X[[col]])
    X[col] = scaler.transform(X[[col]]).ravel()
    X_TEST[col] = scaler.transform(X_TEST[[col]]).ravel()
    

# Hold out 20% of the (outlier-filtered) training data for validation.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X, Y, test_size=0.2, random_state=1002)

# Shared hyper-parameters for every candidate regressor.
param = {'n_estimators': 100, 'random_state': 42}

# Candidate models keyed by a short name used in the result report.
algos = {
    'rf': RandomForestRegressor(**param),
    'gb': GradientBoostingRegressor(**param),
    'ada': AdaBoostRegressor(**param),
}

# XGBoost is optional: the top-level import is commented out, so referencing
# XGBRegressor unconditionally raises NameError. Import it here and add the
# model only when the package is installed.
try:
    from xgboost import XGBRegressor
except ImportError:
    pass
else:
    # 'rmse' is a regression metric; 'error' is the binary classification
    # error rate and is not meaningful for a price regressor.
    algos['xgb'] = XGBRegressor(eval_metric='rmse', **param)

def modeling(name, algo):
    """Fit one regressor on the training split and score it on the validation split.

    Uses the module-level X_TRAIN / Y_TRAIN for fitting and X_VALID / Y_VALID
    for evaluation, and prints a one-line summary of the scores.

    Parameters
    ----------
    name : str
        Short label for the model, used in the printed report.
    algo : estimator
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``.

    Returns
    -------
    tuple
        ``(name, algo, r2, mse, rmse)`` where ``algo`` is the fitted estimator.
    """
    y_pred = algo.fit(X_TRAIN, Y_TRAIN).predict(X_VALID)

    # The `squared` keyword of mean_squared_error is deprecated and removed in
    # recent scikit-learn releases; take the square root explicitly so this
    # works on every version.
    mse = mean_squared_error(Y_VALID, y_pred)
    rmse = np.sqrt(mse)

    ele = (name, algo, r2_score(Y_VALID, y_pred), mse, rmse)
    print('name:{0} r2:{2} mse:{3} rmse:{4}'.format(*ele))
    return ele

# Evaluate every candidate and rank by validation R^2 (index 2 of each tuple),
# best first. A single sort is sufficient; sorting the already-sorted list a
# second time with the same key changes nothing.
results = sorted(
    (modeling(name, algo) for name, algo in algos.items()),
    key=lambda tup: tup[2],
    reverse=True,
)

# Predict the test set with the top-ranked fitted model.
best_model = results[0][1]
test_pred = best_model.predict(X_TEST)

# Write the submission file: one predicted price per carID.
result = pd.DataFrame({'carID': X_test['carID'], 'price': test_pred})
result.to_csv('000000.csv', index=False)

# Score the selected model against the test labels shipped with the dataset.
y_test = pd.read_csv('../input/used-car-price-dataset-competition-format/test_label/y_test.csv')
score = r2_score(y_test['price'], test_pred)

# The `squared` keyword of mean_squared_error is deprecated and removed in
# recent scikit-learn releases; derive RMSE from MSE explicitly instead.
mse = mean_squared_error(y_test['price'], test_pred)
rmse = np.sqrt(mse)
print('final name: {0} r2:{1} mse:{2} rmse:{3}'.format(results[0][0], score, mse, rmse))

# Read the submission back and show its first rows as a sanity check.
final = pd.read_csv('./000000.csv')
print(final.head(2))
