In [3]:
import pandas as pd
import numpy as np
# Read the training table and discard the 'Unnamed: 0' column stored in the CSV.
data = pd.read_csv("train.csv").drop(columns=['Unnamed: 0'])

# Display the remaining column names as the cell output.
data.columns

Index(['engine_capacity', 'type', 'registration_year', 'gearbox', 'power',
       'model', 'mileage', 'fuel', 'brand', 'damage', 'zipcode',
       'insurance_price', 'price'],
      dtype='object')

In [4]:
# Read the unlabeled test table and discard its 'Unnamed: 0' column as well.
data_test = pd.read_csv("test_no_target.csv").drop(columns=['Unnamed: 0'])

# Flag every training column holding at least one null, then keep the flagged names.
has_null = data.isnull().any()
cols_with_missing = has_null[has_null].index.tolist()
cols_with_missing

['engine_capacity',
 'type',
 'gearbox',
 'model',
 'fuel',
 'damage',
 'insurance_price']

In [88]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Target and feature matrix.
y = data['price']

X = data.drop(['price'], axis=1)

# 75/25 hold-out split with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=0)

# The model is trained on log-prices; y_valid is intentionally left on the
# original price scale, so predictions must be passed through np.exp before
# being compared with it.
y_train = np.log(y_train)

# Fill every missing value with the most frequent value of its column,
# learned on the training split only.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The imputer returns bare arrays. Re-attach BOTH the column names and the
# original row index: without the index the imputed frames get a fresh
# 0..n-1 index and no longer line up by label with y_train / y_valid, which
# still carry the shuffled index produced by train_test_split.
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
imputed_X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
imputed_test = pd.DataFrame(imputer.transform(data_test), columns=data_test.columns, index=data_test.index)

def types_change(old_df, new_df):
    """Cast columns of ``new_df`` to the dtypes they have in ``old_df``.

    Only columns present in both frames are touched. ``new_df`` is modified
    in place and nothing is returned.
    """
    for column in old_df.columns:
        if column not in new_df.columns:
            continue
        target_dtype = old_df[column].dtype
        new_df[column] = new_df[column].astype(target_dtype)
            
# Give each imputed frame the column dtypes of the frame it was built from.
for original, imputed in (
    (X_train, imputed_X_train),
    (X_valid, imputed_X_valid),
    (data_test, imputed_test),
):
    types_change(original, imputed)

In [89]:
# Names of the training columns stored with the "object" dtype.
categorical = [name for name, dtype in X_train.dtypes.items() if dtype == "object"]
categorical

['type', 'gearbox', 'model', 'fuel', 'brand']

In [90]:
# Categorical columns whose validation values all occur in the training split.
same_labels = [col for col in categorical if set(X_valid[col]) <= set(X_train[col])]

# Remaining categorical columns (validation contains values unseen in training).
# Built with a comprehension instead of list(set - set) so the result keeps the
# column order of `categorical`; set ordering of strings varies between
# interpreter runs, which would make the downstream one-hot column layout
# non-reproducible whenever more than one column lands here.
not_same_labels = [col for col in categorical if col not in same_labels]
not_same_labels

['model']

In [91]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Slice out the `same_labels` columns of each imputed frame.
label_X_train, label_X_valid, label_data_test = (
    frame[same_labels] for frame in (imputed_X_train, imputed_X_valid, imputed_test)
)

# Everything that is not categorical is carried through unchanged.
non_categorical_X_train, non_categorical_X_valid, non_categorical_data_test = (
    frame.drop(columns=categorical) for frame in (imputed_X_train, imputed_X_valid, imputed_test)
)

In [92]:
# Ordinal-encode the `same_labels` columns; the encoder is fitted on train only.
label_encoder = OrdinalEncoder()

# The encoder returns bare arrays, so each result is wrapped back into a frame
# that carries the row index of its imputed source and the original column
# names (keeping them distinct from the integer-named one-hot columns).
label_X_train = pd.DataFrame(
    label_encoder.fit_transform(label_X_train),
    index=imputed_X_train.index,
    columns=same_labels,
)
label_X_valid = pd.DataFrame(
    label_encoder.transform(label_X_valid),
    index=imputed_X_valid.index,
    columns=same_labels,
)
label_data_test = pd.DataFrame(
    label_encoder.transform(label_data_test),
    index=imputed_test.index,
    columns=same_labels,
)

In [93]:
# One-hot encode the `not_same_labels` columns. handle_unknown='ignore' makes
# categories unseen during fit encode as an all-zero row instead of raising.
#
# The encoder is left on its default sparse output and densified with
# .toarray(): the `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so passing either spelling ties the
# notebook to one range of library versions.
oneHot_encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only, then reuse the fitted encoder everywhere.
# Each result keeps the row index of the imputed frame it came from; the
# columns stay as the default integer labels 0..k-1.
oneHot_X_train = pd.DataFrame(
    oneHot_encoder.fit_transform(imputed_X_train[not_same_labels]).toarray(),
    index=imputed_X_train.index,
)
oneHot_X_valid = pd.DataFrame(
    oneHot_encoder.transform(imputed_X_valid[not_same_labels]).toarray(),
    index=imputed_X_valid.index,
)
oneHot_data_test = pd.DataFrame(
    oneHot_encoder.transform(imputed_test[not_same_labels]).toarray(),
    index=imputed_test.index,
)

In [94]:
# Stitch the three column groups back together side by side:
# pass-through columns, ordinal-encoded columns, one-hot columns.
final_X_train = pd.concat(
    (non_categorical_X_train, label_X_train, oneHot_X_train), axis="columns"
)
final_X_valid = pd.concat(
    (non_categorical_X_valid, label_X_valid, oneHot_X_valid), axis="columns"
)
final_data_test = pd.concat(
    (non_categorical_data_test, label_data_test, oneHot_data_test), axis="columns"
)
final_data_test.columns

Index([  'engine_capacity', 'registration_year',             'power',
                 'mileage',            'damage',           'zipcode',
         'insurance_price',              'type',           'gearbox',
                    'fuel',
       ...
                       235,                 236,                 237,
                       238,                 239,                 240,
                       241,                 242,                 243,
                       244],
      dtype='object', length=256)

In [95]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

def mape(Y_actual, predicted):
    """Return the mean absolute percentage error of ``predicted`` against ``Y_actual``.

    Computed as mean(|Y_actual - predicted| / |Y_actual|) * 100.
    """
    relative_error = (Y_actual - predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100

def score_dataset(X_train, X_valid, y_train, y_valid, estimators):
    model = XGBRegressor(n_estimators = estimators, random_state=0, learning_rate=0.2)
    model.fit(X_train, y_train, verbose=False)
    predicts = model.predict(X_valid)
    data_submit = pd.read_csv("sample_submission.csv")
    data_submit["Predicted"] = np.exp(predicts)
    data_submit.to_csv("submit.csv", index=False)
    return 0#mape(y_valid, np.exp2(predicts))

In [96]:
for est in [430]:
    print(f'Estimators: {est} - Score: {score_dataset(final_X_train, final_data_test, y_train, y_valid, est)}')

Estimators: 430 - Score: 0
