In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from datetime import datetime

In [25]:
INPUT_DIR = '/content/drive/MyDrive/signate/Student Cup 2023/input'

df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
df

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,rwd,mid-size,convertible,orange,,27587
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,rwd,full-size,sedan,silver,pa,4724
2,2,wichita,1998,ford,good,6 cylinders,gas,152492,clean,automatic,fwd,full-size,SUV,silver,ks,10931
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118,clean,manual,fwd,mid-size,SUV,blue,ny,16553
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554,clean,manual,fwd,mid-size,sedan,red,ca,5158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,williamsport,2008,ford,good,6 cylinders,gas,26660,clean,automatic,rwd,compact,truck,black,pa,32212
27528,27528,tulsa,2007,ford,excellent,8 cylinders,gas,108072,clean,automatic,rwd,full-size,pickup,black,,5400
27529,27529,rochester,2019,jeep,like new,6 cylinders,gas,139908,clean,automatic,4wd,mid-size,SUV,white,ny,22227
27530,27530,rochester,2007,jeep,excellent,6 cylinders,gas,112326,clean,automatic,4wd,mid-size,sedan,white,ny,3054


In [27]:
# Identify categorical and numerical columns
categorical_cols = ['region', 'manufacturer', 'condition', 'cylinders', 'fuel',
                    'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state',
                    'state_region', 'condition_fuel', 'color_type'
                    ]
numerical_cols = ['year', 'odometer']

# Handle missing values and outliers
df['fuel'].fillna(df['fuel'].mode()[0], inplace=True)
df['title_status'].fillna(df['title_status'].mode()[0], inplace=True)
df['type'].fillna(df['type'].mode()[0], inplace=True)
df['state'].fillna(df['state'].mode()[0], inplace=True)

# df['fuel'].fillna(-999, inplace=True)
# df['title_status'].fillna(-999, inplace=True)
# df['type'].fillna(-999, inplace=True)
# df['state'].fillna(-999, inplace=True)

df['manufacturer'] = df['manufacturer'].str.lower()

current_year = datetime.now().year
df['age'] = current_year - df['year']

# 2. Number of cylinders
df['cylinders_num'] = df['cylinders'].str.extract('(\d+)').astype(float)
df['cylinders_num'].fillna(0, inplace=True)

# 3. Combined feature of state and region
df['state_region'] = df['state'] + '_' + df['region']

# Assuming df['age'] > 0 to avoid division by zero
df.loc[df['age'] == 0, 'age'] = 1

# 4. Odometer reading per year
df['odometer_per_year'] = df['odometer'] / df['age']

# 5. Combined feature of condition and fuel type
df['condition_fuel'] = df['condition'] + '_' + df['fuel']

# 6. Combined feature of color and type
df['color_type'] = df['paint_color'] + '_' + df['type']

df.loc[df['year'] > 2023, 'year'] = 2023

# label_encoder = LabelEncoder()
# for col in categorical_cols:
#     df[col] = label_encoder.fit_transform(df[col])

df = pd.get_dummies(df, columns=categorical_cols)

print(df.isnull().sum().sum())
# df

0


In [None]:
# Define predictors and target
X = df.drop(['id', 'price'], axis=1)
y = df['price']

# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline for preprocessing and model
pipeline = Pipeline([
    ('scaling', StandardScaler()), # Data normalization
    ('clf', RandomForestRegressor(random_state=0)) # Model
])

# Define a grid for hyperparameters tuning
param_grid = {
    'clf__n_estimators': [100, 200, 300], # Example of parameters to tune, you should adjust this according to your needs and computational capacity
    'clf__max_depth': [None, 10, 20], # Same here
}

# Define custom scorer
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape_scorer = make_scorer(mape, greater_is_better=False)

# Implement Grid Search CV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=mape_scorer, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Preprocessing of validation data, get predictions
preds = best_model.predict(X_val)

# Evaluate the model
mape_score = mape(y_val, preds)

mape_score





64.54905544365866

In [None]:
df_test = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
df_test