In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import os
from datetime import datetime

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
INPUT_DIR = '/content/drive/MyDrive/signate/Student Cup 2023/input'

df = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
df

Unnamed: 0,id,region,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,price
0,0,nashville,1949,bmw,excellent,6 cylinders,gas,115148,clean,manual,rwd,mid-size,convertible,orange,,27587
1,1,state college,2013,toyota,fair,8 cylinders,gas,172038,clean,automatic,rwd,full-size,sedan,silver,pa,4724
2,2,wichita,1998,ford,good,6 cylinders,gas,152492,clean,automatic,fwd,full-size,SUV,silver,ks,10931
3,3,albany,2014,ford,excellent,4 cylinders,gas,104118,clean,manual,fwd,mid-size,SUV,blue,ny,16553
4,4,redding,2005,ford,excellent,6 cylinders,gas,144554,clean,manual,fwd,mid-size,sedan,red,ca,5158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27527,27527,williamsport,2008,ford,good,6 cylinders,gas,26660,clean,automatic,rwd,compact,truck,black,pa,32212
27528,27528,tulsa,2007,ford,excellent,8 cylinders,gas,108072,clean,automatic,rwd,full-size,pickup,black,,5400
27529,27529,rochester,2019,jeep,like new,6 cylinders,gas,139908,clean,automatic,4wd,mid-size,SUV,white,ny,22227
27530,27530,rochester,2007,jeep,excellent,6 cylinders,gas,112326,clean,automatic,4wd,mid-size,sedan,white,ny,3054


In [58]:
# Identify categorical and numerical columns
categorical_cols = ['region', 'manufacturer', 'condition', 'cylinders', 'fuel',
                    'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state',
                    'state_region', 'condition_fuel', 'color_type'
                    ]

In [59]:
# Define preprocessing function
def preprocess(df):
    # Fill missing values
    # df['fuel'].fillna(df['fuel'].mode()[0], inplace=True)
    # df['title_status'].fillna(df['title_status'].mode()[0], inplace=True)
    # df['type'].fillna(df['type'].mode()[0], inplace=True)
    # df['state'].fillna(df['state'].mode()[0], inplace=True)


    # # Label encoding for features with many unique values
    # le = LabelEncoder()
    # df['region'] = le.fit_transform(df['region'])
    # df['manufacturer'] = le.fit_transform(df['manufacturer'])

    # One-hot encoding for other categorical features
    # df = pd.get_dummies(df, columns=['condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state', 'region', 'manufacturer'])
    df['manufacturer'] = df['manufacturer'].str.lower()

    current_year = datetime.now().year
    df['age'] = current_year - df['year']

    # 2. Number of cylinders
    df['cylinders_num'] = df['cylinders'].str.extract('(\d+)').astype(float)
    df['cylinders_num'].fillna(0, inplace=True)

    # 3. Combined feature of state and region
    df['state_region'] = df['state'] + '_' + df['region']

    # Assuming df['age'] > 0 to avoid division by zero
    df.loc[df['age'] == 0, 'age'] = 1

    # 4. Odometer reading per year
    df['odometer_per_year'] = df['odometer'] / df['age']

    # 5. Combined feature of condition and fuel type
    df['condition_fuel'] = df['condition'] + '_' + df['fuel']

    # 6. Combined feature of color and type
    df['color_type'] = df['paint_color'] + '_' + df['type']

    df.loc[df['year'] > 2023, 'year'] = 2023

    return df

# Preprocess the data
df = preprocess(df)

df = pd.get_dummies(df, columns=categorical_cols)

# Prepare data for model training
X = df.drop(['price', 'id'], axis=1)
y = df['price']

# Initialize the model
model = lgb.LGBMRegressor()

X.columns = X.columns.str.replace('[<,>,:,\\,/,|,?,*,\x00-\x1F,\x7F]', '')

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=1)
mape_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate MAPE score
    mape_score = mean_absolute_percentage_error(y_test, predictions)
    mape_scores.append(mape_score)

# Calculate average MAPE score
average_mape_score = np.mean(mape_scores)
print(average_mape_score)
print("Average MAPE score:", average_mape_score)


  X.columns = X.columns.str.replace('[<,>,:,\\,/,|,?,*,\x00-\x1F,\x7F]', '')


0.6586001823299545
Average MAPE score: 0.6586001823299545
