In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [22]:
train_data = pd.read_csv('train.csv', delimiter=',', dtype=str)
test_data = pd.read_csv('test.csv', delimiter=',', dtype=str)
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [23]:
mappingTitle = {'Yes':1, 'No':0}
mappingAccident = {'None reported':0, 'At least 1 accident or damage reported':1}
train_data['clean_title'] = train_data['clean_title'].map(mappingTitle)
test_data['clean_title'] = test_data['clean_title'].map(mappingTitle)
train_data['accident'] = train_data['accident'].map(mappingAccident)
test_data['accident'] = test_data['accident'].map(mappingAccident)
train_data.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,0,1,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,0,1,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,0,1,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,0,1,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,0,1,7850


In [24]:
train_data['price'] = pd.to_numeric(train_data['price'], errors='coerce')

In [25]:
from category_encoders import TargetEncoder

# Encode ext_col
target_encoder_ext = TargetEncoder(cols=['ext_col'])
train_data['ext_color_encoded'] = target_encoder_ext.fit_transform(train_data[['ext_col']], train_data['price'])
train_data.drop('ext_col', axis=1, inplace=True)
test_data['ext_color_encoded'] = target_encoder_ext.transform(test_data[['ext_col']])
test_data.drop('ext_col', axis=1, inplace=True)

# Encode int_col
target_encoder_int = TargetEncoder(cols=['int_col'])
train_data['int_color_encoded'] = target_encoder_int.fit_transform(train_data[['int_col']], train_data['price'])
train_data.drop('int_col', axis=1, inplace=True)
test_data['int_color_encoded'] = target_encoder_int.transform(test_data[['int_col']])
test_data.drop('int_col', axis=1, inplace=True) 

# Encode model
target_encoder_model = TargetEncoder(cols=['model'])
train_data['model_encoded'] = target_encoder_model.fit_transform(train_data[['model']], train_data['price'])
train_data.drop('model', axis=1, inplace=True)
test_data['model_encoded'] = target_encoder_model.transform(test_data[['model']])
test_data.drop('model', axis=1, inplace=True)

target_encoder_model = TargetEncoder(cols=['brand'])
train_data['brand_encoded'] = target_encoder_model.fit_transform(train_data[['brand']], train_data['price'])
train_data.drop('brand', axis=1, inplace=True)
test_data['brand_encoded'] = target_encoder_model.transform(test_data[['brand']])
test_data.drop('brand', axis=1, inplace=True)

target_encoder_model = TargetEncoder(cols=['transmission'])
train_data['transmission_encoded'] = target_encoder_model.fit_transform(train_data[['transmission']], train_data['price'])
train_data.drop('transmission', axis=1, inplace=True)
test_data['transmission_encoded'] = target_encoder_model.transform(test_data[['transmission']])
test_data.drop('transmission', axis=1, inplace=True)


In [26]:
train_data = pd.get_dummies(train_data, columns=['fuel_type'], dtype=int)
train_data.drop('id', axis=1, inplace=True)
test_data = pd.get_dummies(test_data, columns=['fuel_type'], dtype=int)
test_id = test_data['id']
test_data.drop('id', axis=1, inplace=True)

In [27]:
import re

def extract_horsepower(text):
    match = re.search(r'(\d+\.?\d*)HP', text)
    return float(match.group(1)) if match else None

def extract_engine_size(text):
    match = re.search(r'(\d+\.?\d*)L', text)
    return float(match.group(1)) if match else None

def extract_cylinders(text):
    match = re.search(r'(\w+)(?:\s+Cylinder)', text)
    return match.group(1) if match else None

# Apply these functions to create new columns
train_data['horsepower'] = train_data['engine'].apply(extract_horsepower).astype(float)
train_data['engine_size'] = train_data['engine'].apply(extract_engine_size).astype(float)
train_data['cylinders'] = train_data['engine'].apply(extract_cylinders)
test_data['horsepower'] = test_data['engine'].apply(extract_horsepower).astype(float)
test_data['engine_size'] = test_data['engine'].apply(extract_engine_size).astype(float)
test_data['cylinders'] = test_data['engine'].apply(extract_cylinders)

In [28]:
train_data.drop('engine', axis=1, inplace=True)
test_data.drop('engine', axis=1, inplace=True)

In [29]:
train_data['cylinders'] = train_data['cylinders'].replace('V6', '6')
train_data['cylinders'] = train_data['cylinders'].fillna('0')
train_data['cylinders'] = train_data['cylinders'].astype(str)
train_data['cylinders'] = pd.to_numeric(train_data['cylinders'], errors='coerce').fillna(0).astype(int)
test_data['cylinders'] = test_data['cylinders'].replace('V6', '6')
test_data['cylinders'] = test_data['cylinders'].fillna('0')
test_data['cylinders'] = test_data['cylinders'].astype(str)
test_data['cylinders'] = pd.to_numeric(test_data['cylinders'], errors='coerce').fillna(0).astype(int)

In [30]:
train_data['model_year'] = train_data['model_year'].astype(int) - 1974
test_data['model_year'] = test_data['model_year'].astype(int) - 1974

In [31]:
train_data.head()

Unnamed: 0,model_year,milage,accident,clean_title,price,ext_color_encoded,int_color_encoded,model_encoded,brand_encoded,transmission_encoded,fuel_type_Diesel,fuel_type_E85 Flex Fuel,fuel_type_Gasoline,fuel_type_Hybrid,fuel_type_Plug-In Hybrid,fuel_type_not supported,fuel_type_–,horsepower,engine_size,cylinders
0,44,74349,0,1,11000,37614.687018,26723.43746,43169.955645,38154.063227,60508.125743,0,0,1,0,0,0,0,375.0,3.5,6
1,33,80000,0,1,8250,38413.911328,41290.661314,15520.375566,40276.029448,33931.317579,0,0,1,0,0,0,0,300.0,3.0,6
2,35,91491,0,1,15000,26120.155301,27862.756473,26187.727179,34840.403933,24939.541386,0,0,1,0,0,0,0,300.0,4.2,8
3,48,2437,0,1,63500,45155.86509,49465.203662,68657.674465,40276.029448,45993.777037,0,0,0,1,0,0,0,335.0,3.0,6
4,27,111000,0,1,7850,37774.187602,41290.661314,27810.962157,17526.114591,29971.027033,0,0,1,0,0,0,0,200.0,3.8,6


In [32]:
from sklearn.preprocessing import StandardScaler
columns_to_standardize = ['milage','ext_color_encoded','int_color_encoded','model_encoded','brand_encoded','transmission_encoded','horsepower','engine_size','cylinders']
scaler = StandardScaler()
train_data[columns_to_standardize] = scaler.fit_transform(train_data[columns_to_standardize])
test_data[columns_to_standardize] = scaler.fit_transform(test_data[columns_to_standardize])

In [33]:
from sklearn.model_selection import train_test_split

# Assuming your features are in X and target variable (price) is in y
X_train = train_data.drop('price', axis=1)
y_train = train_data['price']

In [34]:
from sklearn.impute import KNNImputer

# Define the columns to impute
columns_to_impute = ['horsepower', 'engine_size']

# Create the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Apply the imputer to the specified columns
X_train[columns_to_impute] = knn_imputer.fit_transform(X_train[columns_to_impute])
# X_test[columns_to_impute] = knn_imputer.fit_transform(X_test[columns_to_impute])
test_data[columns_to_impute] = knn_imputer.fit_transform(test_data[columns_to_impute])


In [35]:
# X_train.isnull().sum()
X_train.isnull().sum()

model_year                  0
milage                      0
accident                    0
clean_title                 0
ext_color_encoded           0
int_color_encoded           0
model_encoded               0
brand_encoded               0
transmission_encoded        0
fuel_type_Diesel            0
fuel_type_E85 Flex Fuel     0
fuel_type_Gasoline          0
fuel_type_Hybrid            0
fuel_type_Plug-In Hybrid    0
fuel_type_not supported     0
fuel_type_–                 0
horsepower                  0
engine_size                 0
cylinders                   0
dtype: int64

In [36]:
# Import necessary libraries
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Define the parameter grid for GridSearchCV
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],  # Regularization strength (alpha) for Ridge and Lasso
    'fit_intercept': [True, False],
    'copy_X': [True, False]
}

# # GridSearchCV for Ridge Regression
# ridge = Ridge()
# ridge_grid_search = GridSearchCV(estimator=ridge, param_grid=param_grid, 
#                                  cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
# ridge_grid_search.fit(X_train, y_train)
# best_ridge = ridge_grid_search.best_estimator_

# GridSearchCV for Lasso Regression
lasso = Lasso()
lasso_grid_search = GridSearchCV(estimator=lasso, param_grid=param_grid, 
                                 cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
lasso_grid_search.fit(X_train, y_train)
best_lasso = lasso_grid_search.best_estimator_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [18]:
y_pred = best_lasso.predict(test_data)

In [19]:
y_pred = pd.DataFrame(y_pred, columns=['price'])
print(y_pred)

              price
0      30385.326815
1      16534.868302
2      39313.168528
3      62694.679016
4      48607.271035
...             ...
36178  63289.045253
36179  15645.094891
36180  -9148.512591
36181  59433.126781
36182  51304.606637

[36183 rows x 1 columns]


In [110]:
y_pred = y_pred.iloc[:,0]
result = pd.DataFrame({'id': test_id, 'price': y_pred})

In [111]:
result.to_csv('submission.csv', index=False)