In [1]:
# Import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import preprocessing and evaluation modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Import machine learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR

In [2]:
# Load the raw car-price data; the relative path is resolved against the
# notebook's working directory.
csv_path = "CarPrice_Assignment (2).csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [4]:
# Preview the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470.0
204,205,-1,volvo 264gl,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,19,25,22625.0


In [5]:
# Check for missing values (number of nulls per column)
print(df.isnull().sum())

# Split CarName on the first space only: the leading token becomes CarBrand
# and everything after it becomes Model. `n` is passed by keyword because
# the positional form is deprecated and pandas 2.0+ rejects it.
# Names without a space leave Model empty (None).
df[['CarBrand', 'Model']] = df['CarName'].str.split(' ', n=1, expand=True)

# CarName is now redundant with CarBrand + Model, so drop it
df = df.drop(columns=['CarName'])


car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


  df[['CarBrand', 'Model']] = df['CarName'].str.split(' ', 1, expand=True)


In [6]:
# Preview the first five rows to check the new CarBrand / Model columns.
df.head(n=5)

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarBrand,Model
0,1,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero,giulia
1,2,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero,stelvio
2,3,1,gas,std,two,hatchback,rwd,front,94.5,171.2,...,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero,Quadrifoglio
3,4,2,gas,std,four,sedan,fwd,front,99.8,176.6,...,3.19,3.4,10.0,102,5500,24,30,13950.0,audi,100 ls
4,5,2,gas,std,four,sedan,4wd,front,99.4,176.6,...,3.19,3.4,8.0,115,5500,18,22,17450.0,audi,100ls


In [7]:
# Standardize car brand names in the CarBrand column: misspellings and
# abbreviations are mapped onto one canonical spelling per brand.
brand_corrections = {
    'maxda': 'Mazda',
    'mazda': 'Mazda',
    'nissan': 'Nissan',
    'Nissan': 'Nissan',
    'porsche': 'Porsche',
    'porcshce': 'Porsche',
    'toyota': 'Toyota',
    'toyouta': 'Toyota',
    'vokswagen': 'Volkswagen',
    'volkswagen': 'Volkswagen',
    'vw': 'Volkswagen'
}

# Assign the result back instead of calling replace(..., inplace=True) on
# df['CarBrand']: an in-place call on a column selection is chained
# assignment, which warns on recent pandas and does not update `df` once
# Copy-on-Write is active.
df['CarBrand'] = df['CarBrand'].replace(brand_corrections)

# Verify the changes
print(df['CarBrand'].value_counts())


Toyota         32
Nissan         18
Mazda          17
mitsubishi     13
honda          13
Volkswagen     12
subaru         12
peugeot        11
volvo          11
dodge           9
buick           8
bmw             8
audi            7
plymouth        7
saab            6
Porsche         5
isuzu           4
jaguar          3
chevrolet       3
alfa-romero     3
renault         2
mercury         1
Name: CarBrand, dtype: int64


In [8]:
# Remove the Model column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second run is a
# no-op rather than a KeyError.
df = df.drop(columns=['Model'], errors='ignore')

In [9]:
# Show the frame after the Model column was removed; pandas abbreviates a
# long frame to its first and last rows in this display.
df

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,CarBrand
0,1,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,2,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,3,1,gas,std,two,hatchback,rwd,front,94.5,171.2,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,4,2,gas,std,four,sedan,fwd,front,99.8,176.6,...,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0,audi
4,5,2,gas,std,four,sedan,4wd,front,99.4,176.6,...,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0,audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,gas,std,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo
201,202,-1,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo
202,203,-1,gas,std,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo
203,204,-1,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,idi,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo


In [10]:
#X = df.drop('price', axis=1)
#y = df['price']


In [11]:
# NOTE: nothing in this cell runs. The first line is a commented-out
# train/test split, and the triple-quoted text below is a bare string
# expression, so `df` is not modified; the notebook merely echoes the string
# as the cell's output. It appears to be an earlier draft of the label
# encoding performed in a later cell -- TODO confirm and delete.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
'''categorical_columns = df.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

from sklearn.preprocessing import LabelEncoder

# Apply LabelEncoder to each categorical column
label_encoder = LabelEncoder()
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])'''


'categorical_columns = df.select_dtypes(include=[\'object\']).columns\nprint("Categorical columns:", categorical_columns)\n\nfrom sklearn.preprocessing import LabelEncoder\n\n# Apply LabelEncoder to each categorical column\nlabel_encoder = LabelEncoder()\nfor column in categorical_columns:\n    df[column] = label_encoder.fit_transform(df[column])'

In [12]:
from sklearn.preprocessing import LabelEncoder

# Fitted encoder per column, kept so codes can later be turned back into
# labels with label_encoders[column].inverse_transform(...)
label_encoders = {}

# Dictionary to store the label -> integer code mapping for each column
encoding_mappings = {}

# Encode every object-dtype (string) column in place and print its mapping
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    # A fresh encoder per column: refitting one shared instance would
    # discard the classes learned for the previous column.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])  # Apply label encoding
    label_encoders[column] = label_encoder

    # A label's code is its position in the sorted classes_ array, so
    # enumerate() reproduces transform(classes_) while yielding plain ints
    # (numpy integers print as np.int64(...) on NumPy 2).
    encoding_mappings[column] = {
        label: code for code, label in enumerate(label_encoder.classes_)
    }
    print(f"Encoding for '{column}': {encoding_mappings[column]}")

Encoding for 'fueltype': {'diesel': 0, 'gas': 1}
Encoding for 'aspiration': {'std': 0, 'turbo': 1}
Encoding for 'doornumber': {'four': 0, 'two': 1}
Encoding for 'carbody': {'convertible': 0, 'hardtop': 1, 'hatchback': 2, 'sedan': 3, 'wagon': 4}
Encoding for 'drivewheel': {'4wd': 0, 'fwd': 1, 'rwd': 2}
Encoding for 'enginelocation': {'front': 0, 'rear': 1}
Encoding for 'enginetype': {'dohc': 0, 'dohcv': 1, 'l': 2, 'ohc': 3, 'ohcf': 4, 'ohcv': 5, 'rotor': 6}
Encoding for 'cylindernumber': {'eight': 0, 'five': 1, 'four': 2, 'six': 3, 'three': 4, 'twelve': 5, 'two': 6}
Encoding for 'fuelsystem': {'1bbl': 0, '2bbl': 1, '4bbl': 2, 'idi': 3, 'mfi': 4, 'mpfi': 5, 'spdi': 6, 'spfi': 7}
Encoding for 'CarBrand': {'Mazda': 0, 'Nissan': 1, 'Porsche': 2, 'Toyota': 3, 'Volkswagen': 4, 'alfa-romero': 5, 'audi': 6, 'bmw': 7, 'buick': 8, 'chevrolet': 9, 'dodge': 10, 'honda': 11, 'isuzu': 12, 'jaguar': 13, 'mercury': 14, 'mitsubishi': 15, 'peugeot': 16, 'plymouth': 17, 'renault': 18, 'saab': 19, 'subaru'

In [13]:
# All names used here (np, pd, train_test_split, the metrics and the model
# classes) come from the import cell at the top of the notebook.

# Split the data into training and testing sets.
# `price` is the target; `car_ID` is dropped along with it so that it is not
# used as a feature.
X = df.drop(columns=['price', 'car_ID'])  # Features
y = df['price']                           # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models (default hyper-parameters; seeded where the estimator accepts
# random_state so that results are repeatable).
# NOTE(review): features are fitted unscaled although StandardScaler is
# imported; the negative R² recorded for SVR is presumably a consequence --
# consider wrapping scale-sensitive models in a scaling pipeline.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

# Train and evaluate each model, collecting one record per model
results = []

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on both splits so over-fitting shows up as a gap
    # between the train and test R²
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

    # Append results
    results.append({
        'Model': name,
        'R² (Train)': r2_train,
        'R² (Test)': r2_test,
        'RMSE (Test)': rmse_test
    })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)

# Sort by R² (Test), best model first
results_df = results_df.sort_values(by='R² (Test)', ascending=False)
print(results_df)


                       Model  R² (Train)  R² (Test)  RMSE (Test)
5              Random Forest    0.984630   0.960539  1764.985607
8                    XGBoost    0.998653   0.934663  2271.118066
6          Gradient Boosting    0.993805   0.927110  2398.804392
7                   AdaBoost    0.954228   0.916569  2566.400781
4              Decision Tree    0.998654   0.893762  2896.005444
9                   LightGBM    0.966443   0.866711  3243.818497
3      ElasticNet Regression    0.860664   0.819755  3772.168382
1           Ridge Regression    0.894902   0.805429  3919.215833
2           Lasso Regression    0.897959   0.801507  3958.509669
0          Linear Regression    0.897987   0.799362  3979.843164
10  Support Vector Regressor   -0.110890  -0.101955  9326.990695
