In [1]:
# importing the required libraries
import pandas as pd
import numpy as np

In [15]:
# Load the dataset from the 'Sheet1' worksheet of the sample.xlsx workbook
# (an Excel file read with read_excel, not a CSV)
file_path = "sample.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')
# Print the loaded frame to inspect its contents
print(df)

                                    name  year  selling_price  km_driven  \
0                 Maruti Swift Dzire VDI  2014         450000     145500   
1           Skoda Rapid 1.5 TDI Ambition  2014         370000     120000   
2               Honda City 2017-2020 EXi  2006         158000     140000   
3              Hyundai i20 Sportz Diesel  2010         225000     127000   
4                 Maruti Swift VXI BSIII  2007         130000     120000   
5          Hyundai Xcent 1.2 VTVT E Plus  2017         440000      45000   
6           Maruti Wagon R LXI DUO BSIII  2007          96000     175000   
7                     Maruti 800 DX BSII  2001          45000       5000   
8                       Toyota Etios VXD  2011         350000      90000   
9   Ford Figo Diesel Celebration Edition  2013         200000     169000   
10       Renault Duster 110PS Diesel RxL  2014         500000      68000   
11                         Maruti Zen LX  2005          92000     100000   
12          

In [18]:
### 1. Handling Missing Values ###
# Fill missing numerical values with the column median, which is less
# sensitive to extreme values than the mean.
# Series.fillna returns a new Series instead of modifying the column, so each
# result is assigned back; without the assignment df would keep its NaNs.
for column in ['mileage(km/ltr/kg)', 'engine', 'seats']:
    df[column] = df[column].fillna(df[column].median())

# Show the filled 'seats' column as the cell output
df['seats']

0     5.0
1     5.0
2     5.0
3     5.0
4     5.0
5     5.0
6     5.0
7     4.0
8     5.0
9     5.0
10    5.0
11    5.0
12    5.0
13    5.0
14    5.0
15    5.0
16    7.0
17    5.0
18    5.0
Name: seats, dtype: float64

In [19]:
# Parse 'max_power' (stored as object) into numbers; entries that cannot be
# parsed become NaN because of errors='coerce'
df['max_power'] = df['max_power'].pipe(pd.to_numeric, errors='coerce')

In [21]:
# Fill missing values in 'max_power' with the median value.
# fillna returns a new Series, so it is assigned back to the column;
# otherwise the NaNs would remain in df.
df['max_power'] = df['max_power'].fillna(df['max_power'].median())

# Show the filled column as the cell output
df['max_power']

0      74.00
1     103.52
2      78.00
3      90.00
4      88.20
5      81.86
6      57.50
7      37.00
8      67.10
9      68.10
10    108.45
11     60.00
12     73.90
13     76.00
14     67.00
15     82.00
16     88.50
17     90.00
18     46.30
Name: max_power, dtype: float64

In [22]:
### 2. Removing Duplicates ###
# Drop rows that are exact duplicates of an earlier row (the first occurrence
# is kept) to ensure data consistency.
# df is rebound to the returned frame rather than mutated with inplace=True,
# matching the `df = ...` style used by the other preprocessing steps.
df = df.drop_duplicates()

In [23]:
### 3. Handling Outliers using the IQR Method ###
def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """
    Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, keeping their original index labels.

    Notes
    -----
    Rows where ``column`` is NaN are dropped as well, because NaN never
    satisfies the range check. Impute missing values first if those rows
    should be kept.
    """
    values = df[column]
    q1 = values.quantile(0.25)  # First quartile (25th percentile)
    q3 = values.quantile(0.75)  # Third quartile (75th percentile)
    iqr = q3 - q1  # Interquartile range
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # between() is inclusive on both ends by default, i.e. >= lower and <= upper
    return df[values.between(lower_bound, upper_bound)]

# Run the IQR filter over each numerical column in turn; every pass operates
# on the rows that survived the previous pass
outlier_columns = ('selling_price', 'km_driven', 'mileage(km/ltr/kg)', 'engine', 'max_power')
for col in outlier_columns:
    df = remove_outliers(df, col)

### 4. Encoding Categorical Data ###
# One-hot encode the categorical columns so they become numeric indicator columns.
# drop_first=True removes one indicator per feature, which avoids the dummy
# variable trap.
categorical_columns = ['fuel', 'seller_type', 'transmission', 'owner']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

In [24]:
# Print the current contents of df for inspection
print(df)

                                    name  year  selling_price  km_driven  \
0                 Maruti Swift Dzire VDI  2014         450000     145500   
1           Skoda Rapid 1.5 TDI Ambition  2014         370000     120000   
2               Honda City 2017-2020 EXi  2006         158000     140000   
3              Hyundai i20 Sportz Diesel  2010         225000     127000   
4                 Maruti Swift VXI BSIII  2007         130000     120000   
5          Hyundai Xcent 1.2 VTVT E Plus  2017         440000      45000   
6           Maruti Wagon R LXI DUO BSIII  2007          96000     175000   
7                     Maruti 800 DX BSII  2001          45000       5000   
8                       Toyota Etios VXD  2011         350000      90000   
9   Ford Figo Diesel Celebration Edition  2013         200000     169000   
10       Renault Duster 110PS Diesel RxL  2014         500000      68000   
11                         Maruti Zen LX  2005          92000     100000   
12          

In [27]:
### 5. Save Processed Dataset ###
# Save the cleaned and preprocessed dataset as a CSV file.
# NOTE(review): the destination is an absolute, machine-specific path, so this
# cell only works where that directory exists. Consider a path relative to the
# project instead.
output_dir = "C:/Users/Admin/Desktop/tybsc D98/Cars-Recommendation-System"
output_name = "processed_sample.csv"
output_path = f"{output_dir}/{output_name}"
df.to_csv(output_path, index=False)  # index=False: do not write the row index as a column

# Build the message from the file name actually written, so the two cannot
# drift apart (it previously named a different file)
print(f"✅ Data preprocessing completed. Cleaned dataset saved as '{output_name}'.")

✅ Data preprocessing completed. Cleaned dataset saved as 'processed_Cardekho_Dataset.csv'.
