In [1]:
# Packages for EDA 
import pandas as pd 
import numpy as np 
import seaborn as sns 

# Data Preprocessing
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PowerTransformer

# Modeling
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
np.seterr(divide='ignore', invalid='ignore', over='ignore')


# Showing Plots inside notebook
%matplotlib inline  
sns.set(rc={'figure.figsize': [7, 14]}, font_scale=1.2) # Standard figure size for all 

In [2]:
df = pd.read_csv('train-data.csv')
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
1435,1435,Mahindra XUV500 W8 2WD,Mumbai,2012,37000,Diesel,Manual,First,16.0 kmpl,2179 CC,140 bhp,7.0,,7.75
333,333,Hyundai i20 Asta 1.2,Pune,2011,79152,Petrol,Manual,Second,18.5 kmpl,1197 CC,82.85 bhp,5.0,,3.3
5010,5010,Volkswagen Ameo 1.2 MPI Highline 16 Alloy,Mumbai,2016,16650,Petrol,Manual,First,17.0 kmpl,1198 CC,74 bhp,5.0,,5.5
4595,4595,Audi Q3 35 TDI Quattro Premium Plus,Coimbatore,2017,42366,Diesel,Automatic,First,15.73 kmpl,1968 CC,174.33 bhp,5.0,,31.93
1944,1944,Mahindra XUV500 AT W10 FWD,Mumbai,2016,41000,Diesel,Automatic,First,16.0 kmpl,2179 CC,140 bhp,7.0,,13.75


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


In [4]:
df.duplicated().sum()


0

In [5]:
# Remove duplicated row 
df.drop_duplicates(inplace=True)

In [6]:
df.drop("New_Price", axis=1, inplace=True)


In [7]:
# My solution 
from sklearn.impute import SimpleImputer

null_col = ['Mileage', 'Engine' , 'Power','Seats']

imputer = SimpleImputer(strategy='most_frequent')
df[null_col] = imputer.fit_transform(df[null_col])

In [8]:
df = df.convert_dtypes()


In [9]:
df['Seats'].replace(to_replace=0 ,value= df['Seats'].mode()[0],inplace=True)

In [10]:
df[df['Power'] == 'null bhp']


Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
76,76,Ford Fiesta 1.4 SXi TDCi,Jaipur,2008,111111,Diesel,Manual,First,17.8 kmpl,1399 CC,null bhp,5,2.0
79,79,Hyundai Santro Xing XL,Hyderabad,2005,87591,Petrol,Manual,First,0.0 kmpl,1086 CC,null bhp,5,1.3
89,89,Hyundai Santro Xing XO,Hyderabad,2007,73745,Petrol,Manual,First,17.0 kmpl,1086 CC,null bhp,5,2.1
120,120,Hyundai Santro Xing XL eRLX Euro III,Mumbai,2005,102000,Petrol,Manual,Second,17.0 kmpl,1086 CC,null bhp,5,0.85
143,143,Hyundai Santro Xing XO eRLX Euro II,Kochi,2008,80759,Petrol,Manual,Third,17.0 kmpl,1086 CC,null bhp,5,1.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5873,5873,Hyundai Santro Xing XO eRLX Euro II,Pune,2006,47200,Petrol,Manual,Second,17.0 kmpl,1086 CC,null bhp,5,1.2
5893,5893,Maruti Estilo LXI,Chennai,2008,51000,Petrol,Manual,Second,19.5 kmpl,1061 CC,null bhp,5,1.75
5925,5925,Skoda Laura Classic 1.8 TSI,Pune,2010,85000,Petrol,Manual,First,17.5 kmpl,1798 CC,null bhp,5,2.85
5943,5943,Mahindra Jeep MM 540 DP,Chennai,2002,75000,Diesel,Manual,First,0.0 kmpl,2112 CC,null bhp,6,1.7


In [11]:
indx = df[df['Power'] == 'null bhp'].index


In [12]:
df.drop(indx,axis=0,inplace=True)


In [13]:
dirty_cols = ['Mileage', 'Engine', 'Power']


In [14]:
for col in dirty_cols:
    df[col] = df[col].apply(lambda x : float(x.split()[0]))

In [15]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
3366,3366,Honda City i DTEC SV,Mumbai,2014,25000,Diesel,Manual,First,26.0,1498.0,98.6,5,6.15
5801,5801,Toyota Corolla Altis G,Delhi,2013,64000,Petrol,Manual,Second,14.53,1798.0,138.1,5,5.2
1806,1806,Hyundai EON D Lite Plus,Kolkata,2013,36000,Petrol,Manual,First,21.1,814.0,55.2,5,1.95
499,499,Honda Brio VX AT,Mumbai,2014,35000,Petrol,Automatic,First,16.5,1198.0,86.8,5,4.25
608,608,BMW 5 Series 2013-2017 530d M Sport,Coimbatore,2016,20967,Diesel,Automatic,First,14.69,2993.0,258.0,5,44.56


In [16]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5912 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         5912 non-null   Int64  
 1   Name               5912 non-null   string 
 2   Location           5912 non-null   string 
 3   Year               5912 non-null   Int64  
 4   Kilometers_Driven  5912 non-null   Int64  
 5   Fuel_Type          5912 non-null   string 
 6   Transmission       5912 non-null   string 
 7   Owner_Type         5912 non-null   string 
 8   Mileage            5912 non-null   float64
 9   Engine             5912 non-null   float64
 10  Power              5912 non-null   float64
 11  Seats              5912 non-null   Int64  
 12  Price              5912 non-null   Float64
dtypes: Float64(1), Int64(4), float64(3), string(5)
memory usage: 675.5 KB


In [17]:
df["Brand"] = df["Name"].apply(lambda x : x.split()[0])


In [18]:
df['Brand'].nunique()

30

In [19]:
df["Name"] = df["Name"].apply(lambda x : " ".join(x.split()[:2]))


In [20]:
df = df.convert_dtypes()


In [21]:
df.sample(5)


Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand
4313,4313,Maruti Wagon,Hyderabad,2016,35000,Petrol,Automatic,First,20.51,998,67.0,5,5.5,Maruti
5099,5099,Maruti Alto,Coimbatore,2012,65349,Petrol,Manual,First,20.92,998,67.1,5,2.81,Maruti
826,826,Volkswagen Polo,Ahmedabad,2013,55000,Diesel,Manual,First,22.07,1199,73.9,5,4.25,Volkswagen
297,297,Hyundai i10,Pune,2012,45275,Petrol,Manual,First,20.36,1197,78.9,5,2.8,Hyundai
3139,3139,Hyundai i10,Coimbatore,2013,32776,Petrol,Manual,First,20.36,1197,78.9,5,4.64,Hyundai


In [22]:
df.to_csv("Cleaned_Data.csv")


In [23]:
numerical_cols = ['Kilometers_Driven' , 'Mileage' , 'Engine' , 'Power','Price']

In [24]:
df_nums = df.copy()
from datasist.structdata import detect_outliers 

outliears = detect_outliers(df_nums[numerical_cols],0,df_nums[numerical_cols].columns)
df_nums.drop(outliears,inplace=True)


In [26]:
transformation = {
    "First":3,
    "Second":2,
    "Third":1,
    "Fourth & Above":0
}

In [27]:
df['Owner_Type'] = df['Owner_Type'].map(transformation)


In [28]:
Nominal_data = ['Name','Location','Fuel_Type','Transmission','Brand']
binaryencoder = ce.BinaryEncoder(cols=Nominal_data)
df = binaryencoder.fit_transform(df)

In [29]:
Numerical_data = ['Year','Kilometers_Driven','Mileage','Engine','Power','Seats','Price']

In [30]:
# power = PowerTransformer()
new_df = pd.DataFrame(PowerTransformer().fit_transform(df), columns=df.columns, index=df.index)

In [31]:
# define dataset
X, y = new_df.drop("Price",axis=1) , new_df["Price"] 

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
reg = LinearRegression().fit(X_train,(y_train))
print("R-Squered Trian",round((reg.score(X_train,  (y_train))*100),2),'%')
print("R-Squered Test",round((reg.score(X_test,  (y_test))*100),2),'%')
y_pred = reg.predict(X_test)
print(f"RMSE: {mean_squared_error(y_pred,(y_test))}")

R-Squered Trian 87.81 %
R-Squered Test 88.4 %
RMSE: 0.11028251509761501


In [33]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
poly_reg.fit(X_train)

PolynomialFeatures()

In [34]:
X_train = poly_reg.transform(X_train)
X_test = poly_reg.transform(X_test)

In [35]:
reg = LassoCV().fit(X_train,(y_train))
print("R-Squered Trian",round((reg.score(X_train,  (y_train))*100),2),'%')
print("R-Squered Test",round((reg.score(X_test,  (y_test))*100),2),'%')
y_pred = reg.predict(X_test)
print(f"RMSE: {mean_squared_error(y_pred,(y_test))}")

R-Squered Trian 94.96 %
R-Squered Test 94.43 %
RMSE: 0.05299341980334996


In [36]:
reg = RidgeCV().fit(X_train,(y_train))
print("R-Squered Trian",round((reg.score(X_train,  (y_train))*100),2),'%')
print("R-Squered Test",round((reg.score(X_test,  (y_test))*100),2),'%')
y_pred = reg.predict(X_test)
print(f"RMSE: {mean_squared_error(y_pred,(y_test))}")

R-Squered Trian 95.31 %
R-Squered Test 94.16 %
RMSE: 0.05551218083440016


In [38]:
elastic_reg = ElasticNetCV().fit(X_train,(y_train))
print("R-Squered Trian",round((reg.score(X_train,  (y_train))*100),2),'%')
print("R-Squered Test",round((reg.score(X_test,  (y_test))*100),2),'%')
y_pred = reg.predict(X_test)
print(f"RMSE: {mean_squared_error(y_pred,(y_test))}")

R-Squered Trian 94.98 %
R-Squered Test 94.43 %
RMSE: 0.0529531214374046


In [41]:
filename = "used_car_model.pkl"
import pickle

In [42]:
with open(filename, 'wb') as file:
    # Write the model to the file
    pickle.dump(elastic_reg, file)