#### 1. Importing Libraries


In [185]:
from datetime import date

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error,mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

#### 2. Loading and Exploring the Data

In [186]:
# Load the car listings CSV (expected next to the notebook) into a DataFrame.
DATA_PATH = 'cardekho.csv'
df = pd.read_csv(DATA_PATH)

In [187]:
# Summary statistics of the numeric columns. Jupyter renders only the last
# expression of a cell automatically, so this intermediate result is shown
# explicitly with the notebook's built-in `display`.
display(df.describe())

# First rows of the raw data (rendered as the cell's output).
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [188]:
# Frequency tables / summaries for the individual columns. Only the last
# expression of a cell is rendered automatically, so every earlier result is
# passed to the notebook's built-in `display`; otherwise it is silently lost.
display(df.year.value_counts(ascending=False))
display(df.fuel.value_counts())
display(df.seller_type.value_counts())  # there are 3 types of seller_type
display(df.transmission.value_counts())  # there are 2 types of transmission

display(df['mileage(km/ltr/kg)'].describe())
display(df.seats.value_counts())

df.owner.value_counts()  # there are 5 types of owners

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [189]:
# Shape of the sub-frame holding every row with at least one missing value.
# Shown with `display` because it is not the last expression of the cell.
display(df[df.isna().any(axis=1)].shape)  # 221 rows have null values (about 3% of the total data)

# Number of rows that are exact copies of an earlier row.
df.duplicated().sum()  # 1202 duplicated rows

1202

#### 3. Data Cleaning and Preparation

In [190]:
# Drop the car name column.
# errors='ignore' makes the cell safe to re-run: without it a second
# execution raises KeyError because 'name' has already been removed.
df.drop(columns='name', inplace=True, errors='ignore')

In [191]:
# Remove rows that are exact copies of an earlier row, keeping the first
# occurrence of each.
# NOTE(review): 'name' has already been dropped by the previous cell, so rows
# that differed only by name are treated as duplicates here - confirm this is
# intended.
df.drop_duplicates(keep='first', inplace=True)

In [192]:
# Treat blank max_power entries (a single space) as missing values.
df['max_power'] = df['max_power'].replace(' ', np.nan)

# Remove the categorical columns that are not used further on.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
df.drop(columns=['seller_type', 'transmission', 'fuel'], inplace=True, errors='ignore')

# Drop every row that still has a missing value in any column.
df.dropna(inplace=True)

# Store max_power as float. Plain column assignment is used on purpose:
# `df.loc[:, 'max_power'] = ...` writes into the existing column in place and
# keeps its previous dtype on pandas >= 2.0, so the cast would have no effect.
df['max_power'] = df['max_power'].astype(float)

In [193]:
# Make sure max_power is stored with a float dtype.
df['max_power'] = df['max_power'].astype('float64')

In [194]:
# Change year -> age (in years, relative to the current calendar year).
# The column keeps the name 'year' because later cells select it by that name.
this_year = date.today().year

# Convert only the rows that still hold a calendar year. Without this guard a
# second run of the cell silently undoes the conversion, because
# this_year - (this_year - year) == year.
# Assumes every model year in the data is later than 1900 - TODO confirm.
holds_calendar_year = df['year'] > 1900
df.loc[holds_calendar_year, 'year'] = this_year - df.loc[holds_calendar_year, 'year']

#### 4. Encoding Categorical Data and Selecting Features


In [195]:
# Ordinal-encode the owner column: the categories below map, in order, to
# 1, 2, 3, 4 (the encoder's 0-based codes shifted by +1).
# Any value outside this list (e.g. 'Test Drive Car') is encoded as -2 by the
# encoder and therefore ends up as -1 after the shift.
categ = [['First Owner', 'Second Owner', 'Third Owner','Fourth & Above Owner']]

encoder = OrdinalEncoder(
    categories=categ,
    handle_unknown='use_encoded_value',
    unknown_value=-2,
)

# The encoder expects a 2-D input, hence the single-column reshape.
owner_values = df['owner'].to_numpy().reshape(-1, 1)
owner_codes = encoder.fit_transform(owner_values) + 1

# Flatten back to one dimension and store the codes as integers.
df['owner'] = owner_codes.ravel().astype(int)

In [203]:
# Pairwise Pearson correlation between every pair of columns in df.
df_corr = df.corr(method='pearson')

# Correlation of each column with the target, selling_price.
df_corr.loc[:, 'selling_price']


year                 -0.427672
selling_price         1.000000
km_driven            -0.161663
owner                -0.255417
mileage(km/ltr/kg)   -0.108264
engine                0.442896
max_power             0.692409
seats                 0.158134
Name: selling_price, dtype: float64

In [204]:
# Keep the three columns with the largest absolute correlation to the target
# in the correlation output above, together with the target itself.
model_columns = ['engine', 'max_power', 'year', 'selling_price']
df_temp = df[model_columns]

#### 5. Model Building and Training


In [209]:
# Separate the predictors from the target column.
target_column = 'selling_price'
X = df_temp.drop(columns=target_column)
y = df_temp[target_column]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [210]:
# Fit a linear regression on the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(x_train, y_train)

# Predicted selling prices for the held-out test rows.
predictions = model.predict(x_test)

#### 6. Model Evaluation

In [211]:
# Compute the evaluation metrics on the held-out test set.
mse = mean_squared_error(y_test, predictions)  # mean squared error
rmse = np.sqrt(mse)                            # root mean squared error, same units as the target
mae = mean_absolute_error(y_test, predictions)  # mean absolute error
r2 = r2_score(y_test, predictions)             # coefficient of determination