#### 1. Importing Libraries


In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd
from datetime import date

#### 2. Loading and Exploring the Data

In [None]:
# Read the car data from the CSV file into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='cardekho.csv')

In [None]:
# Summary statistics of the DataFrame.
# NOTE(review): if this runs as a single notebook cell, only the final
# expression below is echoed -- confirm whether describe() should be its own cell.
df.describe()
# Preview the first five rows.
df.head(n=5)

In [None]:
# Frequency tables for the discrete columns, most frequent value first.
df['year'].value_counts(ascending=False)
df['fuel'].value_counts()
df['seller_type'].value_counts()  # there are 3 types of seller_type
df['transmission'].value_counts()  # there are 2 types of transmission

# Distribution summary of the mileage column, then seat-count frequencies.
df['mileage(km/ltr/kg)'].describe()
df['seats'].value_counts()

df['owner'].value_counts()  # there are 5 types of owners

In [None]:
# Shape of the subset of rows containing at least one missing value.
# Author's observation: 221 rows have null values (about 3% of the total data).
df.loc[df.isnull().any(axis='columns')].shape

# Number of rows that repeat an earlier row.
# Author's observation: 1202 duplicated rows.
df.duplicated(keep='first').sum()

#### 3. Data Cleaning and Preparation

In [None]:
# Remove the car-name column from the DataFrame in place.
df.drop('name', axis='columns', inplace=True)

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Discard, in place, every row that has at least one missing value.
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Turn the 'year' column into an age: current calendar year minus the stored
# year. The column keeps its original name 'year'.
this_year = date.today().year
df['year'] = this_year - df['year']

#### 4. Scaling or Normalizing Data


In [None]:
# Encoding the owner column as an ordered numeric feature
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

# Owner labels in the order they should be ranked; OrdinalEncoder assigns each
# label its 0-based position in this list.
categ = [['First Owner', 'Second Owner', 'Third Owner','Fourth & Above Owner']]

# Any owner label not listed in `categ` is encoded as -2 instead of raising.
encoder = OrdinalEncoder(categories=categ,handle_unknown='use_encoded_value',unknown_value=-2)

# Shift the codes by one so listed labels map to 1.0 .. 4.0 (unlisted labels
# therefore end up as -1.0).
# The column is replaced outright instead of written through
# `df.loc[:, 'owner']`: on pandas >= 2.0 a `.loc[:, col]` assignment fills the
# existing object-dtype column in place, leaving the codes stored as Python
# objects so the column is not treated as numeric (e.g. by
# `corr(numeric_only=True)`). `.ravel()` flattens the (n, 1) encoder output
# into the 1-D shape a single column expects.
df['owner'] = encoder.fit_transform(df['owner'].to_numpy().reshape(-1, 1)).ravel() + 1

#### 5. Model Building and Training


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix

In [None]:
# Feature matrix: every column except the target.
# 'fuel', 'seller_type' and 'transmission' are categorical and have not been
# encoded earlier; LinearRegression needs numeric input, so they are one-hot
# encoded here. drop_first=True drops one level per feature to avoid columns
# that are perfectly collinear with the intercept.
# NOTE(review): assumes these three columns hold string labels -- confirm
# against the CSV.
X = pd.get_dummies(
    df.drop(columns='selling_price'),
    columns=['fuel', 'seller_type', 'transmission'],
    drop_first=True,
)
# Target: the selling price.
y = df['selling_price']

# Hold out 25% of the rows for testing; a fixed random_state makes the split
# (and therefore every downstream metric) reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Pairwise correlation matrix of the numeric columns.
# `df.corr` without parentheses only binds the method object, which cannot be
# displayed as a table or plotted; it has to be called. numeric_only=True skips
# any remaining non-numeric columns instead of raising on them.
df_corr = df.corr(numeric_only=True)

In [None]:
# Echo whatever is currently bound to df_corr.
# NOTE(review): relies on notebook behaviour of displaying a cell's last
# expression; as a plain script this line has no visible effect.
df_corr

In [None]:
import seaborn as sns

# Render the contents of df_corr as a colour-coded grid.
sns.heatmap(data=df_corr)

In [None]:
# Create a linear regression estimator with default settings.
model = LinearRegression()

# Fit it on the training split produced by train_test_split.
model.fit(X=x_train, y=y_train)

#### 6. Model Evaluation