In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Reading the file

In [None]:
# Load the vehicle dataset from the notebook's input directory.
csv_path = "../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

### Checking for null values

In [None]:
# Number of missing values per column (`isna` is the same method as `isnull`).
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

#### Creating a new column for the car's age: the current year minus the year the car was bought

In [None]:
# Fixed reference year against which each car's age is measured.
CURRENT_YEAR = 2022
df["Current Year"] = CURRENT_YEAR
df.head(n=5)

In [None]:
# Age of the car in years: reference year minus the value in the `year` column.
df["num_year"] = df["Current Year"].sub(df["year"])
df.head(n=5)

In [None]:
# `num_year` now carries the age information, so the two columns it was
# derived from are removed (in place, in the same order as before).
for obsolete_column in ("year", "Current Year"):
    df.drop(columns=[obsolete_column], inplace=True)
df.head(n=5)

#### Some basic EDA

In [None]:
# How many rows fall into each fuel category.
fuel_column = df["fuel"]
fuel_column.value_counts()

In [None]:
# How many rows fall into each seller-type category.
seller_type_column = df["seller_type"]
seller_type_column.value_counts()

In [None]:
# How many rows fall into each transmission category.
transmission_column = df["transmission"]
transmission_column.value_counts()

In [None]:
# How many rows fall into each ownership category.
owner_column = df["owner"]
owner_column.value_counts()

In [None]:
# Bar chart of the number of rows per fuel category.
sns.countplot(data=df, x="fuel")

In [None]:
# Bar chart of the number of rows per seller-type category.
sns.countplot(data=df, x="seller_type")

In [None]:
# Bar chart of the number of rows per transmission category.
sns.countplot(data=df, x="transmission")

In [None]:
# Bar chart of the number of rows per ownership category.
sns.countplot(data=df, x="owner")

#### Converting categorical values to numeric values

In [None]:
# Integer code assigned to every known category of each categorical column.
category_codes = {
    'fuel': {'Petrol': 0, 'Diesel': 1, 'CNG': 2, 'LPG': 3, 'Electric': 4},
    'seller_type': {'Dealer': 0, 'Individual': 1, 'Trustmark Dealer': 2},
    'transmission': {'Manual': 0, 'Automatic': 1},
    'owner': {'First Owner': 0, 'Second Owner': 1, 'Third Owner': 2,
              'Fourth & Above Owner': 3, 'Test Drive Car': 4},
}

# `Series.map` is used instead of `DataFrame.replace(..., inplace=True)`:
#   * it always yields a real integer column rather than depending on
#     replace()'s implicit object-to-int downcast, which newer pandas
#     releases warn about;
#   * a value missing from the mapping is reported here instead of silently
#     staying in the column as a string and breaking the model fit later.
for column, codes in category_codes.items():
    if pd.api.types.is_numeric_dtype(df[column]):
        # Column was already encoded (e.g. the cell was re-run): leave it alone.
        continue
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in column '{column}': {list(unmapped)}")
    df[column] = encoded.astype("int64")

In [None]:
# First five rows after the categorical columns were converted to integer codes.
df.head(n=5)

In [None]:
# Last five rows, as a second check on the encoded frame.
df.tail(n=5)

In [None]:
# 2x2 grid: selling_price against four candidate predictors.
fig, axes = plt.subplots(2, 2, figsize = (16,10))
fig.suptitle('Visualization of Numerical columns')

# num_year and km_driven: raw scatter plus fitted regression line.
sns.regplot(x = 'num_year', y = 'selling_price', data = df, ax = axes[0,0])
sns.regplot(x = 'km_driven', y = 'selling_price', data = df, ax = axes[1,0])

# seller_type and owner hold a handful of integer category codes, so a raw
# scatter collapses into vertical stripes. `x_estimator=np.mean` plots the mean
# selling price (with its confidence interval) for each code instead, which is
# what the per-category comparison below the chart relies on.
sns.regplot(x = 'seller_type', y = 'selling_price', data = df,
            x_estimator = np.mean, ax = axes[0,1])
sns.regplot(x = 'owner', y = 'selling_price', data = df,
            x_estimator = np.mean, ax = axes[1,1])

# Render the figure explicitly so the cell does not echo the last Axes repr.
plt.show()

### Analysis from the above charts
##### Year: Selling price is decreasing with increasing number of years
##### Seller: Dealer gets higher price than Individuals or Trustmark dealers
##### Km_driven: Selling price is decreasing with increasing Kms driven by the car
##### Owner: First owner cars fetch higher prices than others

In [None]:
# The frame still contains the text column `name`, and correlation is only
# defined for numbers. `DataFrame.corr()` on a frame with string columns raises
# on pandas >= 2.0, so the numeric columns are selected explicitly first
# (this form also works on older pandas versions).
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize = (16,7))
sns.heatmap(numeric_df.corr(), annot = True)
plt.title('Correlation between the columns in the dataframe')
plt.show()

## Importing and applying Linear Regression

In [None]:
# Feature matrix: every column except the free-text car name and the target.
#
# fuel, seller_type and owner hold integer codes for nominal categories; the
# code order carries no meaning (e.g. 'Test Drive Car' is 4, after
# 'Fourth & Above Owner'). Feeding those codes to a linear model would force a
# single slope across unrelated categories, so they are one-hot encoded.
# `drop_first=True` removes one level per feature to avoid perfect collinearity
# with the model's intercept. transmission is binary and is left as 0/1.
X = pd.get_dummies(
    df.drop(columns=['name', 'selling_price']),
    columns=['fuel', 'seller_type', 'owner'],
    drop_first=True,
    dtype=float,
)

# Target: the selling price to be predicted.
Y = df['selling_price']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=2,
)

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lin_reg_model = LinearRegression()

In [None]:
# Fit the regression on the training portion only.
lin_reg_model.fit(X=X_train, y=Y_train)

In [None]:
# Predictions on the rows the model was fitted on (in-sample predictions).
training_data_prediction = lin_reg_model.predict(X=X_train)

In [None]:
# r2_score is the coefficient of determination: a goodness-of-fit score where
# 1.0 is a perfect fit, not an error measure. The printed label says so and
# names the split, so this output cannot be confused with the test-set score.
# The variable name `error_score` is kept unchanged for compatibility.
error_score = metrics.r2_score(Y_train, training_data_prediction)
print("R squared score (train) : ", error_score)

In [None]:
# Actual vs. predicted selling price on the training rows.
plt.scatter(x=Y_train, y=training_data_prediction)
plt.gca().set(
    xlabel="Actual Price",
    ylabel="Predicted Price",
    title=" Actual Prices vs Predicted Prices",
)
plt.show()

In [None]:
# Predictions on the held-out rows the model has not seen during fitting.
test_data_prediction = lin_reg_model.predict(X=X_test)

In [None]:
# r2_score is the coefficient of determination: a goodness-of-fit score where
# 1.0 is a perfect fit, not an error measure. The printed label says so and
# names the split, so this output cannot be confused with the training score.
# The variable name `error_score` is kept unchanged for compatibility.
error_score = metrics.r2_score(Y_test, test_data_prediction)
print("R squared score (test) : ", error_score)

In [None]:
# Actual vs. predicted selling price on the held-out test rows.
plt.scatter(x=Y_test, y=test_data_prediction)
plt.gca().set(
    xlabel="Actual Price",
    ylabel="Predicted Price",
    title=" Actual Prices vs Predicted Prices",
)
plt.show()