## CODEALPHA PROJECT

### CAR PRICE PREDICTION

The price of a car depends on many factors, such as the goodwill of its brand, its features, its horsepower, the mileage it gives, and more. Car price prediction is one of the major research areas in machine learning.

In [4]:
# Import our libraries

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from ydata_profiling import ProfileReport

import warnings

# Ignore every warning for the rest of the session to keep the cell outputs clean
warnings.simplefilter('ignore')

#### DATA EXPLORATION

In [6]:
# Import our dataset.
# The CSV location can be overridden through the CAR_DATA_CSV environment variable,
# so the notebook also runs on machines without the author's directory layout;
# when the variable is unset, the original path is used.

filepath = os.environ.get(
    'CAR_DATA_CSV',
    '/Users/imetal/Documents/Me/e-Learning/Projects/CodeAlpha/car data.csv',
)

df = pd.read_csv(filepath)

In [7]:
# Preview the first five rows to sanity-check the load

df.head(n = 5)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [8]:
# Determine the size of our data: df.shape is a (rows, columns) tuple

df.shape

(301, 9)

In [9]:
# Determine the data type and non-null count of every column
# (df.info() prints the summary itself)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [10]:
# Count of unique values per column; columns with only a few distinct values
# are candidates for categorical encoding

df.nunique()

Car_Name          98
Year              16
Selling_Price    156
Present_Price    148
Driven_kms       206
Fuel_Type          3
Selling_type       2
Transmission       2
Owner              3
dtype: int64

In [11]:
# Number of fully duplicated rows (every occurrence after the first one is flagged)

df.duplicated(keep = 'first').sum()

2

In [12]:
# Remove repeated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not reset).

df = df.drop_duplicates(keep = 'first')

df.shape

(299, 9)

In [13]:
# Summary statistics of numeric columns (count, mean, std, min, quartiles, max)

df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,299.0,299.0,299.0,299.0,299.0
mean,2013.615385,4.589632,7.541037,36916.752508,0.043478
std,2.896868,4.98424,8.566332,39015.170352,0.24872
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.85,1.2,15000.0,0.0
50%,2014.0,3.51,6.1,32000.0,0.0
75%,2016.0,6.0,9.84,48883.5,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [14]:
# Generate a profiling report (ydata_profiling) to uncover deeper insights into the dataset.
# The report is built from df and displayed through to_notebook_iframe().

profile = ProfileReport(df, title = 'Summary of Car Data')
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

#### DATA ENCODING

In [16]:
# Converting categorical columns to numeric representations using LabelEncoder

# Initialize the LabelEncoder (once fitted, it maps each distinct label to an integer code)

label_encoder = LabelEncoder()

In [17]:
# Apply label encoding to each categorical column.
# A fresh LabelEncoder is fitted per column and stored in `label_encoders`, so the
# label <-> code mapping of every column stays available (e.g. for encoding new
# samples or for inverse_transform). Refitting one shared encoder would keep only
# the mapping of the last column.
# NOTE(review): integer codes impose an arbitrary order on nominal features such as
# Car_Name; consider one-hot encoding for a linear model.

categorical_columns = ['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']
label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

print('\nDataFrame after Label Encoding:')
print(df.head())


DataFrame after Label Encoding:
   Car_Name  Year  Selling_Price  Present_Price  Driven_kms  Fuel_Type  \
0        90  2014           3.35           5.59       27000          2   
1        93  2013           4.75           9.54       43000          1   
2        68  2017           7.25           9.85        6900          2   
3        96  2011           2.85           4.15        5200          2   
4        92  2014           4.60           6.87       42450          1   

   Selling_type  Transmission  Owner  
0             0             1      0  
1             0             1      0  
2             0             1      0  
3             0             1      0  
4             0             1      0  


#### MODEL PREPARATION

In [19]:
# Choose the predictor columns and the column to predict.
# NOTE(review): 'Owner' is not among the features, and Selling_Price is used to
# predict Present_Price — confirm this is the intended setup.

feature_columns = ['Car_Name', 'Year', 'Selling_Price', 'Driven_kms', 'Fuel_Type', 'Selling_type', 'Transmission']
target_columns = ['Present_Price']

X = df[feature_columns]    # feature matrix
y = df[target_columns]     # target, kept as a single-column DataFrame


In [20]:
# Standardise the features with StandardScaler.
# The scaler is fitted on every row of X, i.e. before any train/test split.

scaler = StandardScaler()       # scaler instance; reused later to transform new samples
scaler.fit(X)
X_scaled = scaler.transform(X)

X_scaled

array([[ 1.07634401,  0.13299198, -0.24912732, ...,  0.49184668,
        -0.74109567,  0.38729833],
       [ 1.1936136 , -0.21278716,  0.03222894, ..., -1.88012358,
        -0.74109567,  0.38729833],
       [ 0.21636698,  1.17032939,  0.53465083, ...,  0.49184668,
        -0.74109567,  0.38729833],
       ...,
       [ 0.25545684, -1.59590371, -0.24912732, ...,  0.49184668,
        -0.74109567,  0.38729833],
       [ 0.25545684,  1.17032939,  1.38876804, ..., -1.88012358,
        -0.74109567,  0.38729833],
       [ 0.13818725,  0.82455025,  0.14276175, ...,  0.49184668,
        -0.74109567,  0.38729833]])

In [21]:
# Convert X_scaled from a NumPy array to a DataFrame.
# Column names and row labels are taken from X: once duplicate rows have been dropped,
# the index of X (and y) is no longer guaranteed to be 0..n-1, so a default index here
# would not match the labels of y and label-aligned operations (join, concat,
# arithmetic) could silently misalign rows.

cols = list(X.columns)
X_scaled = pd.DataFrame(X_scaled, columns = cols, index = X.index)

X_scaled.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Driven_kms,Fuel_Type,Selling_type,Transmission
0,1.076344,0.132992,-0.249127,-0.254603,0.491847,-0.741096,0.387298
1,1.193614,-0.212787,0.032229,0.156181,-1.880124,-0.741096,0.387298
2,0.216367,1.170329,0.534651,-0.770651,0.491847,-0.741096,0.387298
3,1.310883,-0.904345,-0.349612,-0.814297,0.491847,-0.741096,0.387298
4,1.154524,0.132992,0.002084,0.142061,-1.880124,-0.741096,0.387298


#### TRAINING A MACHINE LEARNING MODEL

In [23]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible

splits = train_test_split(X_scaled, y, test_size = 0.2, random_state = 42)
X_train, X_test, y_train, y_test = splits

In [24]:
# Creating our linear regression model (scikit-learn LinearRegression with default settings)

model = LinearRegression()

In [25]:
# Training our model with the training dataset (X_train features, y_train target)

model.fit(X_train, y_train)

In [26]:
# Score the fitted model on the held-out test set

y_pred = model.predict(X_test)

# Both metrics take (true values, predictions), so compute them in one pass
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 22.113757650307623
R2 Score: 0.5952509677071003


#### MAKING PREDICTIONS

In [28]:
# Prediction 1

# Example of new data to predict, given as a one-row DataFrame whose columns match
# the training features, so that the scaler and the model receive named features in
# the order they were fitted on instead of relying on bare positional order.
# NOTE(review): Selling_type is passed as 2, but the encoded column only has the
# codes 0 and 1 (two distinct values) — confirm the intended category.
new_features = pd.DataFrame([[70, 2017, 7.8, 48000, 2, 2, 1]], columns = X.columns)

# scaler.transform returns a plain array, so the column names are re-attached
new_features_scaled = pd.DataFrame(scaler.transform(new_features), columns = X.columns)
predicted_price = model.predict(new_features_scaled)

predicted_value = predicted_price.item()  # Extract the single numerical value

print(f"Predicted Price: {predicted_value:.2f}")

Predicted Price: 13.26
