In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Step 2: Load Dataset
# Read the raw car listings from the CSV file (path is relative to the notebook).
data = pd.read_csv('../data/car_data.csv')

# Preview the first 5 rows
print(data.head())

# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
data.info()

# Number of missing values per column
print(data.isnull().sum())

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
# Summary statistics for the numeric columns
print(data.describe())

# Correlation heatmap.
# Correlations are computed on the numeric columns only: at this point the
# frame has not been cleaned yet (Car_Name is dropped and Fuel_Type /
# Seller_Type / Transmission are encoded in Step 4), and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

# Distribution of car prices
sns.histplot(data['Selling_Price'], kde=True, bins=30)
plt.title("Distribution of Selling Price")
plt.show()

# Relationship between Selling Price and Year
sns.scatterplot(x='Year', y='Selling_Price', data=data)
plt.title("Selling Price vs Year")
plt.show()

# Fuel Type distribution.
# The column is passed by keyword: the first positional parameter of
# countplot is `data` in seaborn >= 0.12, so a positional Series is not
# interpreted as the x variable there.
sns.countplot(x='Fuel_Type', data=data)
plt.title("Fuel Type Distribution")
plt.show()

In [None]:
# Step 4: Data Cleaning
# Drop columns that are not used as model inputs (e.g., 'Car_Name')
data = data.drop(columns=['Car_Name'])

# Encode categorical variables (Fuel_Type, Seller_Type, Transmission) as
# integer codes. Each column gets its own fitted LabelEncoder, kept in
# `encoders`, so the code -> label mapping of every column stays available
# (encoders[col].classes_ / .inverse_transform). Refitting a single shared
# encoder would keep only the mapping of the last column.
# NOTE: LabelEncoder assigns codes in sorted label order; that order carries
# no meaning for these nominal categories.
categorical_cols = ['Fuel_Type', 'Seller_Type', 'Transmission']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# Backward-compatible alias: `encoder` previously ended up fitted on the
# last encoded column (Transmission).
encoder = encoders['Transmission']

In [None]:
# Step 5: Feature Engineering
# Swap the model year for the car's age, measured against 2024.
car_age = 2024 - data['Year']
data = data.drop(columns=['Year']).assign(Car_Age=car_age)

In [None]:
# Step 6: Data Splitting
# Target (y) is the selling price; every remaining column is a feature (X).
target_col = 'Selling_Price'
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 7: Model Building
def _rmse_and_r2(y_true, y_pred):
    """Return the pair (RMSE, R2 score) of predictions against the true targets."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, r2_score(y_true, y_pred)


# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression, trained and evaluated on the scaled features
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_predictions = lr_model.predict(X_test_scaled)
lr_rmse, lr_r2 = _rmse_and_r2(y_test, lr_predictions)
print(f"Linear Regression - RMSE: {lr_rmse:.2f}, R2 Score: {lr_r2:.2f}")

# Random Forest Regressor, trained and evaluated on the unscaled features
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_rmse, rf_r2 = _rmse_and_r2(y_test, rf_predictions)
print(f"Random Forest - RMSE: {rf_rmse:.2f}, R2 Score: {rf_r2:.2f}")

In [None]:
# Step 8: Conclusions
# The summary is derived from the metrics computed in Step 7 (lr_rmse, lr_r2,
# rf_rmse, rf_r2) rather than hard-coded, so the printed conclusion cannot
# contradict the numbers when the data or the models change.
print("\nConclusions:")
print(f"- Linear Regression: RMSE {lr_rmse:.2f}, R2 Score {lr_r2:.2f}.")
print(f"- Random Forest: RMSE {rf_rmse:.2f}, R2 Score {rf_r2:.2f}.")
if rf_rmse < lr_rmse:
    print("- Random Forest outperforms Linear Regression, suggesting it captures non-linear relationships better.")
elif rf_rmse > lr_rmse:
    print("- Linear Regression outperforms Random Forest on this test split, so the extra model complexity does not pay off here.")
else:
    print("- Both models reach the same RMSE on this test split.")