In [None]:
import pandas as pd

# Load the dataset.
# The path is a raw string: a Windows path written as a normal literal contains
# backslash pairs (\P, \D, \d) that are invalid escape sequences, which Python
# reports as deprecated (a SyntaxWarning from Python 3.12 on). The resulting
# string value is unchanged.
file_path = r'C:\Projects_genAI\Datasets\dubai_cars_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(data.head())

# Show a summary of the dataframe.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
data.info()


In [None]:
## Data Processing

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Column groups, split by data type
numerical_features = [
    'kilometers', 'year', 'vehicle_age_years',
    'doors', 'seating_capacity', 'no_of_cylinders',
]
categorical_features = ['brand', 'fuel_type', 'transmission_type']

# Model inputs: the numerical columns followed by the categorical ones; target is the price
features = numerical_features + categorical_features
X = data[features]
y = data['price']

# Numerical columns: fill missing values with the column mean, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from raising on categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np

# Every pipeline below references the same `preprocessor` object, followed by its own regressor.

# Ordinary least squares baseline
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Ridge: alpha chosen by 5-fold CV from 13 log-spaced candidates between 1e-6 and 1e6
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=np.logspace(-6, 6, 13), cv=5)),
])

# Lasso and Elastic Net use a simplified search:
# three alpha candidates (1e-2, 1, 1e2), 2-fold CV, and at most 3000 iterations
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(alphas=np.logspace(-2, 2, 3), cv=2, max_iter=3000)),
])

elastic_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ElasticNetCV(alphas=np.logspace(-2, 2, 3), l1_ratio=[0.5, 0.9], cv=2, max_iter=3000)),
])


# Candidate models, keyed by the label used when reporting results
models = {
    'Linear Regression': lr_pipeline,
    'Ridge': ridge_pipeline,
    'Lasso': lasso_pipeline,
    'ElasticNet': elastic_pipeline,
}

# Train each model on the training split and record its R-squared on the held-out split
r2_scores = {}
for name, model in models.items():
    # fit() returns the fitted pipeline, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    r2_scores[name] = r2_score(y_test, y_pred)

# Bar chart of the held-out R-squared per model
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(r2_scores.keys()), list(r2_scores.values()))
ax.set_xlabel('Model')
ax.set_ylabel('R-squared Score')
ax.set_title('Comparison of Regression Models')
plt.show()

# Raw scores for reference
print(r2_scores)
