In [None]:
import pandas as pd

# Load the dataset (the path is resolved relative to the working directory)
file_path = 'dubai_cars_dataset.csv'
cars_data = pd.read_csv(file_path)

# Preview the first few rows
print(cars_data.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
cars_data.info()

In [None]:
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import train_test_split

# Drop the location columns that are not used as model features.
cars_data = cars_data.drop(
    columns=['address', 'country', 'city', 'area_name', 'location_name']
)

# Rows without a price carry no label to learn from or to score against, so
# drop them rather than letting the imputation below invent a median price.
# Resetting the index keeps the row labels contiguous after the drop.
cars_data = cars_data.dropna(subset=['price']).reset_index(drop=True)

# Impute the remaining gaps: median for numeric columns, most frequent value
# for everything else. The check is dtype-based (instead of comparing against
# 'object') so text columns are treated as categorical whichever string dtype
# pandas assigned to them.
# NOTE(review): these statistics (and the encoder below) are fitted on the
# full dataset, i.e. before the train/test split, so some test-set
# information reaches training.
for column in cars_data.columns:
    if pd.api.types.is_numeric_dtype(cars_data[column]):
        fill_value = cars_data[column].median()
    else:
        modes = cars_data[column].mode()
        if modes.empty:
            # Column is entirely missing: there is no most-frequent value.
            continue
        fill_value = modes.iloc[0]
    cars_data[column] = cars_data[column].fillna(fill_value)

# One-hot encode every non-numeric column.
categorical_data = cars_data.select_dtypes(exclude=['number', 'bool'])
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_columns = encoder.fit_transform(categorical_data).toarray()
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_data.columns),
    # Reuse the frame's own row labels so the column-wise concat below
    # lines the rows up instead of aligning on a fresh 0..n-1 index.
    index=cars_data.index,
)

# Swap the original categorical columns for their encoded counterparts.
cars_data = pd.concat(
    [cars_data.drop(columns=categorical_data.columns), encoded_df], axis=1
)

# Separate the features from the target variable.
X = cars_data.drop(columns='price')
y = cars_data['price']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the first training rows to verify the result.
X_train.head()

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build a seeded decision tree and train it on the training split.
# fit() returns the estimator itself, so construction and training can chain.
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Generate predictions for the held-out test rows.
y_pred = regressor.predict(X_test)


In [None]:
# Score the test-set predictions: mean squared error first, then R².
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

# Report both metrics rounded to two decimals.
print(f'Mean Squared Error: {mse:.2f}')
print(f'R² Score: {r2:.2f}')


In [None]:
# Adding cross validation

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
import numpy as np

from sklearn.model_selection import KFold

# Fresh, unfitted tree with a fixed seed; cross_val_score fits a clone of it
# on every fold.
regressor = DecisionTreeRegressor(random_state=42)

# 10-fold cross-validation over the full feature matrix X and target y.
# A plain integer `cv` gives contiguous, unshuffled folds for a regressor,
# which skews the scores whenever the rows are stored in a meaningful order.
# Shuffling with a fixed seed avoids that and keeps the folds reproducible.
cv_folds = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(regressor, X, y, cv=cv_folds, scoring='r2')

# Summarise the per-fold R² scores: mean and spread across folds.
average_r2 = np.mean(scores)
std_r2 = np.std(scores)

# Display the results
print("R² scores for each fold:", scores)
print("Average R² score:", average_r2)
print("Std of R² scores:", std_r2)
