# In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures, KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

# Load the dataset
file_path = 'dubai_cars_dataset.csv'
cars_data = pd.read_csv(file_path)

# Display the first few rows and the column/dtype summary.
print(cars_data.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
cars_data.info()

# Drop the address/location columns before any feature engineering
cars_data.drop(['address', 'country', 'city', 'area_name', 'location_name'], axis=1, inplace=True)

# Fill missing values: median for numeric columns, mode for everything else.
for column in cars_data.columns:
    series = cars_data[column]
    if not series.isna().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(series):
        # Using median for numerical data
        fill_value = series.median()
    else:
        # Using mode for categorical data. mode() is empty when every entry is
        # missing; there is then no observed value to impute with, so the
        # column is left untouched instead of failing on the [0] lookup.
        modes = series.mode()
        if modes.empty:
            continue
        fill_value = modes.iloc[0]

    cars_data[column] = series.fillna(fill_value)

# Matches one unsigned integer or decimal number, e.g. "400" or "150.5".
_HP_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')


def convert_hp_range(hp_string):
    """Convert a horsepower label into a single numeric value.

    Args:
        hp_string: Raw cell value. Text such as ``"400 - 500 HP"``,
            ``"900+ HP"`` or ``"Less than 150 HP"`` is supported; plain
            numbers and missing values are accepted as well.

    Returns:
        float: The midpoint of the first two numbers when the text contains a
        ``-`` range separator and at least two numbers, otherwise the first
        number found. ``np.nan`` when no number can be extracted (for example
        ``"Unknown"``, ``None`` or NaN).
    """
    # Already numeric (NaN included): nothing to parse.
    if isinstance(hp_string, (int, float, np.integer, np.floating)):
        return float(hp_string)
    if not isinstance(hp_string, str):
        return np.nan

    # Drop thousands separators so "1,200 HP" is read as 1200, not 1 and 200.
    # The regex keeps the decimal point, so "150.5" stays 150.5.
    numbers = [float(token) for token in _HP_NUMBER_RE.findall(hp_string.replace(',', ''))]
    if not numbers:
        return np.nan

    if '-' in hp_string and len(numbers) >= 2:
        low, high = numbers[:2]
        return (low + high) / 2
    return numbers[0]

# Normalise every 'horsepower' entry through convert_hp_range
cars_data['horsepower'] = cars_data['horsepower'].map(convert_hp_range)

# Assuming 'engine_capacity_cc' might also have similar issues, apply a generic cleaning
# Matches one unsigned integer or decimal number, e.g. "3000" or "1.6".
_NUMERIC_TOKEN_RE = re.compile(r'\d+(?:\.\d+)?')


def clean_numeric(column_value):
    """Extract the first number from a raw cell value.

    Args:
        column_value: Raw cell value. Numbers are passed through as floats;
            anything else is converted with ``str()`` and searched for the
            first integer or decimal number.

    Returns:
        float: The value itself when it is already numeric, otherwise the
        first number found in its text (for a range such as
        ``"3000 - 3499 cc"`` that is the lower bound). ``np.nan`` when the
        text contains no number.
    """
    # Numeric input must not be round-tripped through str(): stripping
    # non-digits from "2000.0" would yield 20000.
    if isinstance(column_value, (int, float, np.integer, np.floating)):
        return float(column_value)

    # Drop thousands separators so "2,000 cc" is read as 2000.
    match = _NUMERIC_TOKEN_RE.search(str(column_value).replace(',', ''))
    return float(match.group()) if match else np.nan

# Normalise every 'engine_capacity_cc' entry through clean_numeric
cars_data['engine_capacity_cc'] = cars_data['engine_capacity_cc'].map(clean_numeric)

# Cleaning can leave NaNs behind; replace them with each column's median
numeric_spec_columns = ['horsepower', 'engine_capacity_cc']
for spec_column in numeric_spec_columns:
    spec_median = cars_data[spec_column].median()
    cars_data[spec_column] = cars_data[spec_column].fillna(spec_median)

# Quick look at the cleaned columns before moving on
print(cars_data[numeric_spec_columns].head())

# Polynomial features for horsepower and engine capacity
poly = PolynomialFeatures(degree=2, include_bias=False)
important_features = cars_data[['horsepower', 'engine_capacity_cc']]  # assuming conversion to numeric already done
poly_features = poly.fit_transform(important_features)
# Reuse cars_data's index so the axis=1 concat below aligns row-for-row even
# when the frame does not carry a default 0..n-1 index.
poly_features_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(['horsepower', 'engine_capacity_cc']),
    index=cars_data.index,
)

# Add polynomial features to the dataframe. The degree-2 expansion also
# returns the unchanged degree-1 inputs, which already exist in cars_data, so
# only the new terms are appended to avoid duplicate column labels.
new_poly_terms = poly_features_df.drop(columns=important_features.columns)
cars_data = pd.concat([cars_data, new_poly_terms], axis=1)

# Binning 'year' and 'kilometers'
binning = KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile')
binned_columns = cars_data[['year', 'kilometers']]
binned_features = binning.fit_transform(binned_columns)
# The fitted discretizer can keep fewer than the 5 requested bins for a
# feature (quantile bins that are too narrow are removed), so the one-hot
# column names are built from the fitted per-feature counts in n_bins_
# instead of assuming 5 each.
binned_feature_names = [
    f'{column}_bin{bin_index}'
    for column, bin_count in zip(binned_columns.columns, binning.n_bins_)
    for bin_index in range(bin_count)
]
# Reuse cars_data's index so the axis=1 concat below aligns row-for-row.
binned_features_df = pd.DataFrame(
    binned_features,
    columns=binned_feature_names,
    index=cars_data.index,
)

# Add binned features to the dataframe
cars_data = pd.concat([cars_data, binned_features_df], axis=1)

# Encoder that ignores categories it has not seen during fitting
encoder = OneHotEncoder(handle_unknown='ignore')

# Columns stored with the object dtype are treated as categorical
categorical_data = cars_data.select_dtypes(include=['object'])

# Learn the categories, then expand them into a dense indicator matrix
encoder.fit(categorical_data)
encoded_columns = encoder.transform(categorical_data).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_data.columns)
encoded_df = pd.DataFrame(encoded_columns, columns=encoded_feature_names)

# Swap the original categorical columns for their encoded counterparts
cars_data = pd.concat(
    [cars_data.drop(columns=categorical_data.columns), encoded_df],
    axis=1,
)

# Separate the 'price' target from the predictors and hold out 20% of the rows
y = cars_data['price']
X = cars_data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Initialize the Decision Tree Regressor
#regressor = DecisionTreeRegressor(random_state=42)

# Re-fit the model with new features

#regressor.fit(X_train, y_train)
#y_pred = regressor.predict(X_test)

# Evaluate the model again
#mse = mean_squared_error(y_test, y_pred)
#r2 = r2_score(y_test, y_pred)

#print(f'Mean Squared Error: {mse:.2f}')
#print(f'R² Score: {r2:.2f}')

# Adding cross validation

from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Initialize the Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=42)

# X (features) and y ('price') must already be defined at this point.

# Perform 10-fold cross-validation. A bare cv=10 gives a regressor
# consecutive, unshuffled folds, so the scores would depend on the row order
# of the CSV. Shuffling with a fixed seed keeps the folds representative and
# reproducible, in line with the seeded train_test_split used in this script.
cv_splitter = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(regressor, X, y, cv=cv_splitter, scoring='r2')

# Calculate the average R² score across all folds
average_r2 = np.mean(scores)

# Display the results
print("R² scores for each fold:", scores)
print("Average R² score:", average_r2)
