In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.neighbors import NearestNeighbors


In [2]:
file_path = "carr.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

In [5]:
df = df.dropna(subset=['price'])

# Encode categorical variables
label_encoders = {}
categorical_cols = ['car_name','brand', 'model', 'fuel_type', 'trans_type', 'body_type']
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [6]:
df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')

# Fill missing values with the median of numerical columns
df.fillna(df.select_dtypes(include=np.number).median(), inplace=True)

# Define features and target variable
X = df[['car_name', 'brand', 'model', 'fuel_type', 'trans_type', 'engine_power', 'body_type', 'mileage', 'no_of_cylinders', 'seats']]
y = df['price']


In [7]:
# import re

# # Function to extract numeric values from engine_power
# def extract_numeric(value):
#     if isinstance(value, str):
#         num = re.findall(r'\d+', value)  # Extract numeric part
#         return int(num[0]) if num else np.nan
#     return value

# # Apply function to engine_power column
# df['engine_power'] = df['engine_power'].apply(extract_numeric)

# # Ensure all features are numeric before scaling
# X = df[['car_namebrand', 'model', 'fuel_type', 'trans_type', 'engine_power', 
#         'body_type', 'mileage', 'no_of_cylinders', 'seats']]

# X = X.apply(pd.to_numeric, errors='coerce')

# # Fill any remaining NaN values with median
# X.fillna(X.median(), inplace=True)

# Apply StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)


In [9]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")


Mean Absolute Error: 723329.0781818181
R2 Score: 0.35734972511165786


In [10]:
nn_model = NearestNeighbors(n_neighbors=5, metric='euclidean')
nn_model.fit(X_scaled)

In [14]:
car_name_mapping = {
    1: "Toyota Corolla",
    2: "Honda Civic",
    # Add more mappings as needed
}

brand_mapping = {
    1: "Toyota",
    2: "Honda",
    # Add more mappings as needed
}

def recommend_car(input_car_index):
    distances, indices = nn_model.kneighbors([X_scaled[input_car_index]])
    print("Recommended Cars:")
    for i in indices[0][1:]:  # Exclude the input car itself
        car_info = df.iloc[i][['car_name', 'brand', 'model', 'price', 'mileage']]
        
        # If car_name and brand are numeric, map them to their actual names
        if isinstance(car_info['car_name'], (int, float)):
            car_info['car_name'] = car_name_mapping.get(car_info['car_name'], 'Unknown')
        if isinstance(car_info['brand'], (int, float)):
            car_info['brand'] = brand_mapping.get(car_info['brand'], 'Unknown')
        
        print(car_info)

# Example usage
recommend_car(0)

Recommended Cars:
car_name        43
brand           22
model           45
price       690000
mileage      19.33
Name: 1, dtype: object
car_name        43
brand           22
model           48
price       780000
mileage      19.33
Name: 4, dtype: object
car_name        47
brand           22
model           44
price       500000
mileage       20.9
Name: 33, dtype: object
car_name        43
brand           22
model           50
price       720000
mileage      19.33
Name: 2, dtype: object
