In [23]:
# Updated recommendation_engine.ipynb
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import joblib

def _match_dataset_spelling(df, column, answer):
    """Return the value of ``df[column]`` equal to *answer* ignoring case.

    The answer is stripped of surrounding whitespace and compared with
    ``str.casefold`` against every distinct non-null value of the column, so
    "toyota" resolves to whatever spelling the dataset stores (for example
    "TOYOTA").  When the column is absent from ``df`` or nothing matches, the
    stripped answer is returned title-cased.
    """
    cleaned = answer.strip()
    if column in df.columns:
        wanted = cleaned.casefold()
        for known in df[column].dropna().unique():
            if str(known).strip().casefold() == wanted:
                return known
    return cleaned.title()


def get_user_preference(df):
    """Interactively ask for the desired car attributes and return them.

    Prints up to five example companies and models taken from ``df``, then
    reads eight answers from standard input, in this order: company, model,
    mileage, manufacture year, fuel type, transmission, body style, warranty.

    Text answers are aligned with the spelling used in the matching column of
    ``df`` (case-insensitive comparison).  Title-casing them unconditionally
    would turn "toyota" into "Toyota", which never equals a category stored
    as "TOYOTA", so the answer could not match any row of the dataset.

    Args:
        df: DataFrame of cars.  Must contain ``company`` and ``model``
            columns (used for the example listing); ``fueltype``,
            ``transmissiontype`` and ``bodystyle`` are used for spelling
            alignment when present.

    Returns:
        dict with keys ``company``, ``model``, ``kilometer`` (float),
        ``modelyear`` (int), ``fueltype``, ``transmissiontype``,
        ``bodystyle`` and ``warranty`` (bool, True only for the answer "yes"
        in any letter case).

    Raises:
        ValueError: if the mileage or year answer is not a valid number.
    """
    print("\nPlease enter your preferences:")
    print(f"Available companies: {df['company'].unique()[:5]}...")
    print(f"Example models: {df['model'].unique()[:5]}...")

    # A dict display evaluates its values top to bottom, so the prompts are
    # shown in exactly the order written here.
    return {
        'company': _match_dataset_spelling(
            df, 'company', input("Company (e.g., Toyota): ")),
        'model': _match_dataset_spelling(
            df, 'model', input("Model (e.g., Camry): ")),
        'kilometer': float(input("Mileage (e.g., 15000): ")),
        'modelyear': int(input("Manufacture Year (e.g., 2018): ")),
        'fueltype': _match_dataset_spelling(
            df, 'fueltype', input("Fuel Type (Petrol/Diesel): ")),
        'transmissiontype': _match_dataset_spelling(
            df, 'transmissiontype', input("Transmission (Automatic/Manual): ")),
        'bodystyle': _match_dataset_spelling(
            df, 'bodystyle', input("Body Style (Sedan/SUV): ")),
        'warranty': input("Warranty (Yes/No): ").strip().lower() == 'yes',
    }

# Load artifacts.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model = joblib.load('trained_model.joblib')
# The fitted feature transformer is reused on its own; the estimator step of
# the pipeline is not used for recommendations.
preprocessor = model.named_steps['preprocessor']
df = pd.read_csv('preprocess_used_cars.csv')
# Columns handed to the preprocessor, in this order.
# NOTE(review): presumably the same columns the pipeline was trained on --
# confirm against the training notebook.
original_features = ['company', 'model', 'kilometer', 'modelyear',
                     'fueltype', 'transmissiontype', 'car_age',
                     'warranty', 'bodystyle']

# Collect the user's preferences into a one-row frame (no validation is
# performed here beyond the type conversions below).
user_input = get_user_preference(df)
user_df = pd.DataFrame([user_input])
# car_age is derived rather than asked for: current calendar year minus the
# manufacture year.
user_df['car_age'] = pd.Timestamp.now().year - user_df['modelyear']

# Ensure proper data types
user_df = user_df.astype({
    'kilometer': float,
    'modelyear': int,
    'warranty': bool
})

# Align features: put the columns in the order listed in original_features;
# any listed column that is missing would be filled with 0.
user_df = user_df.reindex(columns=original_features, fill_value=0)

# Transform features.
# The transformer output is passed on as-is: cosine_similarity accepts both
# sparse matrices and dense arrays, whereas calling .toarray() would fail on
# a dense ndarray and needlessly densify the full dataset matrix otherwise.
user_transformed = preprocessor.transform(user_df)
cars_transformed = preprocessor.transform(df[original_features])

# Calculate similarity: one row of scores (user vs. every car), flattened to
# a 1-D array aligned with the rows of df.
similarity_scores = cosine_similarity(user_transformed, cars_transformed).flatten()
df['similarity'] = similarity_scores

# Dynamic thresholding: keep cars at or above the 95th percentile of all
# scores, then show at most the five best of those.
threshold = np.percentile(similarity_scores, 95)  # Top 5% matches
top_cars = df[df['similarity'] >= threshold].sort_values('similarity', ascending=False).head(5)

display_columns = ['company', 'model', 'modelyear', 'kilometer', 'similarity']

print("\nTop Recommendations:")
if not top_cars.empty:
    print(top_cars[display_columns])
else:
    # Fallback kept for safety; the best-scoring car always satisfies
    # `similarity >= threshold`, so this is not expected to run.
    print("No exact matches found. Showing closest alternatives:")
    print(df.sort_values('similarity', ascending=False).head(5)[display_columns])

# Debug info
print("\nDebug Information:")
print(f"Similarity score range: {similarity_scores.min():.2f} - {similarity_scores.max():.2f}")
print(f"95th percentile threshold: {threshold:.2f}")
print(f"User input features:\n{user_df.iloc[0]}")


Please enter your preferences:
Available companies: ['MARUTI SUZUKI' 'HYUNDAI' 'TATA' 'FORD' 'MERCEDES BENZ']...
Example models: ['CELERIO(2017-2019)' 'ALTO' 'GRAND I10' 'NEXON' 'FIGO']...

Top Recommendations:
            company        model  modelyear  kilometer  similarity
1021  MERCEDES BENZ     ML CLASS       2015     114736    0.695296
346         HYUNDAI  NEW ELANTRA       2014      98000    0.424659
611            FORD     ECOSPORT       2016      50000    0.406608
569           HONDA         CITY       2016      68290    0.405323
1017            BMW           X1       2016      94711    0.401059

Debug Information:
Similarity score range: 0.07 - 0.70
95th percentile threshold: 0.33
User input features:
company                Toyota
model                   Camry
kilometer             12000.0
modelyear                2018
fueltype               Petrol
transmissiontype    Automatic
car_age                     7
warranty                False
bodystyle               Sedan
Name: 0