In [20]:
import import_ipynb
import classification_component
from classification_component import train_classification_component
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# 1. Obtain the Trained Classification Pipeline and Candidate Data
# This function train the classification component and returns:
#   best _lr: the best tuned Logistic Regression pipeline
#   X: the candidate freatures DataFrame (all the raw features used for recommendation)
best_lr, X = train_classification_component("preprocess_used_cars.csv")

# Extract the fitted preprocessor from the pipeline
preprocessor_fitted = best_lr.named_steps['preprocessor']

# Transform the candidate car feature into the same feature space
candidate_feature = preprocessor_fitted.transform(X)



In [22]:
# 1. Stimulate User Preferences
# Define s dictionary representing a user's preference.
# Ensure that the keys exactly match the feature names in your original X
# Function to capture user preference via command-line input
def get_user_preference():
    company = input("Enter Company Name (eg., TOYOTA): ").strip()
    fuelType = input("Enter the Fuel type (eg., Petrol): ").strip()
    transmission = input("Enter the Transmission type (eg., Manual): ").strip()
    carType = input("Enter the Car Type (eg., Sedan): ").strip()
    price = float(input("Enter your target price: "))
    kilometer = float(input("Enter your desired mileage: "))
    warranty = input("Do you prefer a car with warranty? (yes/no): ").strip().lower()
    warranty = True if warranty == "yes" else False
    qualityscore = float(input("Enter a desired quality score (eg., 7.5): "))

    # Create a user dictionary with the user preferences
    return {
        'company': company,
        'fuelType': fuelType,
        'transmission': transmission,
        'carType': carType,
        'price': price,
        'kilometer': kilometer,
        'warranty': warranty,
        'qualityscore': qualityscore
    }

# 2. Capture User Preference and create Dataframe
user_pref = get_user_preference()
user_df = pd.DataFrame([user_pref])
print("\nUser Preference DataFrame: ")
print(user_df)

# Ensure user_df contains all columns expected by the preprocessor
# X is our candidate DataFrame used during training, do it contains all columns.
# We need to add any columns from X that are missing in user_df
for col in X.columns:
    if col not in user_df.columns:
        # Check the type of the column in X
        if pd.api.types.is_numeric_dtype(X[col]):
            # For numeric columns, fill with NaN (or you might choose the median value)
            user_df[col] = np.nan
        else:
            # For non-numerical columns, fill with "Unknown" (or any appropriate default)
            user_df[col] = "Unknown"

# Reorder columns in user_df to match the order in X
user_df = user_df[X.columns]

print("\n User preferences DataFrame after add missing columns: ")
print(user_df)




User Preference DataFrame: 
  company fuelType transmission carType      price  kilometer  warranty  \
0     BMW   petrol       manual   sedan  1000000.0    12000.0      True   

   qualityscore  
0           7.5  

 User preferences DataFrame after add missing columns: 
   id company    model  variant fueltype   colour  kilometer manufacturedate  \
0 NaN     BMW  Unknown  Unknown  Unknown  Unknown    12000.0         Unknown   

   modelyear      price  ... transmissiontype_m  transmissiontype_manual  \
0        NaN  1000000.0  ...                NaN                  Unknown   

   transmissiontype_manual  transmissiontype_unknown  transmissiontype_manual  \
0                  Unknown                       NaN                  Unknown   

   cngkit_company fitted  cngkit_unknown  owner_2nd owner  owner_3rd owner  \
0                    NaN             NaN              NaN              NaN   

  owner_4th owner  
0             NaN  

[1 rows x 38 columns]


In [23]:
# 3. Transform User Input into Feature Space
# Use the same preprocessor from the trained pipeline to transform the user input
user_freatures = preprocessor_fitted.transform(user_df)



In [24]:
# Compute cosine similarity between the user and each candidate car vector
similarity_scores = cosine_similarity(user_freatures, candidate_feature).flatten()

In [25]:
# Rank candidate based on similarity (highest similarity first)
sorted_indicies = np.argsort(similarity_scores)[::-1]
top_n = 5
top_recommendations = X.iloc[sorted_indicies[:top_n]].copy()
top_recommendations['similarity_score'] = similarity_scores[sorted_indicies[:top_n]]

print("\nTop Recommendation based on your preferences: ")
print(top_recommendations)


Top Recommendation based on your preferences: 
         id        company                 model              variant  \
817  559249       MAHINDRA                XUV500                   W8   
618  556196  MERCEDES BENZ               C CLASS    200 K ELEGANCE AT   
318  558239         TOYOTA  FORTUNER (2012_2016)           3.0 4X2 AT   
166  557826          SKODA                 RAPID  1.5 TDI CR AMBITION   
981  551865        HYUNDAI                   I20     SPORTZ 1.2 BS IV   

    fueltype        colour  kilometer manufacturedate  modelyear  price  ...  \
817   DIESEL        Silver   2.677717      01-02-2013       2013    NaN  ...   
618   PETROL  Arizona Grey   2.677717      01-01-2010       2010    NaN  ...   
318   DIESEL         White   2.677717      01-04-2015       2015    NaN  ...   
166   DIESEL        Silver   2.677717      01-10-2016       2016    NaN  ...   
981   PETROL        G. Red   2.677717      01-01-2010       2010    NaN  ...   

     transmissiontype_manual  tr