# Car Recommender

## Importing Packages and Data

### Installing Packages

In [3]:
!pip install hvplot
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

Collecting hvplot
  Downloading hvplot-0.8.4-py2.py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: hvplot
Successfully installed hvplot-0.8.4


### Colab Import

In [4]:
from google.colab import files
uploaded = files.upload()

Saving vehicles_clean.csv to vehicles_clean.csv


In [5]:
import io
cars_df = pd.read_csv(io.BytesIO(uploaded['vehicles_clean.csv']))
# Dataset is now stored in a Pandas Dataframe

### SQLite3 Import

In [None]:
# Import necessary libraries
import sqlite3

In [None]:
# Connect to the SQLite database
# conn = sqlite3.connect('used_cars.db')

In [None]:
# query = "SELECT * FROM used_cars"
# cars_df = pd.read_sql(query, conn)

### Import Using Pandas CSV

In [None]:
# Load the data into a Pandas DataFrame
cars_df = pd.read_csv(
    "data/vehicles_clean.csv")

# Display sample data
cars_df.head(10)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state,posting_date
0,99990,2018,nissan,like new,6 cylinders,gas,1234.0,clean,automatic,4wd,full-size,coupe,white,hi,2021-05-03
1,99888,1940,ford,excellent,8 cylinders,gas,46113.0,clean,automatic,rwd,full-size,sedan,silver,ca,2021-05-01
2,99700,2018,ford,excellent,8 cylinders,diesel,31484.0,clean,automatic,4wd,full-size,pickup,grey,ca,2021-05-01
3,98995,2017,porsche,excellent,8 cylinders,gas,38562.0,clean,automatic,4wd,full-size,hatchback,brown,ak,2021-05-03
4,98900,2001,ferrari,good,8 cylinders,gas,20187.0,clean,automatic,rwd,mid-size,convertible,red,ga,2021-05-01
5,98900,2001,ferrari,good,8 cylinders,gas,20187.0,clean,automatic,rwd,mid-size,convertible,red,sc,2021-05-01
6,98900,2001,ferrari,good,8 cylinders,gas,20187.0,clean,automatic,rwd,mid-size,convertible,red,fl,2021-05-01
7,98900,2001,ferrari,good,8 cylinders,gas,20187.0,clean,automatic,rwd,mid-size,convertible,red,al,2021-05-01
8,98900,2001,ferrari,good,8 cylinders,gas,20187.0,clean,automatic,rwd,mid-size,convertible,red,tx,2021-05-01
9,98750,2019,ford,like new,8 cylinders,diesel,47100.0,clean,automatic,4wd,full-size,truck,black,ak,2021-05-02


In [None]:
cars_df.info()

## Filter System

In [6]:
# Starting with the recommender via filtering first.
# Designating weights for features (WE CAN ADJUST THESE as we see fit)
weights = {
    'price': 3,
    'year': 3,
    'manufacturer': 1,
    'condition': 2,
    'odometer': 2,
    'size': 2,
    'type': 2,
    'paint_color': 2
}

In [7]:
# Determine "distance" from the user's selected features. Cars with the smallest
# distance to the user's selections are recommended.
# WE NEED TO DETERMINE FINAL SELECTION OPTIONS
# ['price', 'year', 'manufacturer', 'condition', 'odometer', 'size', 'type', 'paint_color']

def recommend_cars(user_choices, state, top_n=5):
    # Filter by state first
    df_state = cars_df[cars_df['state'] == state]

    distance = pd.Series([0] * len(df_state), index=df_state.index)

    for feature, value in user_choices.items():
        if feature in ['price', 'year', 'odometer']:
            print(f"Processing numeric feature: {feature} with value: {value}")
            distance += weights[feature] * abs(df_state[feature] - value)
        else:
            print(f"Processing non-numeric feature: {feature} with value: {value}")
            distance += weights[feature] * (df_state[feature] != value).astype(int)

    # Get the indices of the cars with the smallest "distances"
    recommended_indices = distance.nsmallest(top_n).index

    return df_state.loc[recommended_indices]


In [8]:
# Example of user selections
# user_input = {
#     'price': 15000,
#     'year': 2015,
#     'manufacturer': 'ford',
#     'condition': 'excellent',
#     'odometer': 100000,
#     'size': 'mid-size',
#     'type': 'sedan'
# }


user_input = {
    'price': 15000,
    'year': 2015,
    'odometer': 100000
}
state = 'ca'

print(recommend_cars(user_input, state))


Processing numeric feature: price with value: 15000
Processing numeric feature: year with value: 2015
Processing numeric feature: odometer with value: 100000
       price  year manufacturer  condition    cylinders fuel  odometer  \
18220  15000  2005        dodge       good  6 cylinders  gas  100090.0   
19294  14900  2006      lincoln  excellent  8 cylinders  gas  100300.0   
19313  14900  2006      lincoln       good  8 cylinders  gas  100300.0   
19348  14900  2006      lincoln  excellent  8 cylinders  gas  100500.0   
17943  15500  1950      mercury       good  8 cylinders  gas  100000.0   

      title_status transmission drive       size      type paint_color state  \
18220        clean    automatic   fwd   mid-size  mini-van        blue    ca   
19294        clean    automatic   4wd  full-size    pickup       black    ca   
19313        clean    automatic   4wd  full-size    pickup       black    ca   
19348        clean    automatic   4wd  full-size    pickup       black    ca 

## Determining K

### Elbow Curve

In [None]:
# Find the best value for k
# Create a list with the number of k-values from 1 to 11
k = list(range(1, 11))

In [None]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `scaled_cars_df`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(scaled_cars_df)
    inertia.append(k_model.inertia_)

In [None]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame with the data to plot the Elbow curve
df_elbow = pd.DataFrame(elbow_data)

In [None]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
import holoviews as hv

hv.extension('matplotlib')

og_elbow = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

og_elbow

## Recommendation System

### Cluster with K-Means & PCA

In [None]:
cars_df = pd.read_csv(
    "data/vehicles_clean.csv")

# nominal encoder
top_manufacturers = cars_df['manufacturer'].value_counts()[cars_df['manufacturer'].value_counts() > 500].index.values
cars_df = cars_df[cars_df['manufacturer'].isin(top_manufacturers)].copy()
# combine dummy variables with DataFrame
cars_df = pd.concat([cars_df,pd.get_dummies(cars_df['manufacturer'], dtype=float)], axis=1)

# convert categorical data appropriately for sklearn
cars_df['condition'] = cars_df['condition'].map({'salvage':0,
                          'fair':1,
                          'good':2,
                          'excellent':3,
                          'like new':4,
                          'new':5})

cars_df = cars_df[cars_df['cylinders'] != 'other']
cars_df['cylinders'] = cars_df['cylinders'].map({'3 cylinders':0,
                                                 '4 cylinders':1,
                                                 '5 cylinders':2,
                                                 '6 cylinders':3,
                                                 '8 cylinders':4,
                                                 '10 cylinders':5,
                                                 '12 cylinders':6})

# combine dummy variables with DataFrame
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['fuel'], dtype=float)], axis=1)
# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['title_status'], dtype=float)], axis=1)

# transmission value other not useful, drop it
cars_df = cars_df[cars_df['transmission'] != 'other'].copy()
# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['transmission'], dtype=float)], axis=1)

# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['drive'], dtype=float)], axis=1)

#encode size to be numeric
cars_df['size'] = cars_df['size'].map({'sub-compact':0, 'compact':1, 'mid-size':2, 'full-size':3})

# convert only the types with values counts > 400
type_cars = ['sedan', 'SUV', 'truck', 'pickup', 'coupe', 'hatchback', 'van', 'convertible', 'mini-van', 'wagon']
cars_df = cars_df[cars_df['type'].isin(type_cars)].copy()
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['type'], dtype=float)], axis=1)

regr_cars_df = cars_df.drop(columns=['manufacturer', 'fuel', 'title_status', 'type', 'paint_color', 'state', 'posting_date', 'transmission', 'drive']).copy()
regr_cars_df.head()

Unnamed: 0,price,year,condition,cylinders,odometer,size,acura,audi,bmw,buick,...,SUV,convertible,coupe,hatchback,mini-van,pickup,sedan,truck,van,wagon
0,99990,2018,4,3,1234.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99888,1940,3,4,46113.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,99700,2018,3,4,31484.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,98750,2019,4,4,47100.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,98500,1970,3,4,27000.0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Scaling price and odometer data
columns_to_scale = ['price', 'odometer']
features = regr_cars_df[columns_to_scale]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

regr_cars_df[columns_to_scale] = scaled_features
regr_cars_df

Unnamed: 0,price,year,condition,cylinders,odometer,size,acura,audi,bmw,buick,...,SUV,convertible,coupe,hatchback,mini-van,pickup,sedan,truck,van,wagon
0,7.437537,2018,4,3,-0.584212,3,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.428787,1940,3,4,-0.377098,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,7.412660,2018,3,4,-0.444610,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,7.331163,2019,4,4,-0.372543,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,7.309717,1970,3,4,-0.465303,2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62009,-1.097259,2001,1,3,0.803809,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
62010,-1.097259,1999,0,3,0.130026,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
62011,-1.097259,1994,2,4,-0.585292,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
62012,-1.097259,2007,1,4,0.172954,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Initialize the K-Means model using the best value for k
model = KMeans(n_clusters=40)

In [None]:
# Fit the K-Means model using the scaled data
model.fit(regr_cars_df)

In [None]:
# Predict the clusters to group the clusters using the scaled data
clusters = model.predict(regr_cars_df)

# Print the resulting array of cluster values.
print(clusters)

In [None]:
# Create a copy of the DataFrame
scaled_cars_df_cluster = regr_cars_df.copy()

In [None]:
# Add a new column to the DataFrame with the predicted clusters
scaled_cars_df_cluster["clusters"] = clusters


# Display sample data
scaled_cars_df_cluster.head()

### Recommendation System Based On Cosine Similarity

In [None]:
! pip install nltk

# Importing necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import string
import random
from PIL import Image
import requests
from io import BytesIO
#import io
import matplotlib.pyplot as plt
%matplotlib inline

#reading the file
cars_df = pd.read_csv(
    "data/vehicles_clean.csv")

In [None]:
cars_df.head()

In [None]:
# Function for recommending cars based on state
def recommend(made,color_group,type_group,price_range):

    # Matching the type with the dataset and reset the index
    data = df.loc[(df['color_group']==color_group)
                  & (df['type_group']==type_group) & ((df['price']>=price_range[0]) & (df['price']<=price_range[1]))]
    data.reset_index(level = 0, inplace = True)

    # Convert the index into series
    indices = pd.Series(data.index, index = data['Made'])

    #Converting the car manufacturer country into vectors and used unigram
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df = 1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['Made'])

    # Calculating the similarity measures based on Cosine Similarity
    sg = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Get the index corresponding to original_manufacturer
    idx = indices[made]
# Get the pairwsie similarity scores
    sig = list(enumerate(sg[idx]))
# Sort the cars
    sig = sorted(sig, reverse=True)
# Scores of the 6 most similar cars
    sig = sig[0:6]
# car indicies
    movie_indices = [i[0] for i in sig]

    # Top 6 car recommendations
    rec = data[['price','Made','manufacturer', 'model','type','year','Age','condition','fuel','title_status'
                ,'transmission','paint_color','state']].iloc[movie_indices]
    return rec

    # if the image urls were still in place,
    # the images of the recommended cars would have been printed with this

   # for i in rec['image_url']:
        #response = requests.get(i,stream=True)
       # img = Image.open(io.BytesIO(response.content))
        #plt.figure()
        #print(plt.imshow(img))

In [None]:
from collections import Counter
import math

def knn(data, query, k, distance_fn, choice_fn):
    neighbor_distances_and_indices = []

    # 3. For each example in the data
    for index, example in enumerate(data):
        # 3.1 Calculate the distance between the query example and the current
        # example from the data.
        distance = distance_fn(example[:-1], query)

        # 3.2 Add the distance and the index of the example to an ordered collection
        neighbor_distances_and_indices.append((distance, index))

    # 4. Sort the ordered collection of distances and indices from
    # smallest to largest (in ascending order) by the distances
    sorted_neighbor_distances_and_indices = sorted(neighbor_distances_and_indices)

    # 5. Pick the first K entries from the sorted collection
    k_nearest_distances_and_indices = sorted_neighbor_distances_and_indices[:k]

    # 6. Get the labels of the selected K entries
    k_nearest_labels = [data[i][-1] for distance, i in k_nearest_distances_and_indices]

    # 7. If regression (choice_fn = mean), return the average of the K labels
    # 8. If classification (choice_fn = mode), return the mode of the K labels
    return k_nearest_distances_and_indices , choice_fn(k_nearest_labels)

def mean(labels):
    return sum(labels) / len(labels)

def mode(labels):
    return Counter(labels).most_common(1)[0][0]

def euclidean_distance(point1, point2):
    sum_squared_distance = 0
    for i in range(len(point1)):
        sum_squared_distance += math.pow(point1[i] - point2[i], 2)
    return math.sqrt(sum_squared_distance)

def main():
    '''
    # Regression Data
    #
    # Column 0: height (inches)
    # Column 1: weight (pounds)
    '''
    reg_data = [
       [65.75, 112.99],
       [71.52, 136.49],
       [69.40, 153.03],
       [68.22, 142.34],
       [67.79, 144.30],
       [68.70, 123.30],
       [69.80, 141.49],
       [70.01, 136.46],
       [67.90, 112.37],
       [66.49, 127.45],
    ]

    # Question:
    # Given the data we have, what's the best-guess at someone's weight if they are 60 inches tall?
    reg_query = [60]
    reg_k_nearest_neighbors, reg_prediction = knn(
        reg_data, reg_query, k=3, distance_fn=euclidean_distance, choice_fn=mean
    )

    '''
    # Classification Data
    #
    # Column 0: age
    # Column 1: likes pineapple
    '''
    clf_data = [
       [22, 1],
       [23, 1],
       [21, 1],
       [18, 1],
       [19, 1],
       [25, 0],
       [27, 0],
       [29, 0],
       [31, 0],
       [45, 0],
    ]
    # Question:
    # Given the data we have, does a 33 year old like pineapples on their pizza?
    clf_query = [33]
    clf_k_nearest_neighbors, clf_prediction = knn(
        clf_data, clf_query, k=3, distance_fn=euclidean_distance, choice_fn=mode
    )

if __name__ == '__main__':
    main()

### Clustering with K-Nearest Neighbor

In [10]:
# cars_df = pd.read_csv(
#     "data/vehicles_clean.csv")

# nominal encoder
top_manufacturers = cars_df['manufacturer'].value_counts()[cars_df['manufacturer'].value_counts() > 500].index.values
cars_df = cars_df[cars_df['manufacturer'].isin(top_manufacturers)].copy()
# combine dummy variables with DataFrame
cars_df = pd.concat([cars_df,pd.get_dummies(cars_df['manufacturer'], dtype=float)], axis=1)

# convert categorical data appropriately for sklearn
cars_df['condition'] = cars_df['condition'].map({'salvage':0,
                          'fair':1,
                          'good':2,
                          'excellent':3,
                          'like new':4,
                          'new':5})

cars_df = cars_df[cars_df['cylinders'] != 'other']
cars_df['cylinders'] = cars_df['cylinders'].map({'3 cylinders':0,
                                                 '4 cylinders':1,
                                                 '5 cylinders':2,
                                                 '6 cylinders':3,
                                                 '8 cylinders':4,
                                                 '10 cylinders':5,
                                                 '12 cylinders':6})

# combine dummy variables with DataFrame
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['fuel'], dtype=float)], axis=1)
# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['title_status'], dtype=float)], axis=1)

# transmission value other not useful, drop it
cars_df = cars_df[cars_df['transmission'] != 'other'].copy()
# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['transmission'], dtype=float)], axis=1)

# convert nominal categorical data
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['drive'], dtype=float)], axis=1)

#encode size to be numeric
cars_df['size'] = cars_df['size'].map({'sub-compact':0, 'compact':1, 'mid-size':2, 'full-size':3})

# convert only the types with values counts > 400
type_cars = ['sedan', 'SUV', 'truck', 'pickup', 'coupe', 'hatchback', 'van', 'convertible', 'mini-van', 'wagon']
cars_df = cars_df[cars_df['type'].isin(type_cars)].copy()
cars_df = pd.concat([cars_df, pd.get_dummies(cars_df['type'], dtype=float)], axis=1)

regr_cars_df = cars_df.drop(columns=['manufacturer', 'fuel', 'title_status', 'type', 'paint_color', 'state', 'posting_date', 'transmission', 'drive']).copy()
regr_cars_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_df['cylinders'] = cars_df['cylinders'].map({'3 cylinders':0,


Unnamed: 0,price,year,condition,cylinders,odometer,size,acura,audi,bmw,buick,...,SUV,convertible,coupe,hatchback,mini-van,pickup,sedan,truck,van,wagon
0,99990,2018,4,3,1234.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,99888,1940,3,4,46113.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,99700,2018,3,4,31484.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,98750,2019,4,4,47100.0,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,98500,1970,3,4,27000.0,2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Scaling price and odometer data
columns_to_scale = ['price', 'odometer']
features = regr_cars_df[columns_to_scale]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

regr_cars_df[columns_to_scale] = scaled_features
regr_cars_df

Unnamed: 0,price,year,condition,cylinders,odometer,size,acura,audi,bmw,buick,...,SUV,convertible,coupe,hatchback,mini-van,pickup,sedan,truck,van,wagon
0,7.437537,2018,4,3,-0.584212,3,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7.428787,1940,3,4,-0.377098,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,7.412660,2018,3,4,-0.444610,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,7.331163,2019,4,4,-0.372543,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,7.309717,1970,3,4,-0.465303,2,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62009,-1.097259,2001,1,3,0.803809,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
62010,-1.097259,1999,0,3,0.130026,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
62011,-1.097259,1994,2,4,-0.585292,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
62012,-1.097259,2007,1,4,0.172954,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Import K-Nearest Neighbors
# from sklearn.neighbors import NearestNeighbors
# model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=20, n_jobs=-1)


In [13]:
# Fit the model
model_knn.fit(regr_cars_df)

# Get recommendations for a specific car
car_index = 100  # replace with the index of the car you want to get recommendations for
distances, indices = model_knn.kneighbors(regr_cars_df.iloc[car_index].values.reshape(1, -1), n_neighbors=4)

# Print out the indices of the recommended cars
print(indices)

[[100 101  99  97]]




In [14]:
# Display recommended cars
recommended_cars = cars_df.iloc[indices[0]]

# Exclude the input car from the recommendations
recommended_cars = recommended_cars[1:]
print(recommended_cars)

     price  year manufacturer  condition  cylinders    fuel  odometer  \
129  75850  2018         ford          3          4  diesel   43145.0   
127  75850  2020         ford          3          4  diesel   41735.0   
125  75850  2018         ford          3          4  diesel   25900.0   

    title_status transmission drive  ...  SUV convertible coupe hatchback  \
129        clean    automatic   4wd  ...  0.0         0.0   0.0       0.0   
127        clean    automatic   4wd  ...  0.0         0.0   0.0       0.0   
125        clean    automatic   4wd  ...  0.0         0.0   0.0       0.0   

    mini-van  pickup  sedan  truck  van  wagon  
129      0.0     1.0    0.0    0.0  0.0    0.0  
127      0.0     1.0    0.0    0.0  0.0    0.0  
125      0.0     1.0    0.0    0.0  0.0    0.0  

[3 rows x 66 columns]
