In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as rd
import math as mt

In [2]:
# Load the car dataset from "CARS_1.csv" (a path relative to the notebook's
# working directory) into `df`, the DataFrame the following cells operate on.

df = pd.read_csv("CARS_1.csv")

In [3]:
# clean and normalize data

def normalize(df, columnName):
    """Scale one column in place by dividing every value by the column maximum.

    Args:
        df: DataFrame that is modified in place.
        columnName: Name of the numeric column to rescale.

    Returns:
        None. ``df[columnName]`` is overwritten with the scaled values.

    Note:
        A column whose maximum is 0 cannot be scaled this way; its values are
        kept as they are and only converted to float.
    """
    maxVal = df[columnName].max()
    if maxVal == 0:
        # Dividing by a zero maximum would turn zeros into NaN and negative
        # values into -inf, which would poison every later distance
        # computation. Keep the values and only align the dtype with the
        # float result the division would have produced.
        df[columnName] = df[columnName].astype(float)
        return
    df[columnName] = df[columnName].divide(maxVal)
    
def mapStrToInt(df, columnName):
    """Replace every distinct value of a column with an integer code, in place.

    Codes start at 1 and are handed out in order of first appearance, so the
    first distinct value becomes 1, the next new value 2, and so on.

    Args:
        df: DataFrame that is modified in place.
        columnName: Name of the column whose values are encoded.

    Returns:
        None. ``df[columnName]`` is overwritten with the integer codes.
    """
    uniqueStrs = df[columnName].unique()
    strToIntDict = {value: code for code, value in enumerate(uniqueStrs, start=1)}
    # A single vectorised lookup replaces the per-row iterrows()/df.loc loop.
    # Assigning the raw array is positional, so rows that share an index label
    # can no longer overwrite each other, and the column ends up with a real
    # integer dtype instead of Python ints stored in an object column.
    df[columnName] = df[columnName].map(strToIntDict).to_numpy()
    
# Remove the columns that are not used as features, drop rows with missing
# values, and renumber the index so labels match row positions again.
df = (
    df.drop(columns=["reviews_count", "rating", "starting_price", "ending_price"])
    .dropna()
    .reset_index(drop=True)
)
strColumns = ["fuel_type", "transmission_type", "body_type"]

# Encode the text columns as integer codes. mapStrToInt may leave the codes in
# an object column, so the column is cast to int explicitly afterwards.
# (The former `columnName != "car_name"` guard was always true for this list.)
for columnName in strColumns:
    mapStrToInt(df, columnName)
    df = df.astype({columnName: "int"})

# Scale every remaining column by its maximum. "car_name" is skipped --
# presumably because it is a text label rather than a feature (TODO confirm).
for columnName in df.columns:
    if columnName != "car_name":
        normalize(df, columnName)
        

In [4]:
# k means algorithm

def euclidDistance(xFTyp, xEngDisp, xNoCyl, xSeatCap, xTranTyp, xFTnkCap, xBType, xTrqNM, xTrqRPM, xPwrBHP, xPwrRP,
                   yFTyp, yEngDisp, yNoCyl, ySeatCap, yTranTyp, yFTnkCap, yBType, yTrqNM, yTrqRPM, yPwrBHP, yPwrRP):
    """Return the Euclidean distance between two points given as 11 coordinates each.

    The first eleven arguments (``x...``) are the coordinates of one point and
    the last eleven (``y...``) the matching coordinates of the other.
    """
    firstPoint = (xFTyp, xEngDisp, xNoCyl, xSeatCap, xTranTyp, xFTnkCap,
                  xBType, xTrqNM, xTrqRPM, xPwrBHP, xPwrRP)
    secondPoint = (yFTyp, yEngDisp, yNoCyl, ySeatCap, yTranTyp, yFTnkCap,
                   yBType, yTrqNM, yTrqRPM, yPwrBHP, yPwrRP)

    # Accumulate the squared coordinate gaps strictly left to right.
    squaredGapTotal = 0
    for left, right in zip(firstPoint, secondPoint):
        squaredGapTotal = squaredGapTotal + (left - right) ** 2

    return np.sqrt(squaredGapTotal)
    
def _nearestCentroids(points, centroids):
    """Return (index of the closest centroid per point, mean distance to it)."""
    # Broadcast to shape (points, centroids, features) so all distances are
    # computed in a single pass.
    offsets = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.sqrt((offsets ** 2).sum(axis=2))
    # argmin returns the first minimum, i.e. ties go to the lowest centroid index.
    return distances.argmin(axis=1), float(distances.min(axis=1).mean())


def kmeans(df, k, tol, maxIter=100):
    """Cluster the rows of ``df`` into ``k`` groups with k-means.

    ``k`` distinct rows chosen by ``rd.sample`` serve as the initial centroids.
    Every row is assigned to its nearest centroid (Euclidean distance over the
    eleven feature columns listed in the body) and the error is the mean
    distance between a row and its centroid. Centroids are then moved to the
    mean of their members and the rows re-assigned until the error changes by
    no more than ``tol`` between two passes, or ``maxIter`` refinement passes
    have run.

    Args:
        df: DataFrame containing the eleven feature columns; other columns are
            ignored.
        k: Number of clusters; must be between 1 and ``len(df)``.
        tol: Largest change in error between two passes that counts as converged.
        maxIter: Upper bound on refinement passes, so a ``tol`` that can never
            be reached (for example a negative one) cannot loop forever.

    Returns:
        A tuple ``(clusters, centroids, error)``: ``clusters`` is a list with
        one centroid index per row of ``df``, ``centroids`` is a DataFrame with
        one row per centroid and the feature columns as columns, and ``error``
        is the final mean distance between a row and its centroid.

    Raises:
        ValueError: If ``k`` is not between 1 and the number of rows.
    """
    featureColumns = [
        "fuel_type", "engine_displacement", "no_cylinder", "seating_capacity",
        "transmission_type", "fuel_tank_capacity", "body_type", "max_torque_nm",
        "max_torque_rpm", "max_power_bhp", "max_power_rp",
    ]
    # Extract the features once; looking values up through df.iloc[...] for
    # every row/centroid pair builds a new Series on each access.
    points = df[featureColumns].to_numpy(dtype=float)
    if not 1 <= k <= len(points):
        raise ValueError("k must be between 1 and the number of rows in df")

    randomNums = rd.sample(range(0, len(points)), k)
    centroids = points[randomNums].copy()

    clusters, error = _nearestCentroids(points, centroids)
    for _ in range(maxIter):
        for a in range(k):
            members = points[clusters == a]
            # A centroid that lost all of its members keeps its old position
            # instead of becoming NaN from the mean of an empty selection.
            if len(members) > 0:
                centroids[a] = members.mean(axis=0)
        clusters, newError = _nearestCentroids(points, centroids)
        reconError = abs(error - newError)
        error = newError
        if reconError <= tol:
            break

    return clusters.tolist(), pd.DataFrame(centroids, columns=featureColumns), error


0.6631676392067062


0