In [111]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random as rd
import math as mt

In [112]:
# load data
# Reads "CARS_1.csv" (a relative path, so it is resolved against the kernel's
# current working directory) into the module-level DataFrame `df`.

df = pd.read_csv("CARS_1.csv")

In [113]:
# clean and normalize data

def normalize(df, columnName):
    """Rescale one column of ``df`` in place by its own maximum.

    Every entry of ``df[columnName]`` is divided by the largest value found in
    that column, and the result is written back to the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten.
    columnName : str
        Label of the column to rescale.

    Returns
    -------
    None
    """
    column = df[columnName]
    df[columnName] = column / column.max()
    
def mapStrToInt(df, columnName):
    """Replace each distinct value of ``df[columnName]`` with an integer code, in place.

    Codes start at 1 and are assigned in order of first appearance in the
    column: the first distinct value becomes 1, the second 2, and so on.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    columnName : str
        Label of the column to encode.

    Returns
    -------
    None
    """
    uniqueStrs = df[columnName].unique()
    strToIntDict = {value: code for code, value in enumerate(uniqueStrs, start=1)}
    # Translate the whole column positionally in one pass. Writing row by row
    # through ``df.loc[index, ...]`` is label-based, so with duplicate index
    # labels one row's code would overwrite every row sharing that label.
    df[columnName] = df[columnName].map(strToIntDict)
    
# Drop the review/price columns, discard rows with missing values and renumber
# the remaining rows from 0.
df = (
    df.drop(columns=["reviews_count", "rating", "starting_price", "ending_price"])
    .dropna()
    .reset_index(drop=True)
)

# Encode the text-valued columns as integer codes, then cast them to int.
strColumns = ["fuel_type", "transmission_type", "body_type"]
for columnName in strColumns:
    mapStrToInt(df, columnName)
df = df.astype({encodedName: 'int' for encodedName in strColumns})

# Scale every column except the car name by its own maximum.
for columnName in df.columns:
    if columnName == "car_name":
        continue
    normalize(df, columnName)
        

In [114]:
# k means algorithm

def euclidDistance(xFTyp, xEngDisp, xNoCyl, xSeatCap, xTranTyp, xFTnkCap, xBType, xTrqNM, xTrqRPM, xPwrBHP, xPwrRP,
                   yFTyp, yEngDisp, yNoCyl, ySeatCap, yTranTyp, yFTnkCap, yBType, yTrqNM, yTrqRPM, yPwrBHP, yPwrRP):
    """Return the Euclidean distance between two 11-coordinate points.

    The first eleven arguments are the coordinates of point ``x`` and the last
    eleven are the matching coordinates of point ``y``, in the same order.
    The result is ``np.sqrt`` of the sum of squared coordinate differences.
    """
    xCoords = (xFTyp, xEngDisp, xNoCyl, xSeatCap, xTranTyp, xFTnkCap,
               xBType, xTrqNM, xTrqRPM, xPwrBHP, xPwrRP)
    yCoords = (yFTyp, yEngDisp, yNoCyl, ySeatCap, yTranTyp, yFTnkCap,
               yBType, yTrqNM, yTrqRPM, yPwrBHP, yPwrRP)

    pairs = zip(xCoords, yCoords)
    # Seed the running total with the first squared difference and add the
    # rest left to right, so the summation order is unchanged.
    xFirst, yFirst = next(pairs)
    squaredSum = (xFirst - yFirst) ** 2
    for xVal, yVal in pairs:
        squaredSum = squaredSum + (xVal - yVal) ** 2

    return np.sqrt(squaredSum)
    
def kmeans(df, k, tol):
    """Cluster the rows of ``df`` around ``k`` centroids (k-means style).

    Visible steps:
      1. Pick ``k`` distinct row positions at random and use those rows as the
         initial centroids.
      2. Assign every row to its nearest centroid (by ``euclidDistance`` over
         the 11 feature columns named below) and record the mean of those
         nearest distances in ``oldError``.
      3. While ``reconError > tol``, recompute each centroid as the per-feature
         mean of the rows currently assigned to it.

    Parameters
    ----------
    df : indexed with ``.iloc`` and by the column names "fuel_type",
        "engine_displacement", "no_cylinder", "seating_capacity",
        "transmission_type", "fuel_tank_capacity", "body_type",
        "max_torque_nm", "max_torque_rpm", "max_power_bhp", "max_power_rp".
    k : number of centroids; also the sample size passed to ``rd.sample``.
    tol : threshold that ``reconError`` is compared against in the while loop.

    NOTE(review): in the code visible here the while loop never updates
    ``reconError``, never reassigns ``clusters``, never reads ``oldError`` and
    the function has no return statement, so for ``tol < 100`` the loop would
    not terminate. The rest of the loop body may be missing from this view;
    confirm against the full notebook.
    """
    reconError = 100
    oldError = 0
    differences = [0] * k
    totalDifferences = [0] * len(df)
    centroids = [0] * k
    clusters = [-1] * len(df)
    # k distinct row positions, drawn without replacement.
    randomNums = rd.sample(range(0, len(df)), k)
    
    # The initial centroids are the sampled rows themselves.
    for i, num in enumerate(randomNums):
        centroids[i] = df.iloc[num]
    
    # Initial assignment: distance from every row to every centroid.
    for index in range(len(df)):
        for j in range(k):
            differences[j] = euclidDistance(df.iloc[index]["fuel_type"], df.iloc[index]["engine_displacement"], df.iloc[index]["no_cylinder"], df.iloc[index]["seating_capacity"],
                                            df.iloc[index]["transmission_type"], df.iloc[index]["fuel_tank_capacity"], df.iloc[index]["body_type"], df.iloc[index]["max_torque_nm"],
                                            df.iloc[index]["max_torque_rpm"], df.iloc[index]["max_power_bhp"], df.iloc[index]["max_power_rp"], 
                                            centroids[j]["fuel_type"], centroids[j]["engine_displacement"], centroids[j]["no_cylinder"], centroids[j]["seating_capacity"],
                                            centroids[j]["transmission_type"], centroids[j]["fuel_tank_capacity"], centroids[j]["body_type"], centroids[j]["max_torque_nm"],
                                            centroids[j]["max_torque_rpm"], centroids[j]["max_power_bhp"], centroids[j]["max_power_rp"])
        # Distance to the nearest centroid for this row.
        totalDifferences[index] = min(differences)
        clusters[index] = differences.index(totalDifferences[index]) # get the index associated with the lowest euclid distance, this index is the centroid this node is assigned to
    # Mean nearest-centroid distance over all rows.
    oldError = (sum(totalDifferences)) / len(totalDifferences) # this code is confirmed working

    while reconError > tol:
        # NOTE(review): (11, ) * k is a tuple of k elevens, so this array has k
        # axes of length 11 each; a (k, 11) array was presumably intended - confirm.
        centroids = np.zeros((11, ) * k)
        differences = [0] * k
        totalDifferences = [0] * len(df)
        for a in range(k):
            # Per-feature value lists for the rows currently assigned to cluster a.
            averageFTyp = []
            averageEngDisp = []
            averageNoCyl = []
            averageSeatCap = []
            averageTranTyp = []
            averageFTypCap = []  # holds "fuel_tank_capacity" values
            averageBTyp = []
            averageTrqNM = []
            averageTrqRPM = []
            averagePwrBHP = []
            averagePwrRP = []
            for b in range(len(df)):
                if clusters[b] == a:
                    averageFTyp.append(df.iloc[b]["fuel_type"])
                    averageEngDisp.append(df.iloc[b]["engine_displacement"])
                    averageNoCyl.append(df.iloc[b]["no_cylinder"])
                    averageSeatCap.append(df.iloc[b]["seating_capacity"])
                    averageTranTyp.append(df.iloc[b]["transmission_type"])
                    averageFTypCap.append(df.iloc[b]["fuel_tank_capacity"])
                    averageBTyp.append(df.iloc[b]["body_type"])
                    averageTrqNM.append(df.iloc[b]["max_torque_nm"])
                    averageTrqRPM.append(df.iloc[b]["max_torque_rpm"])
                    averagePwrBHP.append(df.iloc[b]["max_power_bhp"])
                    averagePwrRP.append(df.iloc[b]["max_power_rp"])
            # New centroid = mean of each feature over the cluster's rows.
            # A cluster with no rows makes every len(...) zero and raises ZeroDivisionError.
            centroids[a] = [(sum(averageFTyp) / len(averageFTyp)), (sum(averageEngDisp) / len(averageEngDisp)), (sum(averageNoCyl) / len(averageNoCyl)), (sum(averageSeatCap) / len(averageSeatCap)),
                            (sum(averageTranTyp) / len(averageTranTyp)), (sum(averageFTypCap) / len(averageFTypCap)), (sum(averageBTyp) / len(averageBTyp)), (sum(averageTrqNM) / len(averageTrqNM)), 
                            (sum(averageTrqRPM) / len(averageTrqRPM)), (sum(averagePwrBHP) / len(averagePwrBHP)), (sum(averagePwrRP) / len(averagePwrRP))]


0.6244657224827654


0