In [1]:
#Python Program to Implement KNN ML Model on IPL Dataset
#Author Name: Himanshu Sharma
#Program Date: 25/05/2021
#Program Time: 01:28AM

#Import the required libraries for algorithm
import pandas as pd
import numpy as np
import math

def eculids_distance(testing_index,training_index):
#This method finds the eculids distance between two instances
#testing_index: testing instance index
#training_index: training instance index
    test_instance = dataset[testing_index]
    train_instance = dataset[training_index]
    overall_difference = 0
    for feature in range (0,max_features):
        current_feature_difference = (test_instance[feature] - train_instance[feature])**2
        overall_difference += current_feature_difference
    return math.sqrt(overall_difference)

def calculate_similarity(i, j, similarity):
#This method creates a dictionary that keeps track of eculids distance wrt every training instance for current testing instance 
#i: testing instance
#j: training instance
#similarity: dictinary for mapping eculid distance with unique ids
    difference = eculids_distance(i,j)
    training_inst_id = dataset[j][instance_id_index]
    similarity[difference] = training_inst_id
    return similarity

def calculate_knn(sorted_similarity, similarity):
#This method fetches top k neighbors based on similarity, then select k nearest
#...neighbours, finds the output class of each neighbour and then returns the class
#...with maximum frequency in the neighbourhood
#sorted_similarity: eculids distance of all training instances wrt current testing instance in sorted order
#similarity: eculids distance of all training instances wrt current testing instance
    #Fetch the top k neighbors from the sorted similarity
    k_neighbours = np.zeros(k)
    for i in range (0, k):
        key = sorted_similarity[i]
        instance_id = similarity[key]
        row = df.loc[df['id'] == instance_id]
        winner = ((row['winner'].values)[0])
        k_neighbours[i] = winner
    
    #Find the most common neighbour
    neighbour_frequency = {}
    for i in range (0, k):
        current_result = k_neighbours[i]
        if current_result in neighbour_frequency.keys():
            current_frequency = neighbour_frequency[current_result]
            updated_frequency = current_frequency+1
            neighbour_frequency[current_result] = updated_frequency
        else:
            neighbour_frequency[current_result] = 1
    result = max(neighbour_frequency, key = neighbour_frequency.get)
    return result

def add_to_overall_accuracy(difference):
#This method adds the difference of expected and acutal output to overall accuracy
    if difference == 0: return 1
    else: return 0

#Read the input dataset and convert it into an numpy array of integers
df = pd.read_csv(r"IPL_DATASET.csv")
dataset = (df.as_matrix()).astype(int)

#Find the number of instances in the incoming dataset
dataset_shape = dataset.shape
no_of_instances = dataset_shape[0]
no_of_attributes = dataset_shape[1]

#Divide the dataseet into testing and training based on 80% and 20% 
training_size = no_of_instances * (0.8)
testing_size = no_of_instances - training_size

#Hyperparameters for the data model
training_index = 0
testing_index = 640
end_index = no_of_instances
increment = 1
max_features = 6
instance_id_index = 7
k = 9
teams = {0:"DRW", 1:"CSK", 2:"DeC", 3:"DCa", 4:"GuL", 5:"PKi", 6:"KoT", 7:"KKR", 8:"MuI", 9:"PuW", 10:"RaR", 11:"RCB", 12:"SRH"}
testing_accuracy = 0
overall_testing_instances = 160

#Fetch the testing instances one by one
for i in range(testing_index, end_index, increment):
    #Current testing instance
    dataset_testing_instance = dataset[i]
    
    #Dictionary to store all eculidian distances wrt current testing instance
    similarity = {}
    
    #Fetch the training instances against which we have to compare testing instance
    for j in range (training_index,testing_index,increment):
        #Find the eculidian distance of current testing instance with current training
        #instance and save it in similarity dictionary for sorting in similar order later
        similarity = calculate_similarity(i, j, similarity)
            
    #Sort the eculids distances in ascending order so that similar items are in beginning
    sorted_similarity = sorted(similarity)
    
    #Fetch the top k similar neighbors and check which neighbouring class has max frequency
    predicted_result = calculate_knn(sorted_similarity, similarity)
    acutal_result = dataset[i][6]
    difference = predicted_result - acutal_result
    print(teams[dataset[i][2]], " **vs** ", teams[dataset[i][3]],"\t   ","Predicted Winner ::", teams[predicted_result], "\tActual Winner ::",teams[acutal_result])
    testing_accuracy += add_to_overall_accuracy(difference) 
 
print("Predicted ", (testing_accuracy/160)*100, "Percent Results Correctly \nUsing KNN for K = ",k," & Features = 6")
    
    
    
    

RaR  **vs**  MuI 	    Predicted Winner :: RaR 	Actual Winner :: RaR
DCa  **vs**  PKi 	    Predicted Winner :: DCa 	Actual Winner :: PKi
MuI  **vs**  SRH 	    Predicted Winner :: MuI 	Actual Winner :: SRH
RCB  **vs**  CSK 	    Predicted Winner :: CSK 	Actual Winner :: CSK
SRH  **vs**  PKi 	    Predicted Winner :: SRH 	Actual Winner :: SRH
DCa  **vs**  KKR 	    Predicted Winner :: DCa 	Actual Winner :: DCa
CSK  **vs**  MuI 	    Predicted Winner :: KKR 	Actual Winner :: MuI
RaR  **vs**  SRH 	    Predicted Winner :: RaR 	Actual Winner :: SRH
RCB  **vs**  KKR 	    Predicted Winner :: RCB 	Actual Winner :: KKR
CSK  **vs**  DCa 	    Predicted Winner :: CSK 	Actual Winner :: CSK
RCB  **vs**  MuI 	    Predicted Winner :: MuI 	Actual Winner :: RCB
DCa  **vs**  RaR 	    Predicted Winner :: DCa 	Actual Winner :: DCa
KKR  **vs**  CSK 	    Predicted Winner :: KKR 	Actual Winner :: KKR
PKi  **vs**  MuI 	    Predicted Winner :: PKi 	Actual Winner :: MuI
CSK  **vs**  RCB 	    Predicted Winner :: CSK 	A