### Loading and cleaning data

In [1]:
# Required imports
import pandas as pd
import math

pd.set_option('display.max_columns',30)

# Load data
dataset_df1 = pd.read_csv(r"./premonsoon/ground_water_quality_2018_pre.csv")
dataset_df2 = pd.read_csv(r"./premonsoon/ground_water_quality_2019_pre.csv")
dataset_df3 = pd.read_csv(r"./premonsoon/ground_water_quality_2020_pre.csv")
dataset_df4 = pd.read_csv(r"./premonsoon/ground_water_quality_2021_pre.csv")
dataset_df5 = pd.read_csv(r"./premonsoon/ground_water_quality_2022_pre.csv")
dataset_df6 = pd.read_csv(r"./postmonsoon/ground_water_quality_2018_post.csv")
dataset_df7 = pd.read_csv(r"./postmonsoon/ground_water_quality_2019_post.csv")
dataset_df8 = pd.read_csv(r"./postmonsoon/ground_water_quality_2020_post.csv")
dataset_df9 = pd.read_csv(r"./postmonsoon/ground_water_quality_2021_post.csv")
dataset_df = pd.concat([dataset_df1,dataset_df2,dataset_df3,dataset_df4,dataset_df5,dataset_df6,dataset_df7,dataset_df8,dataset_df9],join='outer',ignore_index=True)
dataset_df.drop(columns=['RL_GIS','sno','village','mandal','district','gwl'],inplace=True)
dataset_df['season']=dataset_df['season'].astype(str).apply(lambda x: 0 if 'pre' in x.lower() else 1)
# End load data

# Changing fields to numeric equivalents.
dataset_df['pH'] = pd.to_numeric(dataset_df['pH'], errors='coerce')
# Now convert to float
dataset_df['pH'] = dataset_df['pH'].astype(float)
dataset_df['RSC  meq  / L'] = pd.to_numeric(dataset_df['RSC  meq  / L'], errors='coerce')
# Now convert to float
dataset_df['RSC  meq  / L'] = dataset_df['RSC  meq  / L'].astype(float)
# End changing fields

# Remove Null values
for index, row in dataset_df.iterrows():
    if 'NA' in row.values:
        dataset_df.drop(index, inplace=True)
    elif(row['Classification']=='OG')or(row['Classification']=='O.G')or(row['Classification']=='BELOW THE GRAPH')or(row['Classification']=='OUT OF SAR GRAPH')or(row['Classification']=='BG'):
        dataset_df.drop(index, inplace=True)
dataset_df.reset_index(drop=True, inplace=True)


null_threshold = len(dataset_df) * 0.25
for column in dataset_df.columns:
    null_count = dataset_df[column].isnull().sum()
    if null_count > null_threshold:
        dataset_df.drop(column, axis=1, inplace=True)
    elif null_count > 0:
        dataset_df.dropna(subset=[column], inplace=True)
dataset_df.reset_index(drop=True, inplace=True)

data = dataset_df
# End remove null

### KNN (K Nearest Neighbours) Algorithm

The K nearest neighbours algorithm is a non-parametric supervised learning algorithm.  
Its **training** phase only consists of storing the data.  
Its **classification** phase involves of calculating the K nearest neighbours (according to some "distance" metric) to the new data for which classification is sought and then finding representative label of those K neighbours and assigning the new data this label. The K mentioned here is given as a parameter and is not learnt, though there are thumb rules (such as sqrt(n), n = # datapoints) and some procedures which can make better choices for K.  

<img src="kNN_Explanation.png"/>

#### Notes regarding implementation:  
There are other kinds of representation of data which help in finding nearest neighbour more quickly than the naive method, which we do not consider here for implementation.  
We implement the naive method, which implies that distance of new point from all points is first calculated,   
and we then determine the K neighbours, closest to this point, and then simply find the label which occurs maximum number of times  
(we have some other rules such as requiring a majority vote, which is defined to be a situation where the majority has to exceed  
 all others by the proportion of number of classes).   
The K chosen in our implementation is sqrt(n), where n is the number of datapoints. Also the metric used is the usual Euclidean Norm/ L2 Norm.    
Also, due to non-parallelized implementation, the function is extremely slow and might take around 5-15 minutes to give the results.  
The accuracy obtained, however is similar/comparable with the standard library packages implementation. (See below)

<img src="kNN_Algorithm_Pseudocode.png"/>

### KNN Implementation without sklearn

In [2]:
# Euclidean metric/L2 norm metric
def calculate_distance(row : pd.Series,trow : pd.Series) -> float:
    dist = 0
    for index,value in row.items():
        if index != 'Classification' and index != 'Classification.1':
            dist += (value-trow[index])**2
    return math.sqrt(dist)

# Return the label with maximum count only (No '> 1/unique_label_count' done).
def getMajorityLabel(neighbourList: list) -> str:
    freqList = []
    for item in neighbourList:
        for tup in freqList:
            if tup[0] == item[1]:
                tup[1] = tup[1] + 1
                break
        else:
            freqList.append([item[1],1])
    majority_label = ''
    max_freq = 0
    for item in freqList:
        if max_freq < item[1]:
            majority_label = item[0]
            max_freq = item[1]
    return majority_label

# Naive KNN
# For classification, measure distance of given (new) data to all other test data 
# and sort the distances and take the least K from these with their labels and
# find the majority label among these and assign that label to the given data.
def KNN_classifier(dataframe:pd.DataFrame,test_data:pd.DataFrame,K:int) -> None:
    req_data = dataframe 
    req_test_data = test_data
    totalTests = req_test_data.shape[0]
    totalCorrectClassifications = 0
    for tindex,trow in req_test_data.iterrows():
        nearest_neighbours = []
        for index,row in req_data.iterrows():
            nearest_neighbours.append((calculate_distance(row,trow),row['Classification']))
        nearest_neighbours.sort()
        nearest_neighbours = nearest_neighbours[0:K-1]
        if trow['Classification'] == getMajorityLabel(nearest_neighbours):
            totalCorrectClassifications = totalCorrectClassifications+1
    print(totalCorrectClassifications/totalTests *100)

# Split data into 80-20 as train and test data sets
train = data.sample(frac=0.8,random_state=8)
test = data.drop(train.index)

# Parameter K : first choice
K =  math.ceil(math.sqrt(train.shape[0]))

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

### Running the KNN on test data

In [3]:
# Actual Classification phase.
KNN_classifier(train,test,K=K)

91.39213602550478


### Using sklearn KNN to compare performance with above implementation.

In [4]:
# Required imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

# sklearn requires numerical labels for output also, so use label encoding for categories
label_encoder = LabelEncoder()
data['encoded_Classification'] = label_encoder.fit_transform(data['Classification'])

# Same split
train = data.sample(frac=0.8,random_state=8)
test = data.drop(train.index)

# sklearn requires separate input data and output labels (encoded)
trainX = train.drop(['Classification','encoded_Classification','Classification.1'],axis=1)
trainY = train['encoded_Classification']

testX = test.drop(['Classification','encoded_Classification','Classification.1'],axis=1)
testY = test['encoded_Classification']

# Parameter K : first choice
K =  math.ceil(math.sqrt(train.shape[0]))
knn_classifier = KNeighborsClassifier(n_neighbors=K)
knn_classifier.fit(trainX,trainY)

# Inbuilt function to calculate accuracy on test data set
accuracy = knn_classifier.score(testX, testY)
print(100*accuracy)

91.49840595111584


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
