In [5]:
#importing the required libraries
#pandas is required to be able to use dataframe capabilities
import pandas as pd
import numpy as np
import math
#importing the following library just to print the dataframe in a tabular way
from IPython.display import display, HTML

#load the csv file from disk into a pandas dataframe
csv_path = '/content/water_portability.csv'
water_df = pd.read_csv(csv_path)
#sanity check: show the first 5 rows so we can see that the dataframe was actually loaded
display(water_df.head(5))

#count the NaN values of every column: isnull() flags each NaN cell and sum() counts the flags
#the counts are informational only (they tell us which columns need to be filled in later)
NaN_value_count = [water_df[name].isnull().sum() for name in water_df.columns]
#per the original run, printing NaN_value_count gives [491, 0, 0, 0, 781, 0, 0, 162, 0, 0],
#so 3 columns [ph, Sulfate, Trihalomethanes] have NaN values

#fill the NaN values of each affected column with the mean of that same column
for name in ('ph', 'Sulfate', 'Trihalomethanes'):
    column_mean = water_df[name].mean()                    #mean() skips the NaN entries
    water_df[name] = water_df[name].fillna(column_mean)    #replace every NaN by that mean

#take the Potability column out of the dataframe first (it is the class label, so it must not be normalized)
water_potability = water_df.pop('Potability')

#the features live on very different scales, so we bring them onto a comparable scale
#mean normalization: x' = (x - mean) / (max - min)
#(dividing by the column mean instead, as was done before, is not mean normalization: it breaks
#when a column mean is 0 and flips the sign of the feature when the mean is negative)
feature_mean = water_df.mean()
feature_range = water_df.max() - water_df.min()
#a constant column has range 0; dividing by 1 instead leaves it at 0 rather than producing NaN
feature_range = feature_range.replace(0, 1)
water_df = (water_df - feature_mean) / feature_range

#put the untouched Potability column back next to the normalized features
water_df = pd.concat([water_df, water_potability], axis=1)   #axis=1 places the columns side by side

#split the data into a training and a testing set
#ratio is the fraction of rows that goes into the training set, the rest goes into the testing set
ratio = 0.8
#use the real number of rows instead of a hard-coded 3276 so the split works for any dataset size
n_rows = len(water_df)
all_positions = np.arange(n_rows)
#rand_list holds int(n_rows*ratio) distinct, randomly chosen row positions (replace=False means no repeats)
rand_list = np.random.choice(all_positions, size=int(n_rows * ratio), replace=False)
#remaining_list holds every row position that was NOT drawn into rand_list
remaining_list = np.setdiff1d(all_positions, rand_list)
#select the rows by position; copy() gives independent dataframes that can safely be modified below
x_train = water_df.iloc[rand_list].copy()
x_test = water_df.iloc[remaining_list].copy()
#move the target column (Potability) out of the feature frames into y_train and y_test
y_train = x_train.pop('Potability')
y_test = x_test.pop('Potability')

#prior probability of a class = share of the training labels that belong to that class
potable_count = sum(1 for label in y_train if label == 1)   #how many training labels are equal to 1
prior_1 = potable_count / len(y_train)                      #fraction of training labels equal to 1
prior_0 = 1 - prior_1                                       #the remaining fraction is assigned to class 0

#per-class statistics needed by the gaussian naive bayes formula
#first split x_train by class label, using y_train as a boolean mask
is_potable = y_train == 1
is_not_potable = y_train == 0
data_1 = x_train[is_potable]        #rows of x_train whose potability value is 1
data_0 = x_train[is_not_potable]    #rows of x_train whose potability value is 0
#then build one table per class: one row per feature, column 0 = mean, column 1 = standard deviation
class_1_mean, class_1_std = data_1.mean(), data_1.std()
class_0_mean, class_0_std = data_0.mean(), data_0.std()
mean_data_1 = pd.concat([class_1_mean, class_1_std], axis=1)
mean_data_0 = pd.concat([class_0_mean, class_0_std], axis=1)

def calculate_probability(x, mean, std):
    """Return the Gaussian (normal) probability density at x.

    Evaluates 1 / (sqrt(2*pi) * std) * exp(-(x - mean)**2 / (2 * std**2)).

    Args:
        x: Value at which the density is evaluated. May be a scalar or a
            NumPy array; arrays are evaluated elementwise.
        mean: Mean of the distribution (scalar, or broadcastable against x).
        std: Standard deviation of the distribution (scalar, or broadcastable
            against x). Must be non-zero, since it is used as a divisor.

    Returns:
        The density value; a scalar for scalar inputs, otherwise an array
        with the broadcast shape of the inputs.
    """
    #np.exp (unlike math.exp) also accepts arrays, so whole columns can be evaluated in one call
    exponent = np.exp(-((x - mean) ** 2) / (2 * (std ** 2)))
    return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

def predict_instance(row):
    """Predict the class (1 or 0) of a single sample with Gaussian Naive Bayes.

    For each class the score starts at the class prior (prior_1 / prior_0) and
    is multiplied by the Gaussian likelihood of every feature value, using the
    per-class mean (column 0) and standard deviation (column 1) stored in
    mean_data_1 and mean_data_0. The features visited are the columns of
    x_test.

    Args:
        row: One sample, indexable by feature name (e.g. a row produced by
            DataFrame.iterrows()).

    Returns:
        1 if the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (so a tie yields 0).
    """
    #both running scores start from the prior probability of their class
    score_class_1 = prior_1
    score_class_0 = prior_0
    for feature in x_test.columns:
        value = row[feature]    #the value of this feature for the given sample
        #multiply in the likelihood of the value under each class' gaussian
        score_class_1 *= calculate_probability(
            value, mean_data_1.loc[feature, 0], mean_data_1.loc[feature, 1])
        score_class_0 *= calculate_probability(
            value, mean_data_0.loc[feature, 0], mean_data_0.loc[feature, 1])
    #the class with the larger score is the prediction
    return 1 if score_class_1 > score_class_0 else 0

#classify every row of x_test with predict_instance and collect the results
#each entry of predictions is 1 (predicted potable) or 0 (predicted not potable)
predictions = []
for _, sample in x_test.iterrows():
    predictions.append(predict_instance(sample))

#count the correct predictions by walking through y_test and predictions side by side
sum_of_ones = sum(1 for actual, predicted in zip(y_test, predictions) if actual == predicted)

#accuracy = correct predictions / total predictions, multiplied by 100 to express it as a percentage
accuracy_percent = (sum_of_ones / len(y_test)) * 100.0
print("Accuracy Percentage:", accuracy_percent)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


Accuracy Percentage: 63.71951219512195
