In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # visualization
%matplotlib inline
import seaborn as sns
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [164]:
# Read the tips data from the local CSV file into a pandas DataFrame
tips_dataset = pd.read_csv('tips.csv')
# Preview the top five rows to sanity-check the columns that were parsed
tips_dataset.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [166]:
# number of rows and columns in this dataset, as a (rows, columns) tuple
tips_dataset.shape

(244, 7)

In [168]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
tips_dataset.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [170]:
# Split the frame into the prediction target ('size') and the feature
# columns (everything else).
Y = tips_dataset['size']
X = tips_dataset.drop(columns=['size'])
print(X)
print(Y)

     total_bill   tip     sex smoker   day    time
0         16.99  1.01  Female     No   Sun  Dinner
1         10.34  1.66    Male     No   Sun  Dinner
2         21.01  3.50    Male     No   Sun  Dinner
3         23.68  3.31    Male     No   Sun  Dinner
4         24.59  3.61  Female     No   Sun  Dinner
..          ...   ...     ...    ...   ...     ...
239       29.03  5.92    Male     No   Sat  Dinner
240       27.18  2.00  Female    Yes   Sat  Dinner
241       22.67  2.00    Male    Yes   Sat  Dinner
242       17.82  1.75    Male     No   Sat  Dinner
243       18.78  3.00  Female     No  Thur  Dinner

[244 rows x 6 columns]
0      2
1      3
2      3
3      2
4      4
      ..
239    3
240    2
241    2
242    2
243    2
Name: size, Length: 244, dtype: int64


In [172]:
# *** Handling Missing Values ***
# Rows with a missing value in any of these numeric feature columns are removed.
columns_with_missing_values = ['total_bill', 'tip']
print("\nHandling missing values in specific columns:")
for column in columns_with_missing_values:
    print(f"Number of missing values in '{column}' before handling: {X[column].isnull().sum()}")
    # Rebind X to a new frame instead of mutating in place, so the cell can be
    # re-run safely and earlier references to X are not silently changed.
    X = X.dropna(subset=[column])
    print(f"Number of missing values in '{column}' after handling: {X[column].isnull().sum()}")

# Drop the same rows from the labels. Without this, any row removed from X
# would leave X and Y with different lengths and train_test_split would fail.
Y = Y.loc[X.index]
assert len(X) == len(Y), "features and labels must stay row-aligned"


Handling missing values in specific columns:
Number of missing values in 'total_bill' before handling: 0
Number of missing values in 'total_bill' after handling: 0
Number of missing values in 'tip' before handling: 0
Number of missing values in 'tip' after handling: 0


In [174]:
# Define mappings for categorical features (string category -> integer code)
sex_mapping = {'Female': 0, 'Male': 1}
smoker_mapping = {'No': 0, 'Yes': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3}
time_mapping = {'Lunch': 0, 'Dinner': 1}

categorical_mappings = {
    'sex': sex_mapping,
    'smoker': smoker_mapping,
    'day': day_mapping,
    'time': time_mapping,
}

# Apply mappings to the DataFrame.
# The codes are always derived from the raw strings in `tips_dataset` (aligned
# on X's index) rather than from X itself. Mapping X onto itself would turn the
# already-encoded integers into NaN if this cell were executed a second time.
for column, mapping in categorical_mappings.items():
    raw_values = tips_dataset.loc[X.index, column]
    unmapped = raw_values[~raw_values.isin(list(mapping))]
    if not unmapped.empty:
        # Series.map would silently produce NaN for these; fail loudly instead.
        raise ValueError(
            f"Column '{column}' contains values without a mapping: "
            f"{sorted(unmapped.astype(str).unique())}"
        )
    X[column] = raw_values.map(mapping)

In [176]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
print(X.shape, X_train.shape, X_test.shape)


(244, 6) (195, 6) (49, 6)


In [178]:
# Build and fit a 1-nearest-neighbour classifier: every prediction is the
# label of the single closest training row.
classifier = KNeighborsClassifier(n_neighbors=1).fit(X_train, Y_train)

# Score the fitted model on the held-out test split
y_prediction = classifier.predict(X_test)
accuracy_test_data = accuracy_score(y_true=Y_test, y_pred=y_prediction)
print("Test data accuracy for n_neighbors=1:", accuracy_test_data)

Test data accuracy for n_neighbors=1: 0.5306122448979592


In [182]:

# Standardize the data: the scaler is fitted on the training split only and
# the same transformation is then applied to the test split.
# NOTE: this rebinds X_train / X_test to scaled NumPy arrays, so any cell
# executed after this one sees the scaled values, not the original frames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert Y to one-hot encoding for multi-class classification
class_labels = np.unique(Y)
n_classes = len(class_labels)
# Row `label - 1` of the identity matrix is the one-hot vector for `label`.
# That shift is only valid when the labels are exactly 1..n_classes: a label
# of 0 would silently wrap around to the last class and a gap in the labels
# would index out of bounds, so check the assumption explicitly.
if not np.array_equal(class_labels, np.arange(1, n_classes + 1)):
    raise ValueError(
        f"Expected class labels 1..{n_classes}, got {class_labels.tolist()}"
    )
identity = np.eye(n_classes)
Y_train_one_hot = identity[np.asarray(Y_train) - 1]
Y_test_one_hot = identity[np.asarray(Y_test) - 1]

# Define the neural network class (same as your previous code, ensure init is corrected to __init__)
class NeuralNetwork:
    """Feed-forward classifier with one sigmoid hidden layer and a softmax output.

    Training is full-batch gradient descent on the cross-entropy between the
    softmax output and one-hot targets.
    """

    def __init__(self, input_size, hidden_size, output_size, seed=None):
        """Create the weight matrices and zero biases.

        Parameters
        ----------
        input_size : int
            Number of input features.
        hidden_size : int
            Number of hidden units.
        output_size : int
            Number of output classes.
        seed : int or None, optional
            Seed for the weight initialisation. With the default ``None`` the
            global NumPy random state is used, so ``np.random.seed(...)``
            called beforehand still controls the result.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        # Small random weights break the symmetry between hidden units.
        self.weights_input_hidden = rng.randn(input_size, hidden_size) * 0.01
        self.bias_hidden = np.zeros((1, hidden_size))
        self.weights_hidden_output = rng.randn(hidden_size, output_size) * 0.01
        self.bias_output = np.zeros((1, output_size))

    def sigmoid(self, z):
        """Element-wise logistic function 1 / (1 + exp(-z))."""
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, z):
        """Derivative of the sigmoid expressed through its *output*.

        Despite the parameter name, ``z`` must already be a sigmoid activation
        (``backward`` passes ``hidden_layer_output``), not a pre-activation.
        """
        return z * (1 - z)

    def softmax(self, z):
        """Row-wise softmax; the row maximum is subtracted before exponentiating."""
        exp_values = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_values / np.sum(exp_values, axis=1, keepdims=True)

    def forward(self, X):
        """Run a forward pass, cache the intermediate activations, return class probabilities."""
        self.hidden_layer_input = np.dot(X, self.weights_input_hidden) + self.bias_hidden
        self.hidden_layer_output = self.sigmoid(self.hidden_layer_input)
        self.output_layer_input = np.dot(self.hidden_layer_output, self.weights_hidden_output) + self.bias_output
        self.output = self.softmax(self.output_layer_input)
        return self.output

    def backward(self, X, Y, learning_rate):
        """Apply one gradient-descent step using the activations cached by ``forward``.

        ``Y`` holds one-hot targets with the same shape as ``self.output``.
        """
        output_error = self.output - Y
        output_gradient = output_error / X.shape[0]
        # Must be computed before weights_hidden_output is updated below.
        hidden_error = np.dot(output_gradient, self.weights_hidden_output.T) * self.sigmoid_derivative(self.hidden_layer_output)
        self.weights_hidden_output -= learning_rate * np.dot(self.hidden_layer_output.T, output_gradient)
        self.bias_output -= learning_rate * np.sum(output_gradient, axis=0, keepdims=True)
        self.weights_input_hidden -= learning_rate * np.dot(X.T, hidden_error)
        self.bias_hidden -= learning_rate * np.sum(hidden_error, axis=0, keepdims=True)

    def cross_entropy(self, Y, eps=1e-12):
        """Mean cross-entropy between one-hot ``Y`` and the cached ``self.output``.

        Probabilities are clipped to ``[eps, 1]`` before the logarithm so that
        a probability that underflowed to exactly 0 yields a large finite
        value instead of ``nan`` / ``inf``.
        """
        probabilities = np.clip(self.output, eps, 1.0)
        return -np.sum(Y * np.log(probabilities)) / Y.shape[0]

    def train(self, X, Y, epochs, learning_rate, log_every=100):
        """Train for ``epochs`` full-batch steps.

        The loss is printed every ``log_every`` epochs; pass ``0`` or ``None``
        to disable logging. The printed loss is computed from the forward pass
        made *before* that epoch's weight update.
        """
        for epoch in range(epochs):
            self.forward(X)
            self.backward(X, Y, learning_rate)
            if log_every and epoch % log_every == 0:
                loss = self.cross_entropy(Y)
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, X):
        """Return the most probable class per row, numbered from 1."""
        probabilities = self.forward(X)
        return np.argmax(probabilities, axis=1) + 1  # Adjust to match target classes

# Initialize and train the neural network
# Seed NumPy's global RNG so the random weight initialisation, and therefore
# the loss curve and the final accuracy, is the same on every run.
np.random.seed(42)

# Initialize and train the neural network
nn = NeuralNetwork(input_size=X_train.shape[1], hidden_size=8, output_size=len(np.unique(Y)))
nn.train(X_train, Y_train_one_hot, epochs=1000, learning_rate=0.01)

# Predict and calculate accuracy (same metric helper as the KNN evaluation)
Y_pred = nn.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 0, Loss: 1.7847
Epoch 100, Loss: 1.2741
Epoch 200, Loss: 1.1480
Epoch 300, Loss: 1.1043
Epoch 400, Loss: 1.0800
Epoch 500, Loss: 1.0632
Epoch 600, Loss: 1.0504
Epoch 700, Loss: 1.0401
Epoch 800, Loss: 1.0314
Epoch 900, Loss: 1.0238
Test Accuracy: 61.22%
