# Implement Logistic Regression Algorithm on the given dataset

In [173]:
#Importing libraries
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt

In [174]:
# Reading the .CSV file, delete 2 columns from the file, checking first few rows of the file

from google.colab import files

uploaded = files.upload()
data = pd.read_csv(io.BytesIO(uploaded['BuyComputer.csv']))
data.drop(columns=['User ID',],axis=1,inplace=True)
data.head()

Saving BuyComputer.csv to BuyComputer (3).csv


Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


In [175]:
#Declare label as last column in the source file

labels = data.iloc[:, -1:]
labels

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [176]:
#Declaring X as all columns excluding last

X = data.iloc[:, :-1]
X

Unnamed: 0,Age,EstimatedSalary
0,19,19000
1,35,20000
2,26,43000
3,27,57000
4,19,76000
...,...,...
395,46,41000
396,51,23000
397,50,20000
398,36,33000


In [177]:
# Number of samples in the dataset

print("Total number of rows : ", X.shape[0])

Total number of rows :  400


In [178]:
# Splitting data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=77)

In [179]:
# Total rows in training dataset

print("Total number of rows in training dataset : ", X_train.shape[0])

Total number of rows in training dataset :  320


In [180]:
# Total rows in testing dataset

print("Total number of rows in testing dataset : ", X_test.shape[0])

Total number of rows in testing dataset :  80


In [181]:
# Sacaling data

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [182]:
#Variabes to calculate sigmoid function

y_pred = []
len_x = len(X_train[0])
w = []
b = 0.2
print(len_x)

2


In [183]:
entries = len(X_train[:,0])
entries

320

In [184]:
# Sigmoid function
def sigmoid(z):
  return (1/ (1+np.exp(-z)))

In [185]:
#Loss function
def loss_func(y_true, y_pred):
  y_true = np.array(y_true)
  y_pred = np.array(y_pred)
  log_loss = -1 * np.mean( y_true*np.log10(y_pred) + (1-y_true)*np.log10(1-y_pred)) 
  return log_loss

In [186]:
def gradients(X, y, y_hat):
    m = X.shape[0]
    dw = (1/m)*np.dot(X.T, (y_hat - y))
    db = (1/m)*np.sum((y_hat - y)) 
    return dw, db

In [187]:
def normalize(X):
  m, n = X.shape
  for i in range(n):
    X = (X - X.mean(axis = 0)) / X.std(axis = 0)
  return X

In [188]:
def train(X, y, bs, epochs, lr):
    m, n = X.shape
    w = np.zeros((n,1))
    b = 0.2

    y = y.values.reshape(m,1)
    x = normalize(X)

    losses = []

    for epoch in range(epochs):
        for i in range((m-1)//bs + 1):
            start_i = i*bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr*dw
            b -= lr*db
        l = loss_func(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)
    return w, b, losses

In [189]:
def prediction_function(inputs):
  x = normalize(inputs)
  preds = sigmoid(np.dot(inputs, w) + b)
  pred_class = []    
  pred_class = [1 if i > 0.5 else 0 for i in preds]  
  return np.array(pred_class)

In [190]:
#Repeating the process 3000 times

w, b, l = train(X_train, y_train, bs=100, epochs=3000, lr=0.01)

In [191]:
#Print weight

w

array([[2.49942443],
       [1.38806085]])

In [192]:
#print bias

b

-1.242689849546466

In [193]:
#predicting the label

from sklearn.metrics import accuracy_score

y_predict = prediction_function(X_test)
print(accuracy_score(y_test, y_predict))

0.8625


In [194]:
#print actual and predicted values in a table

y_predict = pd.DataFrame(data = y_predict, columns = ['Predicted'])


result = []

for i in range(len(y_test)):
  result.append([y_test._get_value(i, 0, takeable = True), y_predict._get_value(i, 0, takeable = True)])

result = pd.DataFrame(result, columns =['Actual', 'Predicted'])
result

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,0,0
3,1,1
4,0,0
...,...,...
75,0,0
76,1,1
77,0,0
78,0,0


# Using Sklearn Logistic Regression Model

In [195]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(random_state = 0)

In [196]:
# Splitting data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size = 0.2, random_state=77)

In [197]:
# Sacaling data

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [198]:
clf = LR.fit(X_train,y_train.values.ravel())

In [199]:
y_pred = clf.predict(X_test)
print(y_pred)

[1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0
 1 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0
 0 0 1 0 0 1]


In [200]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred))

0.85


In [201]:
#print actual and predicted values in a table

y_predict = pd.DataFrame(data = y_pred, columns = ['Predicted'])


result = []

for i in range(len(y_test)):
  result.append([y_test._get_value(i, 0, takeable = True), y_predict._get_value(i, 0, takeable = True)])

result = pd.DataFrame(result, columns =['Actual', 'Predicted'])
result

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,0,0
3,1,1
4,0,0
...,...,...
75,0,0
76,1,1
77,0,0
78,0,0
