In [None]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as pp
from sklearn.model_selection import train_test_split

In [None]:
# 1: Load the data set; the CSV separates fields with ';' instead of ','
df = pd.read_csv('divorce.csv', sep=';')
df.head()

Unnamed: 0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,Atr10,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,2,2,4,1,0,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,4,4,4,4,4,0,0,4,4,4,...,2,2,3,4,4,4,4,2,2,1
2,2,2,2,2,1,3,2,1,1,2,...,3,2,3,1,1,1,2,2,2,1
3,3,2,3,2,3,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,2,2,1,1,1,1,0,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [None]:
# Count missing cells over the whole frame; anything above 0 means rows or
# columns need cleaning before modelling
df.isna().to_numpy().sum()

0

In [None]:
# 2: Add column at position 0 with all values=1 (pandas.DataFrame.insert function).
#    This is for input to the bias.
#    DataFrame.insert mutates df in place and raises ValueError when the column
#    already exists, so the guard keeps this cell safe to run more than once.
if 'b0' not in df.columns:
    df.insert(0, 'b0', 1)
df.head()

Unnamed: 0,b0,Atr1,Atr2,Atr3,Atr4,Atr5,Atr6,Atr7,Atr8,Atr9,...,Atr46,Atr47,Atr48,Atr49,Atr50,Atr51,Atr52,Atr53,Atr54,Class
0,1,2,2,4,1,0,0,0,0,0,...,2,1,3,3,3,2,3,2,1,1
1,1,4,4,4,4,4,0,0,4,4,...,2,2,3,4,4,4,4,2,2,1
2,1,2,2,2,2,1,3,2,1,1,...,3,2,3,1,1,1,2,2,2,1
3,1,3,2,3,2,3,3,3,3,3,...,2,2,3,3,3,3,2,2,2,1
4,1,2,2,1,1,1,1,0,0,0,...,2,1,2,3,2,2,2,1,0,1


In [None]:
# 3: Define X matrix (independent features) and y vector (target feature) as numpy arrays.
#    The target is picked by its column name ('Class') instead of a hard-coded
#    position, so it cannot end up inside X when the column layout differs
#    (for example if the bias column has not been inserted).
X = np.array(df.drop(columns='Class'))
y = np.array(df['Class'])
# 4: Print the shape and datatype of both X and y
print("Shape of input matrix X:", X.shape)
print('dtype of input matrix X:', X.dtype)
print("Shape of target vector y:", y.shape)
print('dtype of target vector y:', y.dtype)

Shape of input matrix X: (170, 55)
dtype of input matrix X: int64
Shape of target vector y: (170,)
dtype of target vector y: int64


In [None]:
# 5: Split the dataset into 85% for training and rest 15% for testing.
#    A fixed random_state makes the shuffle deterministic, so the split (and
#    every score computed from it) is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# 6: Follow logistic regression class and fill code where highlighted:
# 7: Write sigmoid function to predict probabilities
# 8: Write log likelihood function
# 9: Write fit function where gradient ascent on the log likelihood (equivalently, gradient descent on the negative log likelihood) is implemented
# 10: Write predict_proba function where we predict probabilities for input data
class MyLogisticRegression:
    """Binary logistic regression fitted by full-batch gradient descent.

    The model minimises the mean negative log-likelihood of 0/1 labels. No
    intercept is added internally; include a constant column in ``X`` if a
    bias term is wanted.

    Parameters
    ----------
    epochs : int, default=25
        Number of gradient steps taken by ``fit``.
    learning_rate : float, default=0.05
        Step size multiplied into the gradient on every step.
    random_state : None, int or numpy.random.Generator, default=None
        Source of randomness for the initial weights. ``None`` keeps the
        legacy behaviour of drawing from NumPy's global random state; any
        other value is handed to ``numpy.random.default_rng`` so repeated
        fits start from identical weights.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,)
        Learned coefficients; set by ``fit``.
    loss : list of float
        Mean negative log-likelihood measured at the start of each epoch;
        set by ``fit``.
    """

    def __init__(self, epochs=25, learning_rate=0.05, random_state=None):
        self.epochs = epochs
        self.lr = learning_rate
        self.random_state = random_state

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise."""
        # Clipping keeps exp() inside the float64 range; beyond +/-500 the
        # result is already indistinguishable from 0 or 1.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def log_likelihood(self, X, y, weights):
        """Return the mean negative log-likelihood of ``y`` given ``weights``.

        Despite the method name the sign is flipped, so the value is a loss
        to be minimised. The result is a single scalar averaged over the rows
        of ``X``.
        """
        z = np.dot(X, weights)
        # log(sigmoid(z)) = -log(1 + e^-z) and log(1 - sigmoid(z)) = -log(1 + e^z);
        # logaddexp evaluates both without hitting log(0) when the sigmoid saturates.
        per_sample = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
        return np.sum(per_sample) / len(X)

    def fit(self, X, y):
        """Learn ``self.weights`` from feature matrix ``X`` and 0/1 labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from
            ``len(y)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) == 0:
            raise ValueError("Cannot fit on an empty data set")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        N, n_features = X.shape
        if self.random_state is None:
            # Legacy path: honours a global np.random.seed() set by the caller.
            weights = np.random.rand(n_features)
        else:
            weights = np.random.default_rng(self.random_state).random(n_features)

        self.loss = []
        for _ in range(self.epochs):
            # Gradient Descent on the mean negative log-likelihood
            y_hat = self.sigmoid(np.dot(X, weights))
            self.loss.append(float(self.log_likelihood(X, y, weights)))
            gradient = np.dot(X.T, y_hat - y) / N
            weights -= self.lr * gradient
        self.weights = weights

    def predict_proba(self, X):
        """Return hard 0/1 class labels for the rows of ``X``.

        Note that, despite its name, this method does not return
        probabilities: the predicted probability is thresholded at 0.5 and a
        row is labelled 1 only when its probability is strictly greater.
        Requires ``fit`` to have been called so that ``self.weights`` exists.
        """
        z = np.dot(X, self.weights)
        return (self.sigmoid(z) > 0.5).astype(int)
        

In [None]:
# 11: Fit the model on the training split, then label the held-out test rows
model = MyLogisticRegression(learning_rate=0.05, epochs=50)
model.fit(X=X_train, y=y_train)
y_pred = model.predict_proba(X=X_test)

In [None]:
def accuracy(y_true, y_hat):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels.
    y_hat : sequence
        Predicted labels, compared position by position with ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by the number of predictions.

    Raises
    ------
    ValueError
        If the two inputs differ in length or are empty.
    """
    # Raise instead of returning an error string: callers do arithmetic on the
    # result, and a string would only fail later with an unrelated message.
    if len(y_true) != len(y_hat):
        raise ValueError(
            f"Error, size mismatch: len(y_true)={len(y_true)}, len(y_hat)={len(y_hat)}"
        )
    if len(y_hat) == 0:
        raise ValueError("Cannot compute accuracy of empty input")
    matches = sum(1 for truth, guess in zip(y_true, y_hat) if guess == truth)
    return matches / len(y_hat)

In [None]:
# Report accuracy on the held-out test split first, then on the training split
print(
    "The accuracy of our model is {:.3f}% (rounded to three decimal places)".format(
        accuracy(y_test, y_pred) * 100
    )
)
print(
    "The accuracy of our model on training data is {:.3f}% (rounded to three decimal places)".format(
        accuracy(y_train, model.predict_proba(X_train)) * 100
    )
)

The accuracy of our model is 100.000% (rounded to three decimal places)
The accuracy of our model on training data is 95.833% (rounded to three decimal places)
