# logistic regression using numpy

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## data loading

In [41]:
df =  pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%203/data/ChurnData.csv")
df

Unnamed: 0,tenure,age,address,income,ed,employ,equip,callcard,wireless,longmon,...,pager,internet,callwait,confer,ebill,loglong,logtoll,lninc,custcat,churn
0,11.0,33.0,7.0,136.0,5.0,5.0,0.0,1.0,1.0,4.40,...,1.0,0.0,1.0,1.0,0.0,1.482,3.033,4.913,4.0,1.0
1,33.0,33.0,12.0,33.0,2.0,0.0,0.0,0.0,0.0,9.45,...,0.0,0.0,0.0,0.0,0.0,2.246,3.240,3.497,1.0,1.0
2,23.0,30.0,9.0,30.0,1.0,2.0,0.0,0.0,0.0,6.30,...,0.0,0.0,0.0,1.0,0.0,1.841,3.240,3.401,3.0,0.0
3,38.0,35.0,5.0,76.0,2.0,10.0,1.0,1.0,1.0,6.05,...,1.0,1.0,1.0,1.0,1.0,1.800,3.807,4.331,4.0,0.0
4,7.0,35.0,14.0,80.0,2.0,15.0,0.0,1.0,0.0,7.10,...,0.0,0.0,1.0,1.0,0.0,1.960,3.091,4.382,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,55.0,44.0,24.0,83.0,1.0,23.0,0.0,1.0,0.0,17.35,...,0.0,0.0,0.0,1.0,0.0,2.854,3.199,4.419,3.0,0.0
196,34.0,23.0,3.0,24.0,1.0,7.0,0.0,1.0,0.0,6.00,...,0.0,0.0,1.0,1.0,0.0,1.792,3.332,3.178,3.0,0.0
197,6.0,32.0,10.0,47.0,1.0,10.0,0.0,1.0,0.0,3.85,...,0.0,0.0,1.0,1.0,0.0,1.348,3.168,3.850,3.0,0.0
198,24.0,30.0,0.0,25.0,4.0,5.0,0.0,1.0,1.0,8.70,...,1.0,1.0,1.0,1.0,1.0,2.163,3.866,3.219,4.0,1.0


## data cleaning

now we will clean the data of any useless features or just choose the useful ones

after observing the data let us assume that the fields we use are 'tenure', 'age', 'address', 'income', 'ed', 'employ', 'equip' and of course the target ('churn').

In [42]:
df = df[["tenure", "age", "address", "income", "ed", "employ", "equip", "churn"]]

while observing the data notice that the churn is not (int) but float instead so now we fix this 

In [43]:
df['churn'] = df['churn'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['churn'] = df['churn'].astype('int')


## data preprocessing

In [44]:
# slicing the data into X and Y
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

It is also a norm to standardize or normalize the dataset in order to have all the features at the same scale. This helps the model learn faster and improves the model performance.

In [45]:
# standard scaler formula = (X - mean of the feature) / the standard deviation  

def standard_scaler(X):
    # calculating the mean and standard derivation
    mean = np.mean(X,axis=0)
    std = np.std(X,axis=0)

    # Handle the case where std might be 0 to avoid division by zero
    std_replaced = np.where(std == 0, 1, std)

    X_scaled = (X - mean) / std_replaced

    return X_scaled, mean, std

X_scaled, mu, sigma = standard_scaler(X)

In [46]:
print(f"X shape: {X_scaled.shape}")
print(f"y shape: {y.shape}")

X shape: (200, 7)
y shape: (200,)


notice that the data y is 1d array which is wrong as explained in the logistic regression article it must be a matrix of 1 column.

In [47]:
y = y.reshape(-1, 1) # Changes shape from (1067,) to (1067, 1)

now looking at the X matrix in the mathematical article we notice that the input data has an additional column of ones because of the $X_0$ = 1 in the linear equation

$$f(x) = \theta_0 + (x_1 * \theta_1) + (x_2 * \theta_2)$$

reminder:
$$
X =
\begin{bmatrix}
1 & x_1^{(1)} & \cdots & x_i^{(1)} \\
1 & x_1^{(2)} & \cdots & x_i^{(2)} \\
\vdots & \vdots &   & \vdots\\
1 & x_1^{(m)} & \cdots & x_i^{(m)}
\end{bmatrix}
$$

In [48]:
m = X_scaled.shape[0]
X_final = np.c_[np.ones(m), X_scaled] # Adds a column of 1s at the start

In [49]:
print(f"X shape: {X_final.shape}")
print(f"y shape: {y.shape}")

X shape: (200, 8)
y shape: (200, 1)


now that we are done the final part is to split the data

In [50]:
def train_test_split(X, Y, split):

    # creates a simple list of numbers from $0$ up to the number of samples you have.
    # Instead of moving the actual data around (which is heavy), we just shuffle these numbers. It’s much faster.
    indices = np.array(range(len(X)))

    # calculates how many rows will go into your "Training" bucket.
    train_size = round(split * len(X))

    # Randomize the "ID cards"
    np.random.shuffle(indices)

    # assigning the needed indices for training and the rest for testing 
    train_indices = indices[0:train_size]
    test_indices = indices[train_size:len(X)]

    # extracting the data for the X_train, X_test, Y_train and Y_test
    X_train = X[train_indices, :]
    X_test = X[test_indices, :]
    Y_train = Y[train_indices, :]
    Y_test = Y[test_indices, :]
    
    return X_train, Y_train, X_test, Y_test

X_train, Y_train, X_test, Y_test = train_test_split(X_final, y, .8)

## model training

In [51]:
# defining the sigmoid function
def sigmoid(z): 
	return 1 / (1 + np.exp(-z))

In [52]:
class Logistic_regression:
    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.theta = None
        self.cost_history = []

    def fit(self, x, y):
        n_samples, n_features = x.shape
        self.theta = np.random.rand(n_features, 1)

        for i in range(self.n_iters):
            # 1. Calculate z
            z = np.dot(x, self.theta)

            # 2. Apply Sigmoid 
            predictions = sigmoid(z)
            
            # 3. Calculate errors
            error = predictions - y
            
            # 4. Calculate gradient (Exactly as you derived it!)
            gradient = (1/n_samples) * np.dot(x.T, error)
            
            # 5. Update theta
            self.theta -= (self.lr * gradient)

            # 6. Calculate Log Loss Cost (Translating your math to code)
            epsilon = 1e-9
            cost = (-1 / n_samples) * np.sum(y * np.log(predictions + epsilon) + (1 - y) * np.log(1 - predictions + epsilon))
            self.cost_history.append(cost)

    def predict(self, X):
        z = np.dot(X, self.theta)
        return np.round(sigmoid(z))

In [53]:
model = Logistic_regression(.01, 1000)
model.fit(X_train, Y_train)

In [57]:
prediction = model.predict(X_test)
prediction[0:20]

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.]])