In [16]:
import numpy as np
import pandas as pd

In [30]:
# Read the survey CSV into a DataFrame; the bare name on the last line makes
# the notebook render a preview of the frame.
df = pd.read_csv(filepath_or_buffer='survey lung cancer.csv')
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,1,2,2,2,1,1,2,2,2,2,1,YES
305,M,70,2,1,1,1,1,2,2,2,2,2,2,1,2,YES
306,M,58,2,1,1,1,1,1,2,2,2,2,1,1,2,YES
307,M,67,2,1,2,1,1,2,2,1,2,2,2,1,2,YES


In [31]:
# Print the frame summary: number of entries plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [32]:
# Recode the categorical survey answers as 0/1 integers.
# NOTE: this cell rewrites `df` in place and is not idempotent -- running it a
# second time without reloading the CSV turns every recoded value into 0,
# because the already-encoded values no longer match 'YES' / 2 / 'M'.

# Target: 'YES' -> 1, anything else -> 0.
df['LUNG_CANCER'] = df['LUNG_CANCER'].eq('YES').astype('int64')

# Answers stored as 1/2: 2 -> 1, anything else -> 0.
# NOTE(review): only these four columns are recoded; the remaining 1/2-valued
# columns (CHRONIC DISEASE, FATIGUE, ...) keep their original coding -- confirm
# that this is intentional.
yes_no_columns = ['SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE']
for column in yes_no_columns:
    df[column] = df[column].eq(2).astype('int64')

# Gender: 'M' -> 1, anything else -> 0.
df['GENDER'] = df['GENDER'].eq('M').astype('int64')
df

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,0,1,1,0,1,2,1,2,2,2,2,2,2,1
1,1,74,1,0,0,0,2,2,2,1,1,1,2,2,2,1
2,0,59,0,0,0,1,1,2,1,2,1,2,2,1,2,0
3,1,63,1,1,1,0,1,1,1,1,2,1,1,2,2,0
4,0,63,0,1,0,0,1,1,1,2,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0,56,0,0,0,1,2,2,1,1,2,2,2,2,1,1
305,1,70,1,0,0,0,1,2,2,2,2,2,2,1,2,1
306,1,58,1,0,0,0,1,1,2,2,2,2,1,1,2,1
307,1,67,1,0,1,0,1,2,2,1,2,2,2,1,2,1


In [33]:
# Ordered hold-out split: the first `split_at` rows train the model and the
# remaining rows evaluate it.
# Positional `.iloc` slices are end-exclusive, so the two partitions are
# disjoint.  Label-based `.loc[:247]` / `.loc[247:]` slices include BOTH
# endpoints, which would place row 247 in the training set and the test set at
# the same time (test-row leakage).
split_at = 247

train_data = df.iloc[:split_at]
X_train = train_data.drop(["LUNG_CANCER"], axis=1).values
y_train = train_data["LUNG_CANCER"].values

test_data = df.iloc[split_at:]
X_test = test_data.drop(["LUNG_CANCER"], axis=1).values
y_test = test_data["LUNG_CANCER"].values

# Every row must land in exactly one partition.
assert len(train_data) + len(test_data) == len(df)

# From Scratch

In [38]:
import numpy as np

class LogisticRegressions:
    """Binary logistic regression fitted with batch gradient descent.

    The linear score is ``X @ theta`` with no separate intercept term; append
    a column of ones to ``X`` if a bias is wanted.

    Parameters
    ----------
    alpha : float, default 0.01
        Learning rate used by :meth:`gradient_descent` unless overridden there.
    num_iter : int, default 1500
        Number of gradient steps used by :meth:`gradient_descent` unless
        overridden there.

    Attributes
    ----------
    theta : numpy.ndarray or None
        Coefficient vector of length ``n_features``; ``None`` until the first
        call to :meth:`gradient_descent`.
    """

    def __init__(self, alpha=0.01, num_iter=1500):
        self.alpha = alpha
        self.num_iter = num_iter
        self.theta = None

    def hypothesis(self, X):
        """Return the linear scores ``X @ theta`` as a 1-D float array.

        Parameters
        ----------
        X : array-like of shape (m, n)

        Raises
        ------
        TypeError
            If ``theta`` has not been set yet.
        """
        if self.theta is None:
            # TypeError keeps the exception type raised when indexing a None
            # theta, but with a message that explains the cause.
            raise TypeError("theta is not set; call gradient_descent() first")
        return np.asarray(X, dtype=float) @ self.theta

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Only ``exp(-|z|)`` is evaluated, so large-magnitude scores (easy to hit
        with unscaled features) cannot overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always within (0, 1]
        out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        # `[()]` turns a 0-d result back into a scalar and leaves arrays as-is.
        return out[()]

    def cost(self, X, y):
        """Return the mean binary cross-entropy of the current ``theta``.

        Probabilities are clipped to ``[1e-15, 1 - 1e-15]`` so that ``log``
        never receives 0.
        """
        y = np.asarray(y, dtype=float).ravel()
        epsilon = 1e-15
        g_x = np.clip(self.sigmoid(self.hypothesis(X)), epsilon, 1 - epsilon)
        return -np.mean(y * np.log(g_x) + (1 - y) * np.log(1 - g_x))

    def gradient(self, X, y):
        """Return the gradient of the mean cross-entropy w.r.t. ``theta``.

        Computed as ``X.T @ (sigmoid(X @ theta) - y) / m``; the result has
        shape ``(n,)``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        residual = self.sigmoid(self.hypothesis(X)) - y
        return X.T @ residual / len(y)

    def gradient_descent(self, X, y, alpha=None, num_iter=None):
        """Fit ``theta`` by batch gradient descent.

        Parameters
        ----------
        X : array-like of shape (m, n)
        y : array-like of shape (m,)
            Labels in {0, 1}.
        alpha : float, optional
            Learning rate for this call; defaults to ``self.alpha``.
        num_iter : int, optional
            Number of steps for this call; defaults to ``self.num_iter``.

        Returns
        -------
        theta : numpy.ndarray
            The fitted coefficients (the same object as ``self.theta``).
        cost_history : list of float
            Cost evaluated after each update.

        Notes
        -----
        ``theta`` is initialised to zeros only when it is still ``None``, so
        calling this method again continues from the current coefficients.
        """
        # Fall back to the constructor settings so that values passed to
        # __init__ take effect, while explicit arguments still override them.
        alpha = self.alpha if alpha is None else alpha
        num_iter = self.num_iter if num_iter is None else num_iter

        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        if self.theta is None:
            self.theta = np.zeros(X.shape[1])

        cost_history = []
        for _ in range(num_iter):
            self.theta -= alpha * self.gradient(X, y)
            cost_history.append(self.cost(X, y))

        return self.theta, cost_history

    def predict(self, X):
        """Return 0/1 class labels, predicting 1 where the probability is >= 0.5."""
        g_x = self.sigmoid(self.hypothesis(X))
        return np.where(g_x >= 0.5, 1, 0)

# Fit the from-scratch model on the training split, then score it on the
# held-out split; accuracy is the fraction of test labels predicted correctly.
model = LogisticRegressions(alpha=0.01, num_iter=1500)
model.gradient_descent(X=X_train, y=y_train)
y_pred = model.predict(X=X_test)
accuracy = np.mean(y_pred == y_test)
print("Model accuracy:", accuracy)

Model accuracy: 0.8387096774193549
