# Processing

### Imports

In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Reading the csv file

In [27]:
cancer_data = pd.read_csv("survey lung cancer.csv")

#### Looking some rows of the dataset

In [28]:
cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


#### Size of the dataset (rows,columns)

In [29]:
cancer_data.shape

(309, 16)

#### Checking if there are some null values

In [30]:
cancer_data.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

#### Converting text to binary numbers

In [31]:
pd.set_option('mode.chained_assignment', None)
for col in cancer_data.select_dtypes('object'):
    cancer_data[col][cancer_data[col] =='M'] = 0
    cancer_data[col][cancer_data[col] == 'F'] = 1
    cancer_data[col][cancer_data[col] == 'NO'] = 0
    cancer_data[col][cancer_data[col] == 'YES'] = 1

# LOGISTIC REGRESSION (from scratch without library)

In [32]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [33]:
def cost_function(X,y,w,b):
    m=X.shape[0]
    cost = 0.
    for i in range(m):
        fwb = sigmoid(np.dot(w,X[i])+b)
        loss = -y[i]*np.log(fwb) - (1-y[i])*np.log(1-fwb)
        cost+=loss
    return cost / m

In [34]:
def gradient_calcul(X,y,w,b):
    m,n = X.shape
    djdb = 0.
    djdw = np.zeros((n,)) # for the number of features
    for i in range(m):
        fwb = sigmoid(np.dot(X[i],w)+b)
        diff = fwb-y[i]
        djdb += diff
        for j in range(n):
            djdw[j] += diff * X[i,j]

    return djdw/m,djdb/m



In [35]:
def gradient_descent(X,y,w_in,b_in,alpha,iters):
    m,n = X.shape
    w = w_in
    b=b_in
    print(f"Initial cost : {cost_function(X,y,w,b)}")
    for iteration in range(iters):
        djdw,djdb = gradient_calcul(X,y,w_in,b_in)
        b -= alpha*djdb
        w -= alpha * djdw
    print(f"Final cost : {cost_function(X,y,w,b)}")
    return w,b

In [36]:
X = cancer_data[['GENDER','AGE','SMOKING','YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','CHRONIC DISEASE','FATIGUE ','ALLERGY ','WHEEZING','ALCOHOL CONSUMING','COUGHING','SHORTNESS OF BREATH','SWALLOWING DIFFICULTY','CHEST PAIN']]
y = cancer_data['LUNG_CANCER'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=42) 
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

#### Cost at each step with the gradient descent

In [37]:
w_in = np.zeros((X.shape[1]))
b_in = 0.
alpha = 0.003
iters = 20000
w_final,b_final = gradient_descent(X_train,y_train,w_in,b_in,alpha,iters) # takes some time

Initial cost : 0.6931471805599466
Final cost : 0.3028796469997133


#### Final predictions on the test set

In [38]:
def final_prediction(X_test,y_test,w,b):
    m,n = X_test.shape
    p=np.zeros(m)
    print("Test on the test set")
    for i in range(m): # examples
        fwb = sigmoid(np.dot(X_test[i],w)+b)
        if(fwb<0.5): #threshold
            p[i]=0
        else:
            p[i]=1
    return (p==y_test).sum(),(p!=y_test).sum()
good_preds,bad_preds = final_prediction(X_test,y_test,w_final,b_final)
print(f"Good predictions : {good_preds}",f" Bad predictions : {bad_preds}")

Test on the test set
Good predictions : 58  Bad predictions : 4
