# Processing

### Imports

In [157]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


#### Reading the csv file

In [158]:
cancer_data = pd.read_csv("survey lung cancer.csv")

#### Looking some rows of the dataset

In [159]:
cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


#### Size of the dataset (rows,columns)

In [160]:
cancer_data.shape

(309, 16)

#### Checking if there are some null values

In [161]:
cancer_data.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

#### Converting text to binary numbers

In [162]:
pd.set_option('mode.chained_assignment', None)
for col in cancer_data.select_dtypes('object'):
    cancer_data[col][cancer_data[col] =='M'] = 0
    cancer_data[col][cancer_data[col] == 'F'] = 1
    cancer_data[col][cancer_data[col] == 'NO'] = 0
    cancer_data[col][cancer_data[col] == 'YES'] = 1
cancer_data

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,0,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,1,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,1,56,1,1,1,2,2,2,1,1,2,2,2,2,1,1
305,0,70,2,1,1,1,1,2,2,2,2,2,2,1,2,1
306,0,58,2,1,1,1,1,1,2,2,2,2,1,1,2,1
307,0,67,2,1,2,1,1,2,2,1,2,2,2,1,2,1


# LOGISTIC REGRESSION (from scratch without librairy)

In [163]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [164]:
def cost_function(X,y,w,b):
    m=X.shape[0]
    cost = 0.
    for i in range(m):
        fwb = sigmoid(np.dot(w,X[i])+b)
        loss = -y[i]*np.log(fwb) - (1-y[i])*np.log(1-fwb)
        cost+=loss
    return cost / m

In [165]:
def gradient_calcul(X,y,w,b):
    m,n = X.shape
    djdb = 0.
    djdw = np.zeros((n,)) # for the number of features
    for i in range(m):
        fwb = sigmoid(np.dot(X[i],w)+b)
        diff = fwb-y[i]
        djdb += diff
        for j in range(n):
            djdw[j] += diff * X[i,j]

    return djdw/m,djdb/m



In [166]:
def gradient_descent(X,y,w_in,b_in,alpha,iters):
    m,n = X.shape
    w = w_in
    b=b_in
    print(f"Initial cost : {cost_function(X,y,w,b)}")
    for iteration in range(iters):
        djdw,djdb = gradient_calcul(X,y,w_in,b_in)
        b -= alpha*djdb
        w -= alpha * djdw
        print(f"Cost : {cost_function(X,y,w,b)}")
    return w,b

In [167]:
X = cancer_data[['GENDER','AGE','SMOKING','YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','CHRONIC DISEASE','FATIGUE ','ALLERGY ','WHEEZING','ALCOHOL CONSUMING','COUGHING','SHORTNESS OF BREATH','SWALLOWING DIFFICULTY','CHEST PAIN']]
y = cancer_data['LUNG_CANCER'] 
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, random_state=42) 
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
X_test = X_test.to_numpy()
y_test = y_test.to_numpy()

#### Cost at each step with the gradient descent

In [168]:
w_in = np.zeros((X.shape[1]))
b_in = 0.
alpha = 0.003
iters = 20000
w_final,b_final = gradient_descent(X_train,y_train,w_in,b_in,alpha,iters) # takes some time

Initial cost : 0.6931471805599466
Cost : 0.6234005224343493
Cost : 0.4572469904310165
Cost : 0.4184944390898851
Cost : 0.4183535785964078
Cost : 0.4182756563690089
Cost : 0.4182157603104913
Cost : 0.4181578797629574
Cost : 0.41810106968284316
Cost : 0.4180442216915255
Cost : 0.4179875171209998
Cost : 0.41793081299274837
Cost : 0.41787416064478633
Cost : 0.4178175346363252
Cost : 0.4177609457970533
Cost : 0.41770438915990854
Cost : 0.4176478668916774
Cost : 0.4175913779892702
Cost : 0.4175349228685323
Cost : 0.41747850131220154
Cost : 0.41742211338595814
Cost : 0.4173657590291321
Cost : 0.4173094382375213
Cost : 0.4172531509817145
Cost : 0.4171968972435818
Cost : 0.4171406769999694
Cost : 0.4170844902299865
Cost : 0.4170283369117472
Cost : 0.41697221702382964
Cost : 0.41691613054462107
Cost : 0.41686007745261133
Cost : 0.41680405772626256
Cost : 0.41674807134406594
Cost : 0.41669211828451735
Cost : 0.41663619852612904
Cost : 0.4165803120474227
Cost : 0.41652445882693256
Cost : 0.4164686

#### Final predictions on the test set

In [169]:
def final_prediction(X_test,y_test,w,b):
    m,n = X_test.shape
    p=np.zeros(m)
    print("Test on the test set")
    for i in range(m): # examples
        fwb = sigmoid(np.dot(X_test[i],w)+b)
        if(fwb<0.5): #threshold
            p[i]=0
        else:
            p[i]=1
    print(f"Good cancer predictions : {(p==y_test).sum()}")
    print(f"Bad predictions : {(p!=y_test).sum()}")
final_prediction(X_test,y_test,w_final,b_final)

Test on the test set
Good cancer predictions : 58
Bad predictions : 4
