# Lung Cancer Prediction
<hr>

### Lung Cancer Prediction using Random Forest

In [None]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn import linear_model
from sklearn import ensemble
from sklearn import svm
from sklearn.model_selection import train_test_split

#### Preprocessing the dataset

In [None]:
# Read the lung cancer CSV into a pandas DataFrame
data_set = pd.read_csv(filepath_or_buffer='./dataset/lung-cancer-dataset.csv')

In [None]:
# Preview the first five rows to confirm the dataset was loaded
data_set.head(5)

In [None]:
# Print pandas' summary of the DataFrame (DataFrame.info)
data_set.info()

In [None]:
# Count the missing (null) values in each column
data_set.isna().sum()

In [None]:
# Add numeric versions of the two text columns, keeping one fitted encoder per column.

# GENDER -> GENDER_N
labeled_gender = LabelEncoder()
data_set['GENDER_N'] = labeled_gender.fit_transform(data_set['GENDER'])

# LUNG_CANCER -> LUNG_CANCER_N
labeled_lung_cancer = LabelEncoder()
data_set['LUNG_CANCER_N'] = labeled_lung_cancer.fit_transform(data_set['LUNG_CANCER'])

In [None]:
# Preview the rows again, now including the GENDER_N and LUNG_CANCER_N columns
data_set.head()

In [None]:
# The raw GENDER and LUNG_CANCER columns hold strings; drop them and keep the encoded *_N columns
updated_data_set = data_set.drop(columns=['GENDER', 'LUNG_CANCER'])

# Write the updated table to a new CSV, without the index column
updated_data_set.to_csv(
    path_or_buf='./dataset/lung-cancer-dataset-updated.csv',
    index=False,
)
updated_data_set.head(n=5)

#### Data analysis & visualization

In [None]:
# Bar chart of how many rows fall under each SMOKING value
sbn.countplot(data=updated_data_set, x='SMOKING')

In [None]:
# Bar chart of how many rows fall under each encoded lung cancer value
sbn.countplot(data=updated_data_set, x='LUNG_CANCER_N')

In [None]:
# Pearson correlation of each column with the encoded target, shown as a bar chart
correlation = updated_data_set.corrwith(
    other=updated_data_set['LUNG_CANCER_N'],
    method='pearson',
)
correlation.plot.bar()
plt.show()

In [None]:
# Violin plot of SMOKING for each encoded target value, on a 10x10 figure
plt.figure(figsize=(10, 10))
sbn.violinplot(data=updated_data_set, x='LUNG_CANCER_N', y='SMOKING')

In [None]:
# Swarm plot of AGE for each encoded target value, on a 10x10 figure
plt.figure(figsize=(10, 10))
sbn.swarmplot(data=updated_data_set, x='LUNG_CANCER_N', y='AGE')

In [None]:
# One histogram per column on a 20x25 figure.
# pandas builds the subplot grid itself here: handing DataFrame.hist a single
# Axes for a multi-column frame makes it emit a UserWarning and clear the
# figure before drawing, so the size is passed through ``figsize`` instead.
updated_data_set.hist(figsize=(20, 25))

In [None]:
# Partition the rows by encoded target value: 0 in lc0_data_set, 1 in lc1_data_set
lc0_data_set = updated_data_set.loc[updated_data_set['LUNG_CANCER_N'] == 0]
lc1_data_set = updated_data_set.loc[updated_data_set['LUNG_CANCER_N'] == 1]

In [None]:
# AGE against SMOKING: red stars for target 0 rows, green dots for target 1 rows
plt.xlabel('Age')
plt.ylabel('Smoking')
plt.scatter(x=lc0_data_set['AGE'], y=lc0_data_set['SMOKING'], marker='*', color='red')
plt.scatter(x=lc1_data_set['AGE'], y=lc1_data_set['SMOKING'], marker='.', color='green')

In [None]:
# Features are every column except the encoded target; the target is that column alone
input_data_set = updated_data_set.drop(columns='LUNG_CANCER_N')
target_data_set = updated_data_set.loc[:, 'LUNG_CANCER_N']

In [None]:
# Display the feature table
input_data_set

In [None]:
# Display the target column
target_data_set

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(
    input_data_set,
    target_data_set,
    test_size=0.25,
    random_state=40,
)

In [None]:
# Random forest of 200 trees fitted on the training split.
# The seed is fixed (same value as the train/test split) so that re-running the
# notebook builds the same forest and reports the same score; without it every
# run trains a different model.
RFmodel = ensemble.RandomForestClassifier(n_estimators=200, random_state=40)
RFmodel.fit(X_train_data, y_train_data)

In [None]:
# Score the fitted model on the held-out test split (mean accuracy for scikit-learn classifiers)
RFmodel.score(X_test_data,y_test_data)

In [None]:
# Codes accepted for every yes/no question (the prompts use Yes=2, No=1).
_YES_NO_CODES = (1, 2)
# Codes accepted for the gender question (the prompt uses F=0, M=1).
_GENDER_CODES = (0, 1)

# (prompt, allowed codes) pairs, in the order the answers are handed to the model.
# ``None`` as the allowed codes means "any whole number greater than 0" (used for age).
_QUESTIONS = (
    ("Enter your AGE ?\n", None),
    ("Do you SMOKING ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have YELLOW FINGERS ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have ANXIETY ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have PEER PRESSURE ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have CHRONIC DISEASE ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have FATIGUE ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have ALLERGY ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have WHEEZING ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you drink too much alcohol ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have COUGHING ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have SHORTNESS OF BREATH ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have DIFFICULTY SWALLOWING ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Do you have CHEST PAIN ? Yes=2, No=1 \n", _YES_NO_CODES),
    ("Enter your GENDER ? F=0, M=1 \n", _GENDER_CODES),
)


def _ask_int(prompt, allowed, read):
    """Ask ``prompt`` through ``read`` until the reply is an acceptable integer.

    ``allowed`` is a container of accepted codes, or ``None`` to accept any
    whole number greater than 0. Rejected replies print a short message and
    the same question is asked again.
    """
    while True:
        reply = read(prompt)
        try:
            value = int(reply)
        except ValueError:
            print("Invalid input: please enter a whole number.")
            continue

        if allowed is None:
            if value > 0:
                return value
            print("Invalid input: please enter a number greater than 0.")
        elif value in allowed:
            return value
        else:
            choices = ", ".join(str(code) for code in allowed)
            print("Invalid input: please enter one of " + choices + ".")


def get_inputs(*, model=None, read=None):
    """Ask the questionnaire interactively and print the model's prediction.

    Every reply is converted to ``int`` and validated before it reaches the
    model; an invalid reply is rejected and the question repeated instead of
    being passed on as a raw string.

    Args:
        model: Object with a ``predict(rows)`` method. Defaults to the
            module-level ``RFmodel``.
        read: Callable taking the prompt and returning the reply text.
            Defaults to the built-in ``input``, looked up at call time.

    Returns:
        None. The outcome is printed.
    """
    if model is None:
        model = RFmodel
    if read is None:
        read = input

    print('Please enter your Information:')

    # NOTE(review): the answers are passed positionally, so the question order
    # must match the column order the model was trained on — confirm against
    # the training frame.
    answers = [_ask_int(prompt, allowed, read) for prompt, allowed in _QUESTIONS]

    # predict expects a 2-D input: a single row holding all the answers.
    result = model.predict([answers])

    print("\nResult")
    if result[0] == 1:
        print("You have lung cancer")
    else:
        print("You don't have lung cancer")
    

# Run the interactive questionnaire and print the model's prediction
get_inputs()
