Importing libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier


Data collection and analysis

In [None]:
# Read the lung-cancer survey CSV into a pandas DataFrame.
# NOTE(review): absolute path — looks like a Colab upload location; adjust when running elsewhere.
lung_data = pd.read_csv(filepath_or_buffer='/content/survey lung cancer.csv')

In [None]:
# Preview the first five records to check column names and how values are coded.
lung_data.head(n=5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple
lung_data.shape

(309, 16)

In [None]:
# Per-column summary: name, non-null count and dtype
lung_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
lung_data.isna().sum()

Unnamed: 0,0
GENDER,0
AGE,0
SMOKING,0
YELLOW_FINGERS,0
ANXIETY,0
PEER_PRESSURE,0
CHRONIC DISEASE,0
FATIGUE,0
ALLERGY,0
WHEEZING,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
lung_data.describe()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,62.673139,1.563107,1.569579,1.498382,1.501618,1.504854,1.673139,1.556634,1.556634,1.556634,1.579288,1.640777,1.469256,1.556634
std,8.210301,0.496806,0.495938,0.500808,0.500808,0.500787,0.469827,0.497588,0.497588,0.497588,0.494474,0.480551,0.499863,0.497588
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [None]:
# Class balance: how many respondents carry each LUNG_CANCER label.
lung_data.loc[:, 'LUNG_CANCER'].value_counts()

Unnamed: 0_level_0,count
LUNG_CANCER,Unnamed: 1_level_1
YES,270
NO,39


YES ----> Lung cancer Positive
NO -----> Lung cancer Negative


In [None]:
# Per-class feature profile: average of every numeric column within each LUNG_CANCER group.
# numeric_only=True leaves out the text GENDER column, for which a mean is undefined.
lung_data.groupby('LUNG_CANCER').mean(numeric_only=True)

Unnamed: 0_level_0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
LUNG_CANCER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
NO,60.74359,1.487179,1.333333,1.307692,1.25641,1.358974,1.487179,1.128205,1.230769,1.179487,1.25641,1.564103,1.128205,1.307692
YES,62.951852,1.574074,1.603704,1.525926,1.537037,1.525926,1.7,1.618519,1.603704,1.611111,1.625926,1.651852,1.518519,1.592593


Data preprocessing

Separating the features and target


In [None]:
# Feature frame: every column except the label.
X = lung_data.drop(columns=['LUNG_CANCER'])
# Target series: the YES/NO label.
Y = lung_data['LUNG_CANCER']

In [None]:
# Plain-text dump of the feature frame (pandas truncates the middle rows and wraps wide columns)
print(X)

    GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0        M   69        1               2        2              1   
1        M   74        2               1        1              1   
2        F   59        1               1        1              2   
3        M   63        2               2        2              1   
4        F   63        1               2        1              1   
..     ...  ...      ...             ...      ...            ...   
304      F   56        1               1        1              2   
305      M   70        2               1        1              1   
306      M   58        2               1        1              1   
307      M   67        2               1        2              1   
308      M   62        1               1        1              2   

     CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  \
0                  1         2         1         2                  2   
1                  2         2       

In [None]:
# Plain-text dump of the target labels
print(Y)

0      YES
1      YES
2       NO
3       NO
4       NO
      ... 
304    YES
305    YES
306    YES
307    YES
308    YES
Name: LUNG_CANCER, Length: 309, dtype: object


Splitting the data into training data and test data

In [None]:
# Stratified 80/20 hold-out split; random_state pins the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Addressing class imbalance with SMOTE.
# SMOTE interpolates between samples, so every feature must be numeric. At this point
# X_train still holds the text GENDER column, which makes fit_resample raise; encode it
# first (mirroring the get_dummies encoding applied before scaling).
X_train_encoded = pd.get_dummies(X_train, columns=['GENDER'], drop_first=True, dtype=int)

smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train_encoded, Y_train)
# NOTE(review): X_train_resampled / Y_train_resampled are not consumed by the SVM cells
# further down, which train on X_train with class_weight='balanced' instead.

In [None]:
# Sanity check: shapes of the full, training and test feature sets, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

(309, 15) (247, 15) (62, 15)


In [None]:
# Unfitted StandardScaler. NOTE: `scaler` is assigned again in the encoding cell below, just before it is fitted.
scaler = StandardScaler()

In [None]:
# One-hot encode GENDER. The encoded frame is rebuilt from lung_data rather than from X
# itself, so re-running this cell does not fail with a KeyError on an already-encoded X.
# drop_first=True keeps a single GENDER_M indicator and avoids a redundant, collinear column.
X = pd.get_dummies(lung_data.drop(columns=['LUNG_CANCER']), columns=['GENDER'], drop_first=True)

# Re-split so the train/test frames carry the encoded column; same test size, seed and
# stratification as the first split.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Fit the scaler on the training portion only, so test-set statistics do not leak into it.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Standardise both splits with the statistics learned from the training data.
# The results are NumPy arrays that replace the DataFrames of the same name.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Inspect the standardised training matrix
print(X_train)

[[ 0.93843316 -1.18239513 -1.13447655 ... -0.9258201   0.91084007
   0.9486833 ]
 [ 1.45525142 -1.18239513  0.88146379 ... -0.9258201   0.91084007
   0.9486833 ]
 [-0.61202163  0.84574096 -1.13447655 ... -0.9258201   0.91084007
   0.9486833 ]
 ...
 [ 1.84286512 -1.18239513  0.88146379 ... -0.9258201  -1.09788758
   0.9486833 ]
 [ 2.10127425  0.84574096 -1.13447655 ...  1.08012345  0.91084007
   0.9486833 ]
 [-1.12883989  0.84574096 -1.13447655 ...  1.08012345  0.91084007
  -1.05409255]]


Model training

Support vector machine model

In [None]:
# Linear-kernel SVM classifier. class_weight='balanced' asks scikit-learn to weight classes
# inversely to their frequency, which matters here given the 270 YES vs 39 NO label split.
model = svm.SVC(kernel='linear', class_weight='balanced')

In [None]:
# Train the SVM on the standardised training features and their YES/NO labels
model.fit(X_train, Y_train)

Model Evaluation

Accuracy score

In [None]:
# Accuracy on the training data.
# accuracy_score takes (y_true, y_pred) in that order. Accuracy itself is symmetric,
# but the documented order keeps this safe if another metric is swapped in.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the training-set accuracy
print('Accuracy score of training data : ', training_data_accuracy)

Accuracy score of training data :  0.9392712550607287


In [None]:
# Accuracy on the held-out test data.
# accuracy_score takes (y_true, y_pred) in that order.
# NOTE(review): about 87% of the labels are YES (270 of 309), so accuracy alone says little
# about how well the NO class is recognised; per-class metrics would be more informative.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the test-set accuracy
print('Accuracy score of test data : ', test_data_accuracy)

Accuracy score of test data :  0.9354838709677419


Building a predictive system


In [None]:
# Classify one hand-entered survey record with the scaler and SVM fitted in the
# training cells above (the model is already fitted, so it is not re-fitted here).
# Field order follows the feature columns of lung_data: GENDER first, then AGE and
# the remaining survey answers.
input_data = ('M,58,2,1,1,1,1,2,2,2,2,2,2,1,2')

# Value of the GENDER_M indicator for each gender. This matches the training features,
# where get_dummies(drop_first=True) drops the 'F' level and keeps 'M'.
gender_mapping = {'M': 1, 'F': 0}

# Split the comma-separated record into its individual fields
input_data_list = input_data.split(',')

# One-row DataFrame labelled with the raw feature columns (GENDER included)
input_df = pd.DataFrame([input_data_list], columns=lung_data.drop(columns=['LUNG_CANCER']).columns)

# Encode gender through the explicit mapping. get_dummies(drop_first=True) on a single
# row sees only one category and drops it, so every record, including 'M', would end up
# with GENDER_M = 0.
gender_value = input_df.at[0, 'GENDER']
if gender_value not in gender_mapping:
    raise ValueError(
        f"Unknown GENDER value {gender_value!r}; expected one of {sorted(gender_mapping)}"
    )
input_df['GENDER_M'] = gender_mapping[gender_value]

# Drop the raw text column and convert the string fields to numbers
input_df = input_df.drop(columns=['GENDER']).astype(float)

# Column order the scaler was fitted with: numeric survey columns first, GENDER_M last
X_train_columns = lung_data.drop(columns=['LUNG_CANCER', 'GENDER']).columns.tolist() + ['GENDER_M']
input_df = input_df[X_train_columns]

# Scale using the same fitted scaler as the training data
std_data = scaler.transform(input_df)

prediction = model.predict(std_data)
print("prediction:", prediction)

# Signed distance from the SVM decision boundary: a confidence-like score, not a
# probability (SVC only exposes probabilities when built with probability=True).
prediction_probability = model.decision_function(std_data)


if prediction[0] == 'NO':
    print('The person does not have lung cancer')
else:
    print('The person has lung cancer')

prediction: ['YES']
The person has lung cancer


Saving the trained model

In [None]:
import pickle

In [None]:
# Persist the trained SVM to disk. The context manager closes the file handle (and so
# flushes the pickle to disk) even if dump raises.
filename = 'lung_trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): the fitted `scaler` is not saved alongside the model, yet the prediction
# cells need it to preprocess inputs. Consider persisting it as well.

In [None]:
# Load the saved model back from disk; the context manager closes the file afterwards.
# pickle can execute arbitrary code while loading, so only open files you created or trust.
with open('lung_trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Classify one hand-entered survey record with the model reloaded from disk, to confirm
# that the pickled model predicts as expected. The in-memory `scaler` is still used for
# preprocessing.
# Field order follows the feature columns of lung_data: GENDER first, then AGE and
# the remaining survey answers.
input_data = ('M,58,2,1,1,1,1,2,2,2,2,2,2,1,2')

# Value of the GENDER_M indicator for each gender. This matches the training features,
# where get_dummies(drop_first=True) drops the 'F' level and keeps 'M'.
gender_mapping = {'M': 1, 'F': 0}

# Split the comma-separated record into its individual fields
input_data_list = input_data.split(',')

# One-row DataFrame labelled with the raw feature columns (GENDER included)
input_df = pd.DataFrame([input_data_list], columns=lung_data.drop(columns=['LUNG_CANCER']).columns)

# Encode gender through the explicit mapping. get_dummies(drop_first=True) on a single
# row sees only one category and drops it, so every record, including 'M', would end up
# with GENDER_M = 0.
gender_value = input_df.at[0, 'GENDER']
if gender_value not in gender_mapping:
    raise ValueError(
        f"Unknown GENDER value {gender_value!r}; expected one of {sorted(gender_mapping)}"
    )
input_df['GENDER_M'] = gender_mapping[gender_value]

# Drop the raw text column and convert the string fields to numbers
input_df = input_df.drop(columns=['GENDER']).astype(float)

# Column order the scaler was fitted with: numeric survey columns first, GENDER_M last
X_train_columns = lung_data.drop(columns=['LUNG_CANCER', 'GENDER']).columns.tolist() + ['GENDER_M']
input_df = input_df[X_train_columns]

# Scale using the same fitted scaler as the training data
std_data = scaler.transform(input_df)

# Predict with the reloaded model, not the in-memory one, so this cell exercises the pickle
prediction = loaded_model.predict(std_data)
print("prediction:", prediction)

# Signed distance from the SVM decision boundary: a confidence-like score, not a
# probability (SVC only exposes probabilities when built with probability=True).
prediction_probability = loaded_model.decision_function(std_data)


if prediction[0] == 'NO':
    print('The person does not have lung cancer')
else:
    print('The person has lung cancer')