In [16]:
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Location of the source CSV. Set the LUNG_CANCER_DATA environment variable to
# run the notebook on another machine; when it is unset, the original local
# path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "LUNG_CANCER_DATA",
    r'C:\Users\Kruti Agrawal\Desktop\Projects\lung_cancer\Lung Cancer\dataset_med.csv',
)

# Load the raw patient records and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [3]:
# Schema overview: column dtypes and non-null counts (printed by info()).
data.info()

# Per-column count of missing values, shown as the cell's output.
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  890000 non-null  int64  
 1   age                 890000 non-null  float64
 2   gender              890000 non-null  object 
 3   country             890000 non-null  object 
 4   diagnosis_date      890000 non-null  object 
 5   cancer_stage        890000 non-null  object 
 6   family_history      890000 non-null  object 
 7   smoking_status      890000 non-null  object 
 8   bmi                 890000 non-null  float64
 9   cholesterol_level   890000 non-null  int64  
 10  hypertension        890000 non-null  int64  
 11  asthma              890000 non-null  int64  
 12  cirrhosis           890000 non-null  int64  
 13  other_cancer        890000 non-null  int64  
 14  treatment_type      890000 non-null  object 
 15  end_treatment_date  890000 non-nul

id                    0
age                   0
gender                0
country               0
diagnosis_date        0
cancer_stage          0
family_history        0
smoking_status        0
bmi                   0
cholesterol_level     0
hypertension          0
asthma                0
cirrhosis             0
other_cancer          0
treatment_type        0
end_treatment_date    0
survived              0
dtype: int64

In [4]:
# Convert both date columns from strings to pandas datetimes so they can be
# subtracted from each other.
for date_col in ('diagnosis_date', 'end_treatment_date'):
    data[date_col] = pd.to_datetime(data[date_col])

In [5]:
# Derived feature: whole days elapsed between diagnosis and end of treatment.
data['treatment_duration'] = (
    data['end_treatment_date']
    .sub(data['diagnosis_date'])
    .dt.days
)

In [6]:
# Remove the raw date columns (now summarised by treatment_duration) together
# with the id and country columns, none of which are used as model features.
data = data.drop(
    columns=['diagnosis_date', 'end_treatment_date', 'id', 'country'],
)

In [7]:
# Columns converted to integer codes with LabelEncoder.
label_cols = ['gender', 'cancer_stage', 'family_history', 'smoking_status', 'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type', 'survived']

# One fitted encoder per column. Refitting a single shared encoder would keep
# only the mapping of the last column, which makes it impossible to encode new
# inputs the same way (or to inverse_transform) later on.
# NOTE(review): persist `encoders` next to the model if the model is served.
encoders = {}
for col in label_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('survived'),
# the same state the name had before this change.

In [8]:
# Preview the first rows after the date feature, column drops and label encoding.
data.head()

Unnamed: 0,age,gender,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration
0,64.0,1,0,1,3,29.4,199,0,0,1,0,0,0,523
1,50.0,0,2,1,3,41.2,280,1,1,0,0,3,1,424
2,65.0,0,2,1,1,44.0,268,1,1,0,0,1,0,370
3,51.0,0,0,0,3,43.0,241,1,1,0,0,0,0,443
4,37.0,1,0,0,3,19.7,178,0,0,0,0,1,0,406


In [9]:
# Target vector: the encoded `survived` column.
y = data['survived']

# Feature matrix: every remaining column, in its existing order.
X = data.drop(columns=['survived'])

In [10]:
# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into preprocessing. The row positions are drawn with the same test_size
# and random_state as the later train/test split of X_scaled; the shuffle
# depends only on the number of rows and the seed, so both calls select the
# same rows. Keep the two sets of parameters in sync.
train_rows = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)[0]

scaler = StandardScaler().fit(X.iloc[train_rows])

# Scale every row (train and test) with the training-set mean and variance.
X_scaled = scaler.transform(X)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Random forest with default hyperparameters and a fixed seed.
# n_jobs=-1 builds the trees on all available cores; with random_state fixed
# the fitted forest does not depend on the number of workers.
# NOTE(review): the recorded test split holds 39361 positives out of 178000
# rows, so the classes are imbalanced — consider class weighting or resampling.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [13]:
# Predicted class labels for the held-out test rows.
y_pred = model.predict(X_test)

In [14]:
# Compute the evaluation artefacts first, then print them together.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# NOTE(review): in the recorded run the accuracy (0.778) is at the
# majority-class rate (138639 / 178000) and only 57 of 39361 positives are
# recovered, so accuracy alone overstates the model's usefulness.
print("Accuracy:", accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.7779550561797752
Classification Report:
               precision    recall  f1-score   support

           0       0.78      1.00      0.88    138639
           1       0.21      0.00      0.00     39361

    accuracy                           0.78    178000
   macro avg       0.49      0.50      0.44    178000
weighted avg       0.65      0.78      0.68    178000

Confusion Matrix:
 [[138419    220]
 [ 39304     57]]


In [15]:
# Persist the fitted model and scaler for reuse outside the notebook.
# joblib is imported in the notebook's top import cell with the other
# dependencies.
# NOTE(review): the LabelEncoder mappings used during preprocessing are not
# saved here; code that loads these files must reproduce the same
# category-to-integer encoding and feature column order.
joblib.dump(model, "lung_cancer_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']