## 1. Install Required Libraries

In [3]:
!pip install pandas scikit-learn xgboost




In [4]:
from sklearn.ensemble import RandomForestClassifier


## 2. Import Libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import joblib  # to save the model


## 3. Load the Lung Cancer Dataset

In [6]:
lung_df = pd.read_csv("../datasets/lung_cancer.csv")


#### Step 2: Handle Categorical Text Columns Manually

#####  Fix GENDER and any other string columns properly:

In [7]:
# Check first
print(lung_df.dtypes)

# Encode GENDER manually
lung_df['GENDER'] = lung_df['GENDER'].replace({'M':1, 'F':0})


GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC_DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL_CONSUMING         int64
COUGHING                  int64
SHORTNESS_OF_BREATH       int64
SWALLOWING_DIFFICULTY     int64
CHEST_PAIN                int64
LUNG_CANCER              object
dtype: object


#### Step 3: Then Fix 1/2 to 1/0 Mapping

In [8]:
# Map 2 -> 0 (No), 1 -> 1 (Yes)
for column in lung_df.columns:
    if lung_df[column].nunique() == 2 or lung_df[column].nunique() == 3:
        lung_df[column] = lung_df[column].replace({2:0})


In [9]:
# Fix target column encoding
lung_df['LUNG_CANCER'] = lung_df['LUNG_CANCER'].replace({'YES':1, 'NO':0})


#### Step 4: Now Split Features and Target

In [10]:
X = lung_df.drop(columns=['LUNG_CANCER'])
print("Number of features:", X.shape[1])

y = lung_df['LUNG_CANCER']


Number of features: 15


#### Step 5: Scaling and Train/Test Split

In [11]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


## 5. Train Random Forest and XGBoost

In [12]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)


Parameters: { "use_label_encoder" } are not used.



## 6. Evaluate Models

In [13]:
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Report:\n", classification_report(y_test, rf_pred))

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("XGBoost Report:\n", classification_report(y_test, xgb_pred))


Random Forest Accuracy: 0.5083333333333333
Random Forest Report:
               precision    recall  f1-score   support

           0       0.51      0.48      0.49       302
           1       0.50      0.54      0.52       298

    accuracy                           0.51       600
   macro avg       0.51      0.51      0.51       600
weighted avg       0.51      0.51      0.51       600

XGBoost Accuracy: 0.48833333333333334
XGBoost Report:
               precision    recall  f1-score   support

           0       0.49      0.45      0.47       302
           1       0.49      0.53      0.51       298

    accuracy                           0.49       600
   macro avg       0.49      0.49      0.49       600
weighted avg       0.49      0.49      0.49       600



In [14]:
import os

# ✅ Create 'models' folder if it doesn't exist
os.makedirs("models", exist_ok=True)

# ✅ Now save the model safely
joblib.dump(rf_model, "models/lung_cancer_model.pkl")
print("✅ Model saved successfully to models/lung_cancer_model.pkl!")


✅ Model saved successfully to models/lung_cancer_model.pkl!
