In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

In [3]:
# Set current directory
curr_dir = os.getcwd()

# Load the dataset
f_path = os.path.join(curr_dir, "cancer patient data sets.csv")
data = pd.read_csv(f_path)
data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [4]:
# Selecting informative features and the target variable
features = [
    'Age', 'Alcohol use', 'Smoking', 'chronic Lung Disease', 'Fatigue',
    'Coughing of Blood', 'Chest Pain', 'Genetic Risk', 'OccuPational Hazards'
]
target = 'Level'

X = data[features]
y = data[target]

In [5]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
# Training the Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [7]:
# Predicting on the test set
y_pred = rf_model.predict(X_test)

In [8]:
# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0


In [9]:
# Classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        High       1.00      1.00      1.00       119
         Low       1.00      1.00      1.00        84
      Medium       1.00      1.00      1.00        97

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [10]:
# Displaying the importance of each feature
feature_importance = pd.DataFrame(rf_model.feature_importances_, index=X_train.columns, columns=['importance'])
feature_importance = feature_importance.sort_values('importance', ascending=False)
print(feature_importance)


                      importance
Coughing of Blood       0.223032
Fatigue                 0.180937
Alcohol use             0.180004
Smoking                 0.106766
Genetic Risk            0.093406
OccuPational Hazards    0.088529
Chest Pain              0.062141
chronic Lung Disease    0.033863
Age                     0.031322
