In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [80]:
data = pd.read_csv("../dataset/Brain Tumor.csv")

In [81]:
data = data.head(5000)

In [82]:
data.tail()

Unnamed: 0,Patient ID,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
4995,P04996,37,Male,Benign,7.67,Occipital,Clear,14.08,45.7,Positive,Severe,79.73,Yes,Yes,Radiotherapy,Complications
4996,P04997,81,Female,Malignant,1.51,Frontal,Clear,12.74,25.44,Positive,,98.2,Yes,No,Radiotherapy,Deceased
4997,P04998,33,Female,Malignant,3.31,Parietal,Clear,11.51,41.84,Negative,Severe,77.54,Yes,Yes,Observation,Deceased
4998,P04999,68,Female,Malignant,7.79,Temporal,Clear,8.23,42.32,Positive,Severe,95.89,No,No,Radiotherapy,Deceased
4999,P05000,54,Male,Malignant,3.51,Parietal,Clear,6.52,32.85,Positive,Mild,95.34,No,No,Radiotherapy,Deceased


In [83]:
data.shape

(5000, 16)

In [84]:
# Drop unnecessary columns

brain = data.drop(columns=['Patient ID'])

In [85]:
brain

Unnamed: 0,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
0,69,Female,Benign,5.01,Frontal,Present,4.32,32.53,Positive,Mild,96.67,No,No,Steroids,Deceased
1,32,Female,Malignant,3.52,Temporal,Present,5.86,39.84,Negative,Moderate,85.72,Yes,Yes,Observation,Complications
2,78,Female,Benign,8.24,Temporal,Clear,8.86,34.18,Negative,,89.67,Yes,Yes,Steroids,Recovered
3,38,Male,Benign,9.58,Parietal,Clear,6.77,16.38,Positive,Severe,89.30,Yes,Yes,Radiotherapy,Complications
4,41,Male,Malignant,1.40,Parietal,Clear,4.90,18.76,Positive,Mild,92.98,No,No,Steroids,Recovered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,37,Male,Benign,7.67,Occipital,Clear,14.08,45.70,Positive,Severe,79.73,Yes,Yes,Radiotherapy,Complications
4996,81,Female,Malignant,1.51,Frontal,Clear,12.74,25.44,Positive,,98.20,Yes,No,Radiotherapy,Deceased
4997,33,Female,Malignant,3.31,Parietal,Clear,11.51,41.84,Negative,Severe,77.54,Yes,Yes,Observation,Deceased
4998,68,Female,Malignant,7.79,Temporal,Clear,8.23,42.32,Positive,Severe,95.89,No,No,Radiotherapy,Deceased


In [86]:
brain.isna().sum()

Age                        0
Gender                     0
Tumor Type                 0
Tumor Size (cm)            0
Tumor Location             0
MRI Findings               0
WBC Count (x10^3/uL)       0
Lymphocyte (%)             0
PCR Test                   0
COVID Severity          1293
Oxygen Sat (%)             0
Symptoms (Headache)        0
Symptoms (Seizures)        0
Treatment Type             0
Outcome                    0
dtype: int64

In [87]:
# Handle missing values by filling with the mode (most common value)
brain['COVID Severity'].fillna(brain['COVID Severity'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  brain['COVID Severity'].fillna(brain['COVID Severity'].mode()[0], inplace=True)


In [88]:
brain.isna().sum()

Age                     0
Gender                  0
Tumor Type              0
Tumor Size (cm)         0
Tumor Location          0
MRI Findings            0
WBC Count (x10^3/uL)    0
Lymphocyte (%)          0
PCR Test                0
COVID Severity          0
Oxygen Sat (%)          0
Symptoms (Headache)     0
Symptoms (Seizures)     0
Treatment Type          0
Outcome                 0
dtype: int64

In [89]:
brain.describe()

Unnamed: 0,Age,Tumor Size (cm),WBC Count (x10^3/uL),Lymphocyte (%),Oxygen Sat (%)
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,51.0146,5.271336,9.429616,30.115402,87.476602
std,19.310613,2.755098,3.168393,11.573689,7.200221
min,18.0,0.5,4.01,10.01,75.0
25%,34.0,2.84,6.68,19.9675,81.1575
50%,51.0,5.32,9.4,30.355,87.54
75%,68.0,7.6025,12.16,40.12,93.6625
max,84.0,10.0,15.0,49.98,99.99


In [90]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   5000 non-null   int64  
 1   Gender                5000 non-null   object 
 2   Tumor Type            5000 non-null   object 
 3   Tumor Size (cm)       5000 non-null   float64
 4   Tumor Location        5000 non-null   object 
 5   MRI Findings          5000 non-null   object 
 6   WBC Count (x10^3/uL)  5000 non-null   float64
 7   Lymphocyte (%)        5000 non-null   float64
 8   PCR Test              5000 non-null   object 
 9   COVID Severity        5000 non-null   object 
 10  Oxygen Sat (%)        5000 non-null   float64
 11  Symptoms (Headache)   5000 non-null   object 
 12  Symptoms (Seizures)   5000 non-null   object 
 13  Treatment Type        5000 non-null   object 
 14  Outcome               5000 non-null   object 
dtypes: float64(4), int64(

In [91]:
# Encode categorical variables
categorical_columns = brain.select_dtypes(include='object').columns
label_encoders = {}

In [92]:
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    brain[column] = label_encoder.fit_transform(brain[column])
    label_encoders[column] = label_encoder

In [93]:
brain.head()

Unnamed: 0,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
0,69,0,0,5.01,0,1,4.32,32.53,1,0,96.67,0,0,3,1
1,32,0,1,3.52,3,1,5.86,39.84,0,1,85.72,1,1,1,0
2,78,0,0,8.24,3,0,8.86,34.18,0,1,89.67,1,1,3,2
3,38,1,0,9.58,2,0,6.77,16.38,1,2,89.3,1,1,2,0
4,41,1,1,1.4,2,0,4.9,18.76,1,0,92.98,0,0,3,2


In [94]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   5000 non-null   int64  
 1   Gender                5000 non-null   int32  
 2   Tumor Type            5000 non-null   int32  
 3   Tumor Size (cm)       5000 non-null   float64
 4   Tumor Location        5000 non-null   int32  
 5   MRI Findings          5000 non-null   int32  
 6   WBC Count (x10^3/uL)  5000 non-null   float64
 7   Lymphocyte (%)        5000 non-null   float64
 8   PCR Test              5000 non-null   int32  
 9   COVID Severity        5000 non-null   int32  
 10  Oxygen Sat (%)        5000 non-null   float64
 11  Symptoms (Headache)   5000 non-null   int32  
 12  Symptoms (Seizures)   5000 non-null   int32  
 13  Treatment Type        5000 non-null   int32  
 14  Outcome               5000 non-null   int32  
dtypes: float64(4), int32(

In [95]:
# Split data into features and target
X = brain.drop(columns=['Outcome'])
y = brain['Outcome']

In [96]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [97]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [98]:
# Build and train the Random Forest Classifier
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [99]:
# Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("-----------------------")
print(f"Accuracy: {accuracy}")
print("-----------------------")
print("Classification Report:")
print(report)

-----------------------
Accuracy: 0.316
-----------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.32      0.37      0.34       345
           1       0.32      0.29      0.30       334
           2       0.31      0.28      0.29       321

    accuracy                           0.32      1000
   macro avg       0.32      0.31      0.31      1000
weighted avg       0.32      0.32      0.31      1000



In [100]:
# Evaluate the model
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [101]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.316


In [102]:
# Example prediction
input_data = (78,0,0,8.24,3,0,8.86,34.18,0,2,89.67,1,1,3)  # Update with real values
import numpy as np
# Changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Using the trained classifier directly for prediction
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print('The person does not have a brain tumor')
else:
    print('The person has a brain tumor')


[2]
The person has a brain tumor


In [103]:
import pickle

# Save model as .sav
filename = "brains.sav"
pickle.dump(model, open(filename, "wb"))
