In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
data = pd.read_csv("Brain Tumor.csv")

In [8]:
data.head()

Unnamed: 0,Patient ID,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
0,P00001,69,Female,Benign,5.01,Frontal,Present,4.32,32.53,Positive,Mild,96.67,No,No,Steroids,Deceased
1,P00002,32,Female,Malignant,3.52,Temporal,Present,5.86,39.84,Negative,Moderate,85.72,Yes,Yes,Observation,Complications
2,P00003,78,Female,Benign,8.24,Temporal,Clear,8.86,34.18,Negative,,89.67,Yes,Yes,Steroids,Recovered
3,P00004,38,Male,Benign,9.58,Parietal,Clear,6.77,16.38,Positive,Severe,89.3,Yes,Yes,Radiotherapy,Complications
4,P00005,41,Male,Malignant,1.4,Parietal,Clear,4.9,18.76,Positive,Mild,92.98,No,No,Steroids,Recovered


In [9]:
data.tail()

Unnamed: 0,Patient ID,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
49995,P49996,65,Male,Malignant,5.37,Occipital,Present,13.91,44.58,Negative,Mild,87.53,No,No,Steroids,Complications
49996,P49997,72,Male,Benign,9.64,Parietal,Present,13.62,20.15,Negative,Moderate,91.35,Yes,No,Observation,Complications
49997,P49998,80,Female,Benign,7.68,Parietal,Clear,9.71,46.38,Negative,Mild,96.95,Yes,Yes,Steroids,Deceased
49998,P49999,23,Male,Benign,4.17,Frontal,Present,12.1,16.47,Positive,Mild,86.84,No,No,Radiotherapy,Deceased
49999,P50000,52,Male,Malignant,7.01,Frontal,Present,12.03,22.59,Negative,Moderate,94.38,Yes,No,Observation,Deceased


In [10]:
data.shape

(50000, 16)

In [12]:
# Drop unnecessary columns

brain = data.drop(columns=['Patient ID'])

In [13]:
brain

Unnamed: 0,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
0,69,Female,Benign,5.01,Frontal,Present,4.32,32.53,Positive,Mild,96.67,No,No,Steroids,Deceased
1,32,Female,Malignant,3.52,Temporal,Present,5.86,39.84,Negative,Moderate,85.72,Yes,Yes,Observation,Complications
2,78,Female,Benign,8.24,Temporal,Clear,8.86,34.18,Negative,,89.67,Yes,Yes,Steroids,Recovered
3,38,Male,Benign,9.58,Parietal,Clear,6.77,16.38,Positive,Severe,89.30,Yes,Yes,Radiotherapy,Complications
4,41,Male,Malignant,1.40,Parietal,Clear,4.90,18.76,Positive,Mild,92.98,No,No,Steroids,Recovered
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,65,Male,Malignant,5.37,Occipital,Present,13.91,44.58,Negative,Mild,87.53,No,No,Steroids,Complications
49996,72,Male,Benign,9.64,Parietal,Present,13.62,20.15,Negative,Moderate,91.35,Yes,No,Observation,Complications
49997,80,Female,Benign,7.68,Parietal,Clear,9.71,46.38,Negative,Mild,96.95,Yes,Yes,Steroids,Deceased
49998,23,Male,Benign,4.17,Frontal,Present,12.10,16.47,Positive,Mild,86.84,No,No,Radiotherapy,Deceased


In [14]:
brain.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Tumor Type,0
Tumor Size (cm),0
Tumor Location,0
MRI Findings,0
WBC Count (x10^3/uL),0
Lymphocyte (%),0
PCR Test,0
COVID Severity,12585


In [15]:
# Handle missing values by filling with the mode (most common value)
brain['COVID Severity'].fillna(brain['COVID Severity'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  brain['COVID Severity'].fillna(brain['COVID Severity'].mode()[0], inplace=True)


In [16]:
brain.isna().sum()

Unnamed: 0,0
Age,0
Gender,0
Tumor Type,0
Tumor Size (cm),0
Tumor Location,0
MRI Findings,0
WBC Count (x10^3/uL),0
Lymphocyte (%),0
PCR Test,0
COVID Severity,0


In [17]:
brain.describe()

Unnamed: 0,Age,Tumor Size (cm),WBC Count (x10^3/uL),Lymphocyte (%),Oxygen Sat (%)
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,50.9377,5.23946,9.500999,30.021588,87.464613
std,19.373208,2.741997,3.179633,11.572717,7.216338
min,18.0,0.5,4.0,10.0,75.0
25%,34.0,2.86,6.74,19.94,81.19
50%,51.0,5.23,9.51,30.07,87.46
75%,68.0,7.59,12.25,40.04,93.71
max,84.0,10.0,15.0,50.0,100.0


In [18]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   50000 non-null  int64  
 1   Gender                50000 non-null  object 
 2   Tumor Type            50000 non-null  object 
 3   Tumor Size (cm)       50000 non-null  float64
 4   Tumor Location        50000 non-null  object 
 5   MRI Findings          50000 non-null  object 
 6   WBC Count (x10^3/uL)  50000 non-null  float64
 7   Lymphocyte (%)        50000 non-null  float64
 8   PCR Test              50000 non-null  object 
 9   COVID Severity        50000 non-null  object 
 10  Oxygen Sat (%)        50000 non-null  float64
 11  Symptoms (Headache)   50000 non-null  object 
 12  Symptoms (Seizures)   50000 non-null  object 
 13  Treatment Type        50000 non-null  object 
 14  Outcome               50000 non-null  object 
dtypes: float64(4), int6

In [19]:
# Encode categorical variables
categorical_columns = brain.select_dtypes(include='object').columns
label_encoders = {}

In [21]:
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    brain[column] = label_encoder.fit_transform(brain[column])
    label_encoders[column] = label_encoder

In [26]:
brain.head()

Unnamed: 0,Age,Gender,Tumor Type,Tumor Size (cm),Tumor Location,MRI Findings,WBC Count (x10^3/uL),Lymphocyte (%),PCR Test,COVID Severity,Oxygen Sat (%),Symptoms (Headache),Symptoms (Seizures),Treatment Type,Outcome
0,69,0,0,5.01,0,1,4.32,32.53,1,0,96.67,0,0,3,1
1,32,0,1,3.52,3,1,5.86,39.84,0,1,85.72,1,1,1,0
2,78,0,0,8.24,3,0,8.86,34.18,0,2,89.67,1,1,3,2
3,38,1,0,9.58,2,0,6.77,16.38,1,2,89.3,1,1,2,0
4,41,1,1,1.4,2,0,4.9,18.76,1,0,92.98,0,0,3,2


In [22]:
brain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   50000 non-null  int64  
 1   Gender                50000 non-null  int64  
 2   Tumor Type            50000 non-null  int64  
 3   Tumor Size (cm)       50000 non-null  float64
 4   Tumor Location        50000 non-null  int64  
 5   MRI Findings          50000 non-null  int64  
 6   WBC Count (x10^3/uL)  50000 non-null  float64
 7   Lymphocyte (%)        50000 non-null  float64
 8   PCR Test              50000 non-null  int64  
 9   COVID Severity        50000 non-null  int64  
 10  Oxygen Sat (%)        50000 non-null  float64
 11  Symptoms (Headache)   50000 non-null  int64  
 12  Symptoms (Seizures)   50000 non-null  int64  
 13  Treatment Type        50000 non-null  int64  
 14  Outcome               50000 non-null  int64  
dtypes: float64(4), int6

In [23]:
# Split data into features and target
X = brain.drop(columns=['Outcome'])
y = brain['Outcome']

In [24]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [25]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [35]:
# Build and train the Random Forest Classifier
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [39]:
# Make predictions and evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("-----------------------")
print(f"Accuracy: {accuracy}")
print("-----------------------")
print("Classification Report:")
print(report)

-----------------------
Accuracy: 0.331
-----------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.34      0.34      3424
           1       0.33      0.35      0.34      3350
           2       0.32      0.31      0.31      3226

    accuracy                           0.33     10000
   macro avg       0.33      0.33      0.33     10000
weighted avg       0.33      0.33      0.33     10000



In [36]:
# Evaluate the model
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.3375


In [40]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.332


In [44]:
# Example prediction
input_data = (78,0,0,8.24,3,0,8.86,34.18,0,2,89.67,1,1,3)  # Update with real values

# Changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Using the trained classifier directly for prediction
prediction = classifier.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
    print('The person does not have a brain tumor')
else:
    print('The person has a brain tumor')


[0]
The person does not have a brain tumor
