In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Ignore warnings for cleaner output
warnings.filterwarnings('ignore')

In [None]:
# Upload your dataset
from google.colab import files
uploaded = files.upload()

Saving Training.csv to Training.csv


In [None]:
file_name = list(uploaded.keys())[0]

In [None]:
# Read the dataset into a pandas DataFrame
df = pd.read_csv(file_name)


In [None]:
# Display the first 5 rows to verify it loaded correctly
print("Dataset Head:")
print(df.head())

Dataset Head:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling  silver_

In [None]:
# Get a summary of the dataset (data types, non-null values)
print("Dataset Info:")
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Columns: 134 entries, itching to Unnamed: 133
dtypes: float64(1), int64(132), object(1)
memory usage: 5.0+ MB


In [None]:
print("\nMissing values before cleaning:")
print(df.isnull().sum())



Missing values before cleaning:
itching                    0
skin_rash                  0
nodal_skin_eruptions       0
continuous_sneezing        0
shivering                  0
                        ... 
blister                    0
red_sore_around_nose       0
yellow_crust_ooze          0
prognosis                  0
Unnamed: 133            4920
Length: 134, dtype: int64


In [None]:
# --- Data Cleaning Strategy ---
# Fill missing numerical values with the mean
for col in df.select_dtypes(include=np.number).columns:
    df[col].fillna(df[col].mean(), inplace=True)

In [None]:
# Fill missing categorical values with the mode (most frequent value)
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
itching                    0
skin_rash                  0
nodal_skin_eruptions       0
continuous_sneezing        0
shivering                  0
                        ... 
blister                    0
red_sore_around_nose       0
yellow_crust_ooze          0
prognosis                  0
Unnamed: 133            4920
Length: 134, dtype: int64


In [None]:
# --- Preprocessing ---
# IMPORTANT: Replace 'target_column' with the actual name of your target variable column
target_column = 'target_column' # <--- CHANGE THIS



In [None]:
# THIS IS CORRECT (assuming your target column is named 'diagnosis')
target_column = 'diagnosis' # <--- I CHANGED THIS

In [None]:
X = df.drop(target_column, axis=1)
y = df[target_column]

# One-Hot Encode categorical features in X
# This converts categorical columns into multiple binary (0/1) columns
X = pd.get_dummies(X, drop_first=True)

print("\nFeatures (X) Head after preprocessing:")
print(X.head())

print(f"\nTarget (y) variable: '{target_column}'")

KeyError: "['diagnosis'] not found in axis"

In [None]:
# Print all column names from your DataFrame
# Look for the column you want to predict in this list
print("Available columns in the dataset:")
print(df.columns.tolist())

Available columns in the dataset:
['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing', 'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity', 'ulcers_on_tongue', 'muscle_wasting', 'vomiting', 'burning_micturition', 'spotting_ urination', 'fatigue', 'weight_gain', 'anxiety', 'cold_hands_and_feets', 'mood_swings', 'weight_loss', 'restlessness', 'lethargy', 'patches_in_throat', 'irregular_sugar_level', 'cough', 'high_fever', 'sunken_eyes', 'breathlessness', 'sweating', 'dehydration', 'indigestion', 'headache', 'yellowish_skin', 'dark_urine', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes', 'back_pain', 'constipation', 'abdominal_pain', 'diarrhoea', 'mild_fever', 'yellow_urine', 'yellowing_of_eyes', 'acute_liver_failure', 'fluid_overload', 'swelling_of_stomach', 'swelled_lymph_nodes', 'malaise', 'blurred_and_distorted_vision', 'phlegm', 'throat_irritation', 'redness_of_eyes', 'sinus_pressure', 'runny_nose', 'congestion', 'chest_pain', 'weakness_in_limbs', '

In [None]:
 --- Corrected Preprocessing Cell ---

# IMPORTANT: Replace 'diagnosis' below with the correct target column name you found above.
target_column = 'diagnosis' # <--- CHANGE THIS TO THE CORRECT NAME

# --- The rest of the code remains the same ---

In [None]:
# Drop the extra, unnecessary column if it exists
if 'Unnamed: 133' in df.columns:
    df = df.drop('Unnamed: 133', axis=1)
    print("Dropped 'Unnamed: 133' column.")

# --- The rest of the preprocessing code ---
# Set the correct target column name
target_column = 'prognosis'

# Separate features (X) and the target variable (y)
X = df.drop(target_column, axis=1)
y = df[target_column]

# One-Hot Encode categorical features in X.
# In this specific dataset, all feature columns are already numerical (0/1),
# so get_dummies won't change anything, but it's good practice to keep.
X = pd.get_dummies(X, drop_first=True)

print(f"\n Target column set to: '{target_column}'")
print("\nFeatures (X) Head after preprocessing:")
print(X.head())

Dropped 'Unnamed: 133' column.

 Target column set to: 'prognosis'

Features (X) Head after preprocessing:
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0 

In [None]:
# Split the data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of training features (X_train): {X_train.shape}")
print(f"Shape of testing features (X_test): {X_test.shape}")
print(f"Shape of training target (y_train): {y_train.shape}")
print(f"Shape of testing target (y_test): {y_test.shape}")

Shape of training features (X_train): (3936, 132)
Shape of testing features (X_test): (984, 132)
Shape of training target (y_train): (3936,)
Shape of testing target (y_test): (984,)


In [None]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
print("Training the Random Forest model...")
rf_model.fit(X_train, y_train)
print("Model training complete!")

Training the Random Forest model...
Model training complete!


In [None]:
# Make predictions on the test data
y_pred = rf_model.predict(X_test)

# --- Performance Evaluation ---

# 1. Calculate the Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f" Model Accuracy: {accuracy:.4f} or {accuracy*100:.2f}%\n")


# 2. Display the Confusion Matrix
print(" Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("\n")


# 3. Display the Classification Report
print(" Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

 Model Accuracy: 1.0000 or 100.00%

 Confusion Matrix:
[[18  0  0 ...  0  0  0]
 [ 0 30  0 ...  0  0  0]
 [ 0  0 24 ...  0  0  0]
 ...
 [ 0  0  0 ... 26  0  0]
 [ 0  0  0 ...  0 22  0]
 [ 0  0  0 ...  0  0 34]]


 Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chick

In [None]:
# Import the necessary functions for cross-validation
from sklearn.model_selection import KFold, cross_val_score

# 1. Initialize the same Random Forest model
# We use the same parameters for a fair comparison
rf_model_cv = RandomForestClassifier(n_estimators=100, random_state=42)

# 2. Set up the K-Fold cross-validator
# n_splits=10 means we will create 10 folds
# shuffle=True is important to randomize the data before splitting
k_folds = KFold(n_splits=10, shuffle=True, random_state=42)

# 3. Perform cross-validation
# This runs the entire process: splitting, training, and scoring 10 times
# 'scoring='accuracy'' tells it which metric to calculate
scores = cross_val_score(rf_model_cv, X, y, cv=k_folds, scoring='accuracy')

# 4. Display the results
print("--- K-Fold Cross-Validation Results ---")
print(f"Scores for each of the 10 folds: {np.round(scores, 4)}")
print("\n")
print(f"Average Accuracy (Mean): {scores.mean():.4f} or {scores.mean()*100:.2f}%")
print(f" Standard Deviation: {scores.std():.4f}")

--- K-Fold Cross-Validation Results ---
Scores for each of the 10 folds: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


Average Accuracy (Mean): 1.0000 or 100.00%
 Standard Deviation: 0.0000
