# CodeAlpha Disease Prediction from Medical Data
### Task 1: Create a model to predict the likelihood of a disease based on medical data (e.g., symptoms, patient history). Use datasets with labeled medical records and apply classification algorithms.

Dataset: https://www.kaggle.com/datasets/kaushil268/disease-prediction-using-machine-learning/code

## Data Retrieval

In [37]:
import pandas as pd

# Load the training and testing splits so this cell works on a fresh kernel
# (previously `train_data` / `test_data` had to already exist in memory).
# NOTE(review): assumes the dataset's Training.csv / Testing.csv sit next to
# this notebook -- adjust the two paths if the files live elsewhere.
train_raw = pd.read_csv('Training.csv')
test_raw = pd.read_csv('Testing.csv')

# Drop any unnamed columns; read_csv labels blank-header columns
# 'Unnamed: N'. The raw frames are left untouched so that re-running this
# cell always starts from the same input.
train_data = train_raw.loc[:, ~train_raw.columns.str.contains('^Unnamed')]
test_data = test_raw.loc[:, ~test_raw.columns.str.contains('^Unnamed')]

# Verify the columns
print(train_data.columns)
print(test_data.columns)


Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)
Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'blackheads', 'scurring', 'skin_peeling', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails', 'blister',
       'red_sore_around_nose', 'yellow_crust_ooze', 'prognosis'],
      dtype='object', length=133)


## Data Preprocessing

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Split each frame into its feature matrix and the 'prognosis' label column
X_train, y_train = train_data.drop(columns='prognosis'), train_data['prognosis']
X_test, y_test = test_data.drop(columns='prognosis'), test_data['prognosis']

# Group the feature columns by dtype so each group gets its own pipeline
numerical_features = list(
    X_train.select_dtypes(include=['int64', 'float64']).columns
)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

# Numerical columns: fill gaps with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the mode, then one-hot encode;
# categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its matching pipeline
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Record each symptom's column position now, while X_train is still a
# DataFrame -- the next step rebinds X_train to the transformed output
symptom_index = dict(zip(X_train.columns, range(X_train.shape[1])))

# Fit the preprocessor on the training features only, then reuse the fitted
# transformer for the test features
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


## Model training

In [39]:
# Build a 100-tree random forest with a fixed seed and fit it in one step
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the fitted forest against the test labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Show the overall accuracy followed by the per-class breakdown
print(f'Accuracy: {accuracy}')
print('Classification Report:', report, sep='\n')


Accuracy: 0.9761904761904762
Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
                            Common Cold       1.00      

## Prediction using Symptoms

In [40]:
def predict_disease(symptoms):
    """Predict a disease label from a collection of reported symptoms.

    Builds a single-row frame with one column per key of ``symptom_index``,
    sets the columns named in ``symptoms`` to 1 and the rest to 0, passes the
    row through the fitted ``preprocessor`` and returns the label chosen by
    the fitted ``model``.

    Parameters
    ----------
    symptoms : iterable of str, or str
        Symptom names spelled exactly like the keys of ``symptom_index``.
        A bare string is treated as a single symptom rather than being
        iterated character by character.

    Returns
    -------
    The first element of ``model.predict(...)`` for the encoded row.

    Warns
    -----
    UserWarning
        When one or more names are not keys of ``symptom_index`` (they are
        ignored), or when no recognised symptom remains, in which case the
        prediction comes from an all-zero row.
    """
    import warnings

    # A lone string would otherwise be walked one character at a time and
    # every character silently discarded as an unknown symptom.
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    symptoms = list(symptoms)

    known = [name for name in symptoms if name in symptom_index]
    unknown = [name for name in symptoms if name not in symptom_index]

    # Keep the original best-effort behaviour (ignore unknown names) but make
    # it visible, since a typo would otherwise quietly change the prediction.
    if unknown:
        warnings.warn(
            f'Ignoring unrecognised symptom(s): {unknown}',
            UserWarning,
            stacklevel=2,
        )
    if not known:
        warnings.warn(
            'No recognised symptoms supplied; prediction is based on an '
            'all-zero input.',
            UserWarning,
            stacklevel=2,
        )

    # Create an input array for the model and flag the reported symptoms
    input_data = np.zeros(len(symptom_index))
    for name in known:
        input_data[symptom_index[name]] = 1

    # Wrap the row in a DataFrame using the same column names the
    # preprocessor was fitted with
    input_df = pd.DataFrame([input_data], columns=list(symptom_index))

    # Preprocess the input data
    input_data_transformed = preprocessor.transform(input_df)

    # Predict the disease
    prediction = model.predict(input_data_transformed)
    return prediction[0]

# Demo: run the predictor on three reported symptoms and show the result
input_symptoms = [
    'itching',
    'skin_rash',
    'shivering',
]
predicted_disease = predict_disease(input_symptoms)
print(f'Predicted Disease: {predicted_disease}')


Predicted Disease: Allergy


### Thank You!!