# Predicting a Categorical Feature in Machine Learning

This notebook guides you through the steps to predict a categorical feature using machine learning. We will cover data preprocessing, model training, evaluation, and hyperparameter tuning.

In [None]:
# Step 1: Understand the Problem and Collect Data
# The Iris dataset bundled with scikit-learn serves as the demonstration data.

from sklearn.datasets import load_iris
import pandas as pd

# `data` is the object returned by load_iris(); its .data, .feature_names and
# .target attributes are read here and by later cells.
data = load_iris()

# One column per feature, with the class label appended as a 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Preview the first rows of the assembled frame
df.head()


# Step 2: Data Preprocessing
## Handling Missing Values
# The Iris dataset contains no missing values, so no imputation is required here.


## Encoding Categorical Features
# In the Iris dataset, the target variable is already numerical, so no encoding is needed here.


In [None]:
## Feature Scaling
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in the notebook, so rows that end up in the
# test set contribute to the fitted statistics.
scaler = StandardScaler().fit(df[data.feature_names])

# Overwrite the feature columns with their scaled values; 'target' is not touched.
df[data.feature_names] = scaler.transform(df[data.feature_names])

# Preview the scaled features
df.head()


# Step 3: Exploratory Data Analysis (EDA)
## Analyze Data Distribution
import matplotlib.pyplot as plt
import seaborn as sns

# Grid of pairwise relationship plots for `df`, with points coloured by the
# value in the 'target' column.
sns.pairplot(data=df, hue='target')
plt.show()


## Detect Outliers (visual inspection only; no rows are removed)
# Draw one box plot per feature column, each on its own 10x5 figure, so that
# extreme values can be inspected visually.
for feature_name in data.feature_names:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df[feature_name], ax=ax)
    ax.set_title(f'Box Plot of {feature_name}')
    plt.show()


# Step 4: Feature Selection/Engineering
# For simplicity, we'll use all features from the Iris dataset.


In [None]:
# Step 5: Split Data into Training and Test Sets
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is the 'target' column.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows for testing. `stratify=y` preserves the class
# proportions of `y` in both partitions, which matters on a dataset this small:
# an unstratified draw can leave the test set with an uneven class mix and make
# per-class metrics noisy. `random_state` fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Step 6: Choose a Classification Algorithm
# We'll start with a RandomForestClassifier


In [None]:
# Step 7: Train the Model
from sklearn.ensemble import RandomForestClassifier

# A random forest is a stochastic learner (bootstrap samples and random feature
# subsets), so an unseeded model gives different results on every run. Fixing
# random_state makes the fitted model, and every metric derived from it,
# reproducible. 42 matches the seed already used for the train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


# Step 8: Evaluate the Model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test)

# Compute the three summaries up front, then report them in the same order.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)


# Step 9: Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: 2 forest sizes x 3 depth limits = 6 combinations,
# each scored with 5-fold cross-validation on the training data only.
param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None]}

# Seed the estimator so that score differences between candidates come from the
# hyperparameters rather than from run-to-run randomness, and so that the
# selected parameters are the same on every Restart & Run All.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
# Mean cross-validated score of the winning combination.
print("Best CV Accuracy:", grid_search.best_score_)
# GridSearchCV refits the best combination on all of X_train by default;
# score it on the held-out test set so the effect of tuning is measured.
print("Test Accuracy (tuned model):", grid_search.score(X_test, y_test))

# Step 10: Validate the Model

In [None]:
from sklearn.model_selection import cross_val_score

# Validate the tuned estimator selected by the grid search rather than the
# default-parameter `model` from Step 7; otherwise the tuning result would
# never be used. cross_val_score clones the estimator for each fold, so the
# already-fitted object is not modified.
# NOTE: X contains the rows that were used to select these hyperparameters, so
# treat these scores as a consistency check, not an unbiased estimate.
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
# The spread across folds shows how stable the score is.
print("Std of CV Scores:", cv_scores.std())

# Step 11: Deployment and Monitoring
## This step involves deploying the trained model to a production environment and monitoring its performance over time.
## It is typically carried out outside of a Jupyter Notebook.
