In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Load the dataset; replace the path below with the location of your own CSV file.
data = pd.read_csv('categorical_data.csv')


In [None]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a formatted table instead of the plain-text dump that
# print() produces.
data.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
data.info()

In [None]:
# Summary statistics for every column. include='all' adds count/unique/top/freq
# for the non-numeric columns, which describe() leaves out by default whenever
# the frame also contains numeric columns.
data.describe(include='all')

In [None]:
# Count the missing entries in each column (isna is the canonical spelling of isnull).
null_values = data.isna().sum()
print("Null Values:\n", null_values)

In [None]:
# Rows that repeat an earlier row; duplicated() keeps the first occurrence
# unflagged by default, so only the later copies are selected here.
duplicate_rows = data.loc[data.duplicated()]
print("Duplicate Rows:\n", duplicate_rows)

#### Data Exploration using graphs

In [None]:
# Plot how many rows fall into each target class.
# The column is passed by keyword: current seaborn releases take `data` as the
# first positional parameter, so a bare Series passed positionally is not
# interpreted as the x variable.
sns.countplot(data=data, x='target')
plt.title('Distribution of Target Variable')
plt.show()

# This step is completely up to you to decide (choose the encoding and scaling that suit your data)

In [None]:
label_encoder = LabelEncoder()
X_encoded = X.apply(label_encoder.fit_transform)

In [None]:
scaler = StandardScaler()
data['numeric_feature'] = scaler.fit_transform(data['numeric_feature'])

In [None]:
X = data.drop(columns=['target'])
Y = data['target']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Candidate classifiers keyed by the display name used when reporting results.
# Built from (name, estimator) pairs; insertion order is the evaluation order.
models = dict([
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])


In [None]:
# Fit every candidate on the training split and report its metrics on the
# held-out test split.
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    Y_pred = model.fit(X_train, Y_train).predict(X_test)

    accuracy = accuracy_score(Y_test, Y_pred)
    classification_rep = classification_report(Y_test, Y_pred)
    confusion_mat = confusion_matrix(Y_test, Y_pred)

    # A single newline-separated print; the final '\n' item produces the blank
    # lines that separate one model's report from the next.
    print(
        f'Model: {name}',
        f'Accuracy: {accuracy}',
        f'Classification Report:\n{classification_rep}',
        f'Confusion Matrix:\n{confusion_mat}',
        '\n',
        sep='\n',
    )
