<a href="https://colab.research.google.com/github/kunalnischal7/ML-Projects/blob/main/MLCLassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

## Importing Libraries

In [12]:
import pandas as pd

## Import Dataset from Google Drive

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
%cd "/content/drive/My Drive/"

/content/drive/My Drive


## Load Data

In [17]:
file_path = 'train.xlsx'

In [18]:
df = pd.read_excel(file_path)

# Data Preparation

## Data Shuffling

In [19]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 44)

In [20]:
df.isnull().sum()

T1        0
T2        0
T3        0
T4        0
T5        0
T6        0
T7        0
T8        0
T9        0
T10       0
T11       0
T12       0
T13       0
T14       0
T15       0
T16       0
T17       0
T18       0
target    0
dtype: int64

## Checking for Outliers

In [34]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [33]:
from scipy import stats

z_scores = stats.zscore(df[numerical_cols])

In [35]:
df_z_scores = pd.DataFrame(z_scores, columns=numerical_cols)

In [36]:
outliers = df_z_scores[(df_z_scores > threshold).any(axis=1)]

In [37]:
num_outliers = outliers.sum()

In [38]:
print("Columns with outliers and number of outliers:")
print(num_outliers[num_outliers > 0])

Columns with outliers and number of outliers:
T1      96.040082
T2     116.886613
T3     139.666204
T4       3.573452
T5     705.305743
T6     228.713384
T7      37.191751
T10    219.188652
T11    184.849486
T12    234.293476
T13    195.811849
T14     46.753109
T15    112.254727
T16     61.633584
dtype: float64


In [39]:
threshold = 3

In [40]:
df_cleaned = df.drop(outliers.index)

## Removing the outliers

In [41]:
df_cleaned.to_excel("cleaned_data.xlsx", index=False)

# Model Building

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [43]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [47]:
X = df.drop(columns=['target'])
y = df['target']

## Train Test Split

In [48]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Training the model

In [49]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [50]:
y_val_pred = rf_classifier.predict(X_val)

# Evaluation

In [53]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [54]:
cm = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[45  0  0 ...  0  0  0]
 [ 0 44  0 ...  0  0  0]
 [ 0  0 43 ...  0  0  0]
 ...
 [ 0  0  0 ... 39  0  0]
 [ 0  0  0 ...  0 36  0]
 [ 0  0  0 ...  0  0 53]]


In [51]:
accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9865324445653653


In [52]:
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

          A1       0.98      0.92      0.95        49
         A10       0.88      0.98      0.93        45
         A11       1.00      1.00      1.00        43
         A12       1.00      1.00      1.00        39
         A13       1.00      1.00      1.00        42
         A14       1.00      0.99      0.99        89
         A15       0.96      1.00      0.98        70
         A16       1.00      1.00      1.00        56
         A17       0.93      0.98      0.95        42
         A18       1.00      1.00      1.00        43
         A19       1.00      1.00      1.00        40
          A2       1.00      1.00      1.00        38
         A20       1.00      1.00      1.00        46
         A21       0.99      1.00      0.99        72
         A22       1.00      1.00      1.00        43
         A23       1.00      1.00      1.00        43
         A24       1.00      1.00      1.00        33
         A25       0.99    