## Library loading

In [164]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

## Dataset loading as pandas dataframe

In [165]:
# Load the training split; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

## Printing first 5 rows

In [166]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

Unnamed: 0,ID,col_0,col_1,col_2,col_3,col_4,col_5,y
0,1,A1,0.423913,310.7,-1.302803,0.889328,1737,C3
1,2,A0,0.586957,310.1,0.021376,0.640316,1483,C1
2,3,A0,0.934783,313.3,-0.570492,0.636364,1507,C1
3,4,A1,0.402174,308.6,-0.9216,0.387352,1648,C1
4,5,A2,0.532609,311.4,-0.389922,0.557312,1506,C1


## Splitting the dataset into X_train (all features except y) & y_train (the target y)

In [167]:
# Features: every column of df except the target 'y'.
# NOTE(review): the 'ID' column stays in X_train and is therefore fed to the
# model as a feature - confirm that this is intended.
X_train = df.drop(columns=['y'])
# Target: the 'y' column on its own, as a Series.
y_train = df['y']

## Printing the first 5 rows of both X_train & y_train

In [168]:
# Preview the first five feature rows.
X_train.head(n=5)

Unnamed: 0,ID,col_0,col_1,col_2,col_3,col_4,col_5
0,1,A1,0.423913,310.7,-1.302803,0.889328,1737
1,2,A0,0.586957,310.1,0.021376,0.640316,1483
2,3,A0,0.934783,313.3,-0.570492,0.636364,1507
3,4,A1,0.402174,308.6,-0.9216,0.387352,1648
4,5,A2,0.532609,311.4,-0.389922,0.557312,1506


In [169]:
# Preview the first five target labels.
y_train.head(n=5)

0    C3
1    C1
2    C1
3    C1
4    C1
Name: y, dtype: object

## Separating columns into categorical and numerical data

In [170]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# is_numeric_dtype is used instead of matching the literal dtype names
# 'int64' / 'float64' / 'object' so that every column lands in exactly one of
# the two groups: narrower numeric dtypes (int32, float32, ...) count as
# numerical, and any non-numeric dtype (object, category, dedicated string
# dtypes) counts as categorical instead of silently falling through to the
# ColumnTransformer's remainder without imputation or encoding.
numerical_cols = [col for col in X_train.columns if pd.api.types.is_numeric_dtype(X_train[col])]
categorical_cols = [col for col in X_train.columns if not pd.api.types.is_numeric_dtype(X_train[col])]

# Positional indices of the numerical columns within X_train
numerical_cols_idx = [X_train.columns.get_loc(col) for col in numerical_cols]
# Positional indices of the categorical columns within X_train
categorical_cols_idx = [X_train.columns.get_loc(col) for col in categorical_cols]

## Handling missing values using transformer

In [171]:
# Numerical columns: fill missing entries with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing entries with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## Merging the per-type column transformers into a single preprocessor

In [172]:
# Route each group of columns (selected by position) to its transformer;
# columns that belong to neither group are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols_idx),
        ('cat', categorical_transformer, categorical_cols_idx),
    ],
    remainder='passthrough',
)

# Fit the preprocessor on the training features and keep the transformed matrix.
X_train_processed = preprocessor.fit_transform(X=X_train)

## Model Selection

In [173]:
# Random forest with a fixed seed (random_state=42); every other hyperparameter
# is left at its default here.
model = RandomForestClassifier(random_state=42)

## Pipeline creation on model and preprocessor

In [174]:
# This pipeline is fitted on X_train_processed and predicts on
# test_data_processed, i.e. on data that has ALREADY been run through
# `preprocessor`. Embedding the ColumnTransformer here as well would apply it
# a second time: the positional indices computed on the raw frame would then
# be resolved against the transformed matrix, where the "categorical" index
# points at a continuous numeric column that would get one-hot encoded.
# The preprocessor slot is therefore kept (so the step layout and the
# 'classifier__...' parameter names stay the same) but set to 'passthrough',
# which makes it a no-op.
my_classifier = Pipeline(steps=[('preprocessor', 'passthrough'),
                      ('classifier', model)])

## Finding the best hyperparameters for the model

In [175]:
# Candidate values per hyperparameter; the 'classifier__' prefix addresses the
# 'classifier' step of the pipeline. 3 * 4 * 3 * 3 = 108 combinations in total.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scoring each combination by accuracy
# with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=my_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_processed, y_train)

## Model with the best hyperparameters

In [176]:
# Keep the estimator that the grid search refit with its best-scoring parameter combination.
best_classifier = grid_search.best_estimator_

## Loading test data

In [177]:
# Load the test split and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer="test.csv")
test_data.head(n=5)

Unnamed: 0,ID,col_0,col_1,col_2,col_3,col_4,col_5
0,6001,A1,0.434783,309.0,1.044606,0.55336,1447
1,6002,A0,0.423913,310.8,-0.01875,0.699605,1502
2,6003,A1,0.48913,311.1,0.733625,0.743083,1452
3,6004,A2,0.380435,309.8,-0.409985,0.675889,1490
4,6005,A0,0.608696,310.7,-1.272708,0.300395,1696


## Processing test data for prediction

In [178]:
# Apply the preprocessor exactly as it was fitted on the training features.
# Re-fitting it on the test set (fit_transform) would learn new imputation
# values and a new one-hot category layout from the test data, so the
# resulting columns could stop lining up with what the classifier was trained on.
test_data_processed = preprocessor.transform(test_data)

## Making prediction on processed test data

In [179]:
# Predict a class label for every preprocessed test row with the tuned estimator.
test_predictions = best_classifier.predict(test_data_processed)

## Saving prediction to CSV file

In [180]:
# Pair each test ID with its predicted label, then write the table to CSV
# without the DataFrame index.
submission_df = pd.DataFrame(data={'ID': test_data['ID'], 'y': test_predictions})
submission_df.to_csv(path_or_buf="kazi_sohrab_uddin_titu.csv", index=False)