### Python Libraries to be installed:

```sh
pip install pandas
pip install numpy
pip install matplotlib
pip install seaborn
pip install imbalanced-learn
pip install scikit-learn
pip install xgboost
```

In [1]:
# Get Python Version
# Prints the version of the running interpreter so the environment used for
# this notebook is recorded next to its results.

import platform
print("Python Version Used: ",platform.python_version())

Python Version Used:  3.8.12


In [2]:
# Import required libraries

import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [None]:
# Load the training data into the working frame `df`.
# Later cells treat the last column as the target and all others as features.
df = pd.read_csv("Ecoli.csv") # Read Ecoli.csv

In [None]:
# Bar chart of how many rows fall into each target class in the raw data,
# i.e. the class balance before any resampling.
sns.countplot(x='Target (Column 117)', data=df).set(
    ylabel='Count',
    title='Target class count before SMOTE',
)
plt.show()

In [None]:
# Per-column share of missing values, shown as a bar chart.
# isnull().mean() yields a fraction in [0, 1]; it is scaled by 100 so the
# plotted values agree with the "Percentage" axis label and title.
df.isnull().mean().mul(100).plot.bar(figsize=(18,8))
plt.ylabel('Percentage of missing values')
plt.xlabel('Features')
plt.title('Quantifying missing data in percentage (Before Imputation)')
plt.show();

In [None]:
# Strip Leading and Trailing Space of all the column names
# Later cells select columns by exact name and by the "(Nom)" prefix, which
# would not match headers that carry padding.
df.columns = df.columns.str.strip()
df.columns

In [None]:
# Class-conditional imputation for rows whose target is 0.
# For columns 103 up to (not including) the last one - presumably the nominal
# features, TODO confirm - the fill value is the first row of .mode() computed
# on class-0 rows only. Rows of the other class are left untouched.
mask = df['Target (Column 117)'].eq(0)
mode = df.loc[mask].iloc[:, 103:-1].mode().iloc[0]
df = df.where(~mask, df.fillna(mode))

In [None]:
# Same mode-based fill as for class 0, now restricted to rows whose target is 1:
# the fill values come from class-1 rows and only class-1 rows are changed.
mask = df['Target (Column 117)'].eq(1)
mode = df.loc[mask].iloc[:, 103:-1].mode().iloc[0]
df = df.where(~mask, df.fillna(mode))

In [None]:
# Median-based fill for the first 103 columns (presumably the numeric
# features - TODO confirm), using class-0 rows to compute the medians and
# changing class-0 rows only.
mask = df['Target (Column 117)'].eq(0)
median = df.loc[mask].iloc[:, 0:103].median()
df = df.where(~mask, df.fillna(median))

In [None]:
# Median-based fill for the first 103 columns, restricted to rows whose
# target is 1: medians come from class-1 rows and only those rows are changed.
mask = df['Target (Column 117)'].eq(1)
median = df.loc[mask].iloc[:, 0:103].median()
df = df.where(~mask, df.fillna(median))

In [None]:
# Re-plot the per-column share of missing values after the imputation cells.
# isnull().mean() yields a fraction in [0, 1]; it is scaled by 100 so the
# plotted values agree with the "Percentage" axis label and title.
df.isnull().mean().mul(100).plot.bar(figsize=(18,8))
plt.ylabel('Percentage of missing values')
plt.xlabel('Features')
plt.title('Quantifying missing data in percentage (After Imputation)')
plt.show();

In [None]:
# Cast every column whose header begins with "(Nom)" from float to int.
# This runs after imputation, so those columns no longer need NaN support.

for column_name in df.columns:
    if column_name.startswith("(Nom)"):
        df[column_name] = df[column_name].astype('int')

In [None]:
# Separate predictors from the label: every column except the last becomes X,
# the last column becomes y.

X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Display the feature frame for a visual sanity check.
X

In [None]:
# Use Local Outlier Factor to remove outliers
# The detector is created with scikit-learn's default settings (no arguments are passed).

lof = LocalOutlierFactor()

In [None]:
# Fit LOF on the full feature frame and label every row in one step.
# The result is aligned with the row order of X.
outliers = lof.fit_predict(X) # Predict the labels (1 inlier, -1 outlier) of X according to LOF.

In [None]:
# Count how many rows received each LOF label (-1 outlier, 1 inlier).
np.unique(outliers, return_counts=True)

In [None]:
# Shape of the label array produced by fit_predict.
outliers.shape

In [None]:
# Positions (0-based, in the row order of X) of the rows LOF labelled -1.
outlier_lof_index = [
    position for position, label in enumerate(outliers) if label == -1
]

In [None]:
# Number of rows flagged as outliers.
len(outlier_lof_index)

In [None]:
# Remove the rows LOF flagged as outliers.
# outlier_lof_index holds row POSITIONS, while DataFrame.drop expects index
# LABELS; translating through df.index keeps this correct even if df does not
# carry a default 0..n-1 index (with a default index the result is unchanged).
df_lof = df.drop(df.index[outlier_lof_index])

In [None]:
# Rows and columns remaining after the outlier rows were dropped.
df_lof.shape

In [None]:
# Rebuild X and y from the outlier-free frame with a fresh 0..n-1 index.
# X deliberately keeps the previous row labels as a leading 'index' column
# (reset_index without drop=True); the cross-validation cell removes it again.
# NOTE(review): that 'index' column is still part of X when X is handed to
# SMOTE - confirm this is intended.
X = df_lof.iloc[:, :-1].reset_index()
y = df_lof.iloc[:, -1].reset_index(drop=True)

In [None]:
# Oversample the minority class with SMOTE (fixed seed for reproducibility).
smote = SMOTE(sampling_strategy='minority', random_state = 42)

# X carries a bookkeeping 'index' column (old row labels). It is not a feature:
# leaving it in would let it influence SMOTE's nearest-neighbour search and be
# interpolated into the synthetic rows, so it is excluded before resampling.
# errors='ignore' keeps the cell re-runnable if the column is already absent.
features = X.drop(columns='index', errors='ignore')
features_resampled, y = smote.fit_resample(features, y)

# The cross-validation cell selects rows of X by fold indices and drops a
# column named 'index'; reset_index() restores that leading column (now simply
# 0..n-1) so the downstream cell keeps working unchanged.
X = features_resampled.reset_index()

# NOTE(review): plain SMOTE interpolates every column, including the "(Nom)"
# ones that were cast to int earlier - confirm whether a categorical-aware
# variant is required.
y.value_counts()

In [None]:
# Class balance of the resampled target vector.
sns.countplot(x=y).set(
    ylabel='Count',
    title='Target class count after SMOTE',
)
plt.show()

In [None]:
# Created once here; the cross-validation cell refits it on each fold's training split.
scaler = StandardScaler() # Initiate Standardization

## Final Voting Classifier

In [None]:
# 10-fold cross-validation of a hard-voting ensemble (logistic regression,
# random forest, SVM, XGBoost, Gaussian NB).
# Per-fold F1 and accuracy are appended to cv_f1_score_clf / cv_acc_score_clf.
# After the loop, `scaler` and `v_clf` are the objects fitted on the LAST
# fold's training split; the prediction cell further down uses `v_clf`.
# NOTE(review): X and y were oversampled with SMOTE before this split, so
# synthetic rows derived from a validation row may sit in the training fold
# and the scores are likely optimistic - confirm whether resampling should
# happen inside the loop instead.
kf = KFold(n_splits=10, shuffle = True, random_state=42) # Using KFold Cross Validation
cv_f1_score_clf = []
cv_acc_score_clf = []

for i, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    print('{} of KFold {}'.format(i,kf.n_splits))

    # KFold yields row positions, so select with .iloc rather than by label.
    xtr, xvl = X.iloc[train_index], X.iloc[test_index]
    ytr, yvl = y.iloc[train_index], y.iloc[test_index]

    # 'index' is a bookkeeping column left by reset_index(), not a feature.
    xtr = xtr.drop('index', axis=1)
    xvl = xvl.drop('index', axis=1)

    # Standardization: fit on the training split only, then apply to both,
    # limited to the first 103 columns.
    xtr.iloc[:,0:103] = scaler.fit_transform(xtr.iloc[:,0:103])
    xvl.iloc[:,0:103] = scaler.transform(xvl.iloc[:,0:103])

    # Initiating ML models
    clf1 = LogisticRegression(C = 0.615848211066026, penalty = 'l2', random_state=42)
    clf2 = RandomForestClassifier(criterion= 'entropy', max_depth = 14, max_features = 55, min_samples_leaf = 9, n_estimators = 125, random_state = 42)
    clf3 = SVC(C = 100, gamma = 0.01, kernel = 'rbf', random_state = 42)
    clf4 = XGBClassifier(eval_metric='mlogloss', learning_rate = 0.1, max_depth = 5, n_estimators = 140, use_label_encoder=False, random_state  = 42)
    clf5 = GaussianNB(var_smoothing = 0.01873817422860384)

    # Voting Classifier
    v_clf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svm', clf3), ('xgb', clf4), ('nb', clf5)], voting='hard')
    v_clf = v_clf.fit(xtr,ytr)

    # Predict once per fold and score both metrics from the same predictions.
    yvl_pred = v_clf.predict(xvl)
    score_f1_clf = f1_score(yvl, yvl_pred)
    score_acc_clf = accuracy_score(yvl, yvl_pred)
    cv_f1_score_clf.append(score_f1_clf)
    cv_acc_score_clf.append(score_acc_clf)

In [None]:
# Report the arithmetic mean of the per-fold scores, rounded to 3 decimals.
mean_cv_f1 = sum(cv_f1_score_clf) / len(cv_f1_score_clf)
mean_cv_accuracy = sum(cv_acc_score_clf) / len(cv_acc_score_clf)
print('Average CV F1 score = ', round(mean_cv_f1, 3))
print('Average CV accuracy = ', round(mean_cv_accuracy, 3))

In [None]:
# Load the hold-out features that the fitted ensemble will label.
# NOTE(review): presumably the same feature columns as the training file, without the target - confirm.
X_test = pd.read_csv("Ecoli_test.csv") # Read Ecoli_test.csv

In [None]:
# Predict labels for the hold-out set.
# The ensemble was trained on frames whose headers were stripped and whose
# first 103 columns were standardized, so the test features need the same
# treatment first; feeding them in raw puts them on a different scale from
# what the models learned. `scaler` and `v_clf` are the objects left fitted
# by the last cross-validation fold. X_test itself is not modified.
# NOTE(review): assumes Ecoli_test.csv has the training feature columns in the
# same order, no target column and no missing values - confirm.
X_test_prepared = X_test.copy()
X_test_prepared.columns = X_test_prepared.columns.str.strip()
X_test_prepared.iloc[:, 0:103] = scaler.transform(X_test_prepared.iloc[:, 0:103])

y_pred = v_clf.predict(X_test_prepared)

In [None]:
# Shape of the prediction array (one entry per test row).
y_pred.shape

In [None]:
# Distribution of the labels predicted for the hold-out set.
sns.countplot(x=y_pred).set(
    ylabel='Count',
    title='Predicted class Distribution',
)
plt.show()

In [None]:
# Exact count of each predicted label.
np.unique(y_pred, return_counts=True)

In [None]:
# Write the predictions to disk, one value per line.
# fmt='%f' stores each label in fixed-point float notation rather than as an integer.
# NOTE(review): if the consumer expects integer class labels, '%d' may be the better format - confirm.
np.savetxt('s4655782.csv', y_pred, delimiter=",", fmt='%f')

In [3]:
# Read the file written by the savetxt cell back in (it has no header row) to check its contents.
df_x = pd.read_csv("s4655782.csv", header=None)

In [None]:
# Distribution of the labels as stored in the saved predictions file.
sns.countplot(x=df_x.iloc[:, 0]).set(
    ylabel='Count',
    title='Predicted class Distribution',
)
plt.show()

In [None]:
# Count of each label in the saved predictions file.
df_x.iloc[:,0].value_counts()

In [None]:
# Load a second predictions file for comparison with the one written above.
# NOTE(review): no cell visible in this notebook writes "s4655782_new.csv", so this read
# fails on a fresh Restart & Run All unless the file already exists - confirm how it is produced.
df_x_std = pd.read_csv("s4655782_new.csv", header=None)

In [None]:
# Distribution of the labels stored in the second predictions file.
sns.countplot(x=df_x_std.iloc[:, 0]).set(
    ylabel='Count',
    title='Predicted class Distribution',
)
plt.show()

In [None]:
# Count of each label in the second predictions file.
df_x_std.iloc[:,0].value_counts()