In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, BaggingClassifier, VotingClassifier

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")
# Lift pandas' display truncation so frames are rendered with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Imputation

In [2]:
def imputation_full(df, numerical_columns, nominal_columns):
    """Fill missing values using statistics computed over the whole frame.

    Numerical columns are filled with the column mean, nominal columns with
    the column's most frequent value. Columns in neither list are untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    numerical_columns, nominal_columns : list of str
        Names of the columns to treat as numerical / nominal.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled where possible.
    """
    for col in df.columns:
        if col in numerical_columns:
            # Assign the result back instead of calling fillna(inplace=True) on
            # the df[col] selection: the in-place form acts on a temporary object
            # and does not update the frame under pandas Copy-on-Write.
            df[col] = df[col].fillna(df[col].mean())
        elif col in nominal_columns:
            modes = df[col].mode()
            # mode() is empty when the column has no observed value at all;
            # there is nothing to impute with, so leave the column as it is.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])
    return df

In [3]:
def imputation_class(df, numerical_columns, nominal_columns):
    """Fill missing values using statistics computed separately per target class.

    For every value of ``"Target (Col 107)"``, missing entries of a numerical
    column are filled with that class's mean and missing entries of a nominal
    column with that class's most frequent value. Columns in neither list are
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; must contain ``"Target (Col 107)"``. It is modified
        in place and also returned.
    numerical_columns, nominal_columns : list of str
        Names of the columns to treat as numerical / nominal.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled where possible.
    """
    target = "Target (Col 107)"
    class_values = list(df[target].unique())

    for col in df.columns:
        is_numerical = col in numerical_columns
        if not is_numerical and col not in nominal_columns:
            continue

        for c in class_values:
            in_class = df[target] == c
            missing = in_class & df[col].isnull()
            if not missing.any():
                # Nothing to fill for this class (this also covers a NaN class
                # label, which never compares equal to itself).
                continue

            observed = df.loc[in_class, col]
            if is_numerical:
                fill_value = observed.mean()
            else:
                modes = observed.mode()
                # mode() is empty when the class has no observed value in this
                # column; indexing [0] would raise, so skip instead.
                if modes.empty:
                    continue
                fill_value = modes.iloc[0]

            df.loc[missing, col] = fill_value
    return df

# Standardization

In [4]:
def min_max(df, numerical_columns):
    """Rescale each numerical column to the [0, 1] range (min-max normalisation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rescale. It is modified in place and also returned.
    numerical_columns : list of str
        Names of the columns to rescale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns rescaled. A constant
        column is mapped to 0.0 instead of NaN.
    """
    if len(numerical_columns) == 0:
        return df

    def _rescale(column):
        low = column.min()
        span = column.max() - low
        if span == 0:
            # Constant column: (x - min) / 0 would turn every value into NaN.
            # Map it to 0.0 instead (missing entries stay missing).
            return (column - low).astype(float)
        return (column - low) / span

    df[numerical_columns] = df[numerical_columns].apply(_rescale)
    return df

In [5]:
def z_score(df, numerical_columns):
    """Standardise each numerical column to zero mean and unit sample std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to standardise. It is modified in place and also returned.
    numerical_columns : list of str
        Names of the columns to standardise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns standardised. A column
        whose standard deviation is zero is mapped to 0.0 instead of NaN.
    """
    if len(numerical_columns) == 0:
        return df

    def _standardise(column):
        centred = column - column.mean()
        spread = column.std()
        if spread == 0:
            # Constant column: dividing by a zero std would yield NaN for every
            # row; the centred values are already all 0.0.
            return centred
        return centred / spread

    df[numerical_columns] = df[numerical_columns].apply(_standardise)
    return df

# Outlier Detection

In [6]:
def isolation_forest(df, numerical_columns, nominal_columns):
    """Drop the rows that an Isolation Forest labels as outliers.

    The forest is fitted on the numerical and nominal columns with a
    contamination of 0.2 and a fixed random seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    numerical_columns, nominal_columns : list of str
        Columns the detector is fitted on.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows predicted as inliers.
    """
    feature_cols = list(numerical_columns) + list(nominal_columns)
    model = IsolationForest(n_estimators=1000, max_samples='auto', contamination=0.2, random_state=42)

    # fit_predict returns +1 for inliers and -1 for outliers. Filtering with a
    # mask avoids writing a temporary "anomaly_score" column into the caller's frame.
    labels = model.fit_predict(df[feature_cols])
    return df[labels == 1]

In [7]:
def statistical(df, numerical_columns, nominal_columns):
    """Drop rows that lie more than two standard deviations from a column mean.

    A row is treated as an outlier when, for at least one numerical column,
    its value falls outside ``mean +/- 2 * std`` of that column (statistics are
    computed over the full input frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    numerical_columns : list of str
        Columns checked for outliers.
    nominal_columns : list of str
        Unused; accepted so the signature matches the other outlier detectors.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows with no out-of-range value.
    """
    # Track outliers in a local mask rather than a temporary "anomaly_score"
    # column, so the caller's frame is left untouched.
    is_outlier = np.zeros(len(df), dtype=bool)
    for col in numerical_columns:
        centre = df[col].mean()
        margin = 2 * df[col].std()
        out_of_range = (df[col] < centre - margin) | (df[col] > centre + margin)
        is_outlier |= out_of_range.to_numpy()

    return df[~is_outlier]

In [8]:
def lof(df, numerical_columns, nominal_columns):
    """Drop the rows that Local Outlier Factor labels as outliers.

    The detector uses 100 neighbours and is fitted on the numerical and
    nominal columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    numerical_columns, nominal_columns : list of str
        Columns the detector is fitted on.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows predicted as inliers.
    """
    feature_cols = list(numerical_columns) + list(nominal_columns)
    model = LocalOutlierFactor(n_neighbors=100, novelty=False)

    # fit_predict returns +1 for inliers and -1 for outliers. Filtering with a
    # mask avoids writing a temporary "anomaly_score" column into the caller's frame.
    labels = model.fit_predict(df[feature_cols])
    return df[labels == 1]

# Feature Selection

In [9]:
def feature_columns(df):
    df_col = df.corr().loc["Target (Col 107)"].to_frame()
    df_col["Target (Col 107)"] = df_col["Target (Col 107)"].abs()
    df_col = list(df_col[df_col["Target (Col 107)"] > 0.1].index)
    return df[df_col]

In [10]:
def feature_no(df):
    """Pass-through feature selection: hand the frame back with every column kept."""
    return df

# Model Classifiers

In [11]:
def dtree(df):
    """Return the mean F1 score of a shallow decision tree under 10-fold stratified CV.

    ``df`` must contain the features plus the "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    estimator = DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, random_state=42)

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

In [12]:
def rforest(df):
    """Return the mean F1 score of a random forest under 10-fold stratified CV.

    ``df`` must contain the features plus the "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    estimator = RandomForestClassifier(
        n_estimators=165,
        criterion="entropy",
        max_features=None,
        n_jobs=-1,
        random_state=42,
    )

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

In [13]:
def gnaivebayes(df):
    """Return the mean F1 score of Gaussian naive Bayes under 10-fold stratified CV.

    ``df`` must contain the features plus the "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    estimator = GaussianNB()

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

In [14]:
def knn(df):
    """Return the mean F1 score of a 3-nearest-neighbour classifier under 10-fold stratified CV.

    ``df`` must contain the features plus the "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    estimator = KNeighborsClassifier(n_neighbors=3, metric="manhattan")

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

In [15]:
def baggingensemble(df):
    """Return the mean F1 score of a bagged 3-NN ensemble under 10-fold stratified CV.

    ``df`` must contain the features plus the "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    # 100 Manhattan-distance 3-NN members, each trained on 70% of the samples.
    base_learner = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
    estimator = BaggingClassifier(
        base_learner,
        n_estimators=100,
        max_samples=0.7,
        oob_score=True,
        random_state=42,
    )

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

In [16]:
def voting(df):
    """Return the mean F1 score of a hard-voting ensemble under 10-fold stratified CV.

    The ensemble combines a random forest, Gaussian naive Bayes and a bagged
    3-NN classifier. ``df`` must contain the features plus the
    "Target (Col 107)" label column.
    """
    target_name = "Target (Col 107)"
    features = df.drop(columns=[target_name])
    labels = df[target_name]

    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    forest = RandomForestClassifier(
        n_estimators=165,
        criterion="entropy",
        max_features=None,
        n_jobs=-1,
        random_state=42,
    )
    naive_bayes = GaussianNB()
    bagged_knn = BaggingClassifier(
        KNeighborsClassifier(n_neighbors=3, metric="manhattan"),
        n_estimators=100,
        max_samples=0.7,
        oob_score=True,
        random_state=42,
    )

    members = [('rf', forest), ('nb', naive_bayes), ('knn', bagged_knn)]
    estimator = VotingClassifier(estimators=members, voting='hard')

    fold_scores = cross_val_score(estimator, features, labels, cv=folds, scoring="f1")
    return fold_scores.mean()

# Main Function

In [17]:
# Reading data
main_df = pd.read_csv("./Ecoli.csv")

# Dropping columns that have a high percentage of nulls (more than 10% missing)
high_null_col_df = (main_df.isnull().sum() * 100 / len(main_df)).to_frame()
high_null_col = list(high_null_col_df[high_null_col_df[0] > 10].index)
main_df = main_df.drop(high_null_col, axis=1)

# Defining numerical columns, nominal columns and target values.
# This is done AFTER the drop above so the lists never name a column that no
# longer exists (which would raise a KeyError in the preprocessing functions).
numerical_columns = [i for i in main_df.columns if "Num" in i]
nominal_columns = [i for i in main_df.columns if "Nom" in i]
class_values = list(main_df["Target (Col 107)"].unique())

# Results are collected as plain records and turned into a DataFrame once;
# growing a frame row by row is quadratic and DataFrame.append no longer
# exists in pandas >= 2.0.
result_columns = ["f1", "imputation_method", "standardization", "outlier_detection", "feature_selection", "Classifier"]
result_records = []

# Performing pre-processing: every combination of imputation, standardization,
# outlier detection, feature selection and classifier is scored once.
# Each stage works on a copy so the output of the previous stage can be reused.
for impute in [imputation_full, imputation_class]:
    impute_df = impute(main_df.copy(), numerical_columns, nominal_columns)

    for standardization in [min_max, z_score]:
        standard_df = standardization(impute_df.copy(), numerical_columns)

        for outlier in [isolation_forest, statistical, lof]:
            outlier_df = outlier(standard_df.copy(), numerical_columns, nominal_columns)

            for feature in [feature_columns, feature_no]:
                feature_df = feature(outlier_df.copy())

                for classifier in [dtree, rforest, gnaivebayes, knn, baggingensemble, voting]:
                    score_cv = classifier(feature_df.copy())
                    result_records.append({"f1" : score_cv,
                                           "imputation_method" : impute.__name__,
                                           "standardization" : standardization.__name__,
                                           "outlier_detection" : outlier.__name__,
                                           "feature_selection": feature.__name__,
                                           "Classifier" : classifier.__name__})

result_df = pd.DataFrame(result_records, columns = result_columns)

# Best F1 first; ties are broken by feature selection, standardization and imputation name.
result_df = result_df.sort_values(by=["f1", "feature_selection", "standardization", "imputation_method"],
                                  ascending = [False, False, True, True])
result_df = result_df.reset_index(drop=True)

print("Top 10 classifiers methods and preprocessing techniques:")
display(result_df.head(10))

Top 10 classifiers methods and preprocessing techniques:


Unnamed: 0,f1,imputation_method,standardization,outlier_detection,feature_selection,Classifier
0,0.869337,imputation_class,min_max,isolation_forest,feature_no,knn
1,0.865223,imputation_class,min_max,isolation_forest,feature_no,baggingensemble
2,0.86274,imputation_class,min_max,lof,feature_columns,voting
3,0.858947,imputation_class,min_max,lof,feature_columns,baggingensemble
4,0.856839,imputation_class,min_max,isolation_forest,feature_no,voting
5,0.856839,imputation_class,min_max,isolation_forest,feature_columns,gnaivebayes
6,0.854131,imputation_class,z_score,isolation_forest,feature_columns,gnaivebayes
7,0.852815,imputation_class,z_score,isolation_forest,feature_no,rforest
8,0.852815,imputation_class,z_score,isolation_forest,feature_no,voting
9,0.844792,imputation_class,min_max,lof,feature_columns,dtree


In [18]:
#Make Predictions on test data, accuracy and f1 score on training data
print("Best Techniques Identified:")
result_dict = result_df.iloc[0].to_dict()
print(result_dict)

# Explicit name -> function table for the preprocessing steps recorded in
# result_df (replaces the fragile locals()[...] lookup).
preprocessing_steps = {step.__name__: step for step in (
    imputation_full, imputation_class,
    min_max, z_score,
    isolation_forest, statistical, lof,
    feature_columns, feature_no,
)}

# Work on a copy: the imputation/standardization functions modify their input
# in place, and main_df must stay intact so this cell can be re-run safely.
final_df = preprocessing_steps[ result_dict["imputation_method"] ](main_df.copy(), numerical_columns, nominal_columns)
final_df = preprocessing_steps[ result_dict["standardization"] ](final_df, numerical_columns)
final_df = preprocessing_steps[ result_dict["outlier_detection"] ](final_df, numerical_columns, nominal_columns)
final_df = preprocessing_steps[ result_dict["feature_selection"] ](final_df)

X = final_df.drop("Target (Col 107)", axis = 1)
y = final_df["Target (Col 107)"]

# The final estimator is hard-coded to the KNN configuration used by knn();
# make it visible when the ranking above picked a different classifier.
if result_dict["Classifier"] != "knn":
    print("Note: best-ranked classifier is", result_dict["Classifier"],
          "but the final model below is the hard-coded KNN.")

cv = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)
model = KNeighborsClassifier(n_neighbors = 3, metric = "manhattan")
# Fit on all preprocessed training rows; this fitted model is what later cells use to predict.
model.fit(X, y)

# NOTE(review): both scores are 10-fold cross-validation estimates computed on
# the training data (X, y), even though the printed labels say "Test Data".
accuracy_sc = str(round(cross_val_score(model, X, y, cv = cv, scoring = "accuracy").mean(), 3))
f1_sc = str(round(cross_val_score(model, X, y, cv = cv, scoring = "f1").mean(), 3))
print("Accuracy calculated on Test Data", accuracy_sc, "\nF1 calculated on Test Data", f1_sc)

Best Techniques Identified:
{'f1': 0.8693370681605975, 'imputation_method': 'imputation_class', 'standardization': 'min_max', 'outlier_detection': 'isolation_forest', 'feature_selection': 'feature_no', 'Classifier': 'knn'}
Accuracy calculated on Test Data 0.982 
F1 calculated on Test Data 0.869


In [19]:
#Outputting final predictions on test data
test_file = pd.read_csv("./Ecoli_test.csv")

# NOTE(review): the test file is passed to the model exactly as read; none of
# the preprocessing applied to the training data (imputation, standardization,
# feature selection) is applied here. Confirm the file is already prepared.
predicted = pd.Series(model.predict(test_file)).astype(str)

# One prediction per row in the first column, second column left blank.
output_df = pd.DataFrame({"0": predicted, "1": ""})

# Last row carries the cross-validated accuracy and F1 score.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
summary_row = pd.DataFrame([[accuracy_sc, f1_sc]], columns = output_df.columns)
output_df = pd.concat([output_df, summary_row])
output_df.to_csv("./s4761003.csv", header = False, index = False)