In [None]:
# Import all the libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import re

In [None]:
# Read the bacteria dataset from a CSV file in the working directory and
# print its first row as a quick sanity check.

df = pd.read_csv("bacteria.csv")
print(df.head(1))

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Show the (rows, columns) dimensions of the raw frame.
df.shape

In [None]:
# Count how many rows carry each distinct label in the raw data.
df["Harmful to Humans"].value_counts()

In [None]:
# Keep the first row for each distinct "Name". The explicit .copy() makes
# df_clean an independent frame, so the column assignments performed on it in
# later cells do not raise SettingWithCopyWarning.
df_clean = df.drop_duplicates(subset='Name').copy()
df_clean.shape

In [None]:
# List the column names of the de-duplicated frame.
print(df_clean.columns)

In [None]:
# Strip leading and trailing whitespace from every label; characters inside
# the label are left untouched.
df_clean["Harmful to Humans"] = df_clean["Harmful to Humans"].str.strip()

In [None]:
# Re-count the labels after stripping whitespace.
df_clean["Harmful to Humans"].value_counts()

In [None]:
# Print the first row of the cleaned frame.
print(df_clean.head(1))

In [None]:
# One-hot encode the label column; drop_first=True drops the indicator column
# of the first category and keeps the remaining ones.
# NOTE(review): presumably the labels are only 'Yes'/'No', which would leave a
# single 'Harmful to Humans_Yes' indicator — confirm against the data.
# df_encoded is not referenced again in the visible cells.
df_encoded = pd.get_dummies(df_clean, columns=['Harmful to Humans'], drop_first=True)

df_encoded.head()

In [None]:
# Convert the Yes/No entries in the column to 1/0.
# The mapping also carries 1 and 0 through unchanged, so re-running this cell
# leaves already-converted labels intact instead of turning them all into NaN.
harmful_codes = df_clean['Harmful to Humans'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})

# Fail here, naming the offending values, rather than later inside model.fit
# with NaN labels.
if harmful_codes.isna().any():
    unexpected = df_clean.loc[harmful_codes.isna(), 'Harmful to Humans'].unique()
    raise ValueError(f"Unexpected 'Harmful to Humans' labels: {list(unexpected)}")

df_clean['Harmful to Humans'] = harmful_codes.astype('int64')

# View the updated DataFrame
df_clean.head()

In [None]:
# Feature frame for the one-hot approach: df_clean without the target column.
df_clean_drop = df_clean.drop(columns=["Harmful to Humans"])

In [None]:
# Method 1: One-Hot Encoding

In [None]:
# One-hot encode the three text columns so that every distinct value becomes
# its own indicator column.
x = pd.get_dummies(
    data=df_clean_drop,
    columns=["Name", "Family", "Where Found"],
)

print(x.head(1))

In [None]:
# Inspect the dtype of every encoded column.
x.dtypes

In [None]:
# The encoded columns were found to be boolean instead of integer, so cast the
# whole frame to integer.

x = x.astype(int)
print(x.head(1))

In [None]:
# Target vector: the "Harmful to Humans" column of df_clean.
y = df_clean['Harmful to Humans']
print(y.head(1))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting accuracy.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("SVM", SVC(kernel='linear', random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    ),
])

In [None]:
# Fit every candidate on the one-hot features and report its test accuracy.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"Model: {name}", f"Accuracy: {accuracy}", sep="\n")

In [None]:
# Method 2: CountVectorizer

In [None]:
# Print the first row of df_clean before building the text features.
print(df_clean.head(1))

In [None]:
# Feature frame for the CountVectorizer approach: df_clean without the target
# column.
df_final = df_clean.drop(columns=["Harmful to Humans"])
print(df_final.head(1))

In [None]:
# Show the (rows, columns) dimensions of the feature frame.
df_final.shape

In [None]:
# Count the missing values in each feature column.
print(df_final.isnull().sum())

In [None]:
def custom_tokenizer(text):
    """Split ``text`` into word runs and individual punctuation marks.

    Each maximal run of word characters becomes one token, and every other
    non-whitespace character (a comma, for example) becomes a token of its
    own. Whitespace is discarded.
    """
    token_pattern = re.compile(r'\w+|\S')
    return token_pattern.findall(text)

In [None]:
# Concatenate the three text columns into one document per row.
# NOTE(review): the columns are joined with no separator, so the last word of
# one column fuses with the first word of the next into a single token. If a
# separator is added here, `bacteria` must build its features the same way.
df_final['features'] = df_final['Name'] + df_final['Family'] + df_final['Where Found']

# token_pattern=None: the custom tokenizer replaces the default pattern, and
# leaving the default in place makes scikit-learn warn that it is unused.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, token_pattern=None)
z = vectorizer.fit_transform(df_final['features']).toarray()

In [None]:
# Print the dense token-count matrix produced by the vectorizer.
print(z)

In [None]:
# Split the token-count features with the same test fraction and seed as the
# one-hot experiment; y_train and y_test are reassigned here.
z_train, z_test, y_train, y_test = train_test_split(
    z,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fresh, unfitted copies of the candidate classifiers for the token-count
# features, keyed by the display name used when reporting accuracy.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("SVM", SVC(kernel='linear', random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5, random_state=42)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    ),
])

In [None]:
# Fit every candidate on the token-count features and report its test accuracy.
for name, model in models.items():
    y_pred = model.fit(z_train, y_train).predict(z_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"Model: {name}", f"Accuracy: {accuracy}", sep="\n")

In [None]:
# Create a predictive system

In [None]:
def bacteria(bacteria_csv, fitted_model=None, fitted_vectorizer=None):
    """Predict whether each bacterium listed in a CSV file is harmful.

    The rows of the file are turned into token counts with an already-fitted
    vectorizer and scored by an already-trained classifier; one verdict line
    is printed per row.

    Parameters
    ----------
    bacteria_csv : str
        Path to a CSV file with 'Name', 'Family' and 'Where Found' columns.
    fitted_model : optional
        Trained classifier exposing ``predict``. Defaults to the notebook's
        ``models["Gradient Boosting"]``, the last model fitted in the
        CountVectorizer training loop.
    fitted_vectorizer : optional
        Fitted vectorizer exposing ``transform``. Defaults to the notebook's
        ``vectorizer``.

    Returns
    -------
    None
        The verdicts are printed, not returned.
    """
    # Fall back to the objects trained earlier in the notebook.
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer
    if fitted_model is None:
        fitted_model = models["Gradient Boosting"]

    df_test = pd.read_csv(bacteria_csv)

    # Build the documents exactly as the training features were built, so the
    # tokens line up with the fitted vocabulary.
    features = df_test['Name'] + df_test['Family'] + df_test['Where Found']

    # transform, not fit_transform: re-fitting would create a new vocabulary
    # whose columns no longer match what the model was trained on.
    t = fitted_vectorizer.transform(features).toarray()
    pred = fitted_model.predict(t)

    for label in pred:
        if label == 1:
            print("This bacteria is harmful")
        else:
            print("This bacteria is not harmful")

In [None]:
# Run the prediction helper on bacteria_test.csv.
bacteria_csv = "bacteria_test.csv"
bacteria(bacteria_csv)