In [None]:
# Import all required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Load the dataset from the working directory and preview its first row.
df = pd.read_csv("mice_protein_expression.csv")
print(df.head(1))

In [None]:
# (rows, columns) of the table as loaded
df.shape

In [None]:
# Build a new frame that keeps only the first row for each distinct "MouseID",
# then show its shape so it can be compared with df.shape.
# NOTE(review): df_un is not referenced again in the visible cells; later steps keep using df.
df_un = df.drop_duplicates(subset = "MouseID")
df_un.shape

In [None]:
# Total number of missing cells: per-column null counts, summed over all columns
df.isnull().sum().sum()

In [None]:
# Impute missing values: each numeric column is filled with its own column mean.
column_means = df.select_dtypes(include='number').mean()
df = df.fillna(column_means)

In [None]:
# Re-count missing cells after the mean imputation
df.isnull().sum().sum()

In [None]:
# Keep only the columns with a numeric dtype and show the resulting shape.
# NOTE(review): numerical_df is not referenced again in the visible cells.
numerical_df = df.select_dtypes(include='number')
numerical_df.shape

In [None]:

# Encode the categorical feature columns as integer codes.
# Each column gets its own fitted LabelEncoder, stored in `feature_encoders`, so the
# string <-> code mapping of every column stays available afterwards. Re-fitting one
# shared encoder would only retain the classes of the column fitted last.
feature_encoders = {}
for column in ['Genotype', 'Treatment', 'Behavior']:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    feature_encoders[column] = label_encoder

# Encode the target; `label_mapping` holds the original labels in code order.
df['class'], label_mapping = pd.factorize(df['class'])

# Create a dictionary for reference: integer code -> original class label.
outcome_mapping = dict(enumerate(label_mapping))

# Preview only the first rows instead of dumping the whole frame into the cell output.
print("DataFrame with numerical outcome:\n", df.head())
print("\nMapping of words to numbers:\n", outcome_mapping)

In [None]:
# First row of df after encoding
print(df.head(1))

In [None]:
# Target vector: the "class" column of df.
y = df["class"]

# Feature table: df without the "class" column and without the "MouseID" column.
z = df.drop(columns=["class"])
x = z.drop(columns=["MouseID"])
# NOTE(review): Genotype, Treatment and Behavior remain in x. If "class" is derived from
# those three columns, they leak the target into the features — confirm against the data.

In [None]:
# First row of the feature table
print(x.head(1))

In [None]:
# First entry of the target vector
print(y.head(1))

In [None]:
# Distinct values present in the target
y.unique()

In [None]:
# Standardise the data

In [None]:
# Fit the scaler on the full feature table.
# NOTE(review): this happens before the train/test split, so rows that later land in the
# test set contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(x)

In [None]:
# Apply the fitted scaler to the feature table and print the result.
scaled_data = scaler.transform(x)
print(scaled_data)

In [None]:
# Reduce the dimension using t-SNE

In [None]:
# Configure a 3-component t-SNE with a fixed random_state so repeated runs use the same seed.
tsne = TSNE(n_components=3, random_state=42)

In [None]:
# Embed the scaled data. The embedding is computed from every row at once,
# before any train/test split is made.
x_tsne = tsne.fit_transform(scaled_data)

In [None]:
# Print the embedded coordinates
print(x_tsne)

In [None]:
# Scatter the first two t-SNE components against each other.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_tsne[:, 0], x_tsne[:, 1], c='blue', marker='o', edgecolor='k')

# Title and axis labels
ax.set_title('t-SNE Visualization of Protein Levels in Mice')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')

plt.show()

In [None]:
# Split the data into training and test sets

In [None]:
# Hold out 20% of the embedded rows for testing; random_state fixes the shuffle seed.
# NOTE(review): x_tsne was computed from all rows, so the held-out rows already
# influenced the features the models are trained on.
X_train, X_test, y_train, y_test = train_test_split(x_tsne, y, test_size=0.2, random_state=42)

In [None]:
# Training the model

In [None]:
# Candidate classifiers keyed by the display name used when reporting results.
# Every estimator that accepts random_state is seeded with 42.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
}

In [None]:
# Fit each candidate on the training split and report its accuracy on the test split.
# After the loop, `model` and `y_pred` stay bound to the last entry of `models`.
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"Model: {name}", f"Accuracy: {accuracy}", sep="\n")

In [None]:
# Create a predictive model

In [None]:
def mice(input_csv, classifier=None):
    """Predict and print the class of every row in a CSV of mouse measurements.

    The file is pushed through the same steps the notebook applies before training:
    mean-impute numeric columns, integer-encode Genotype/Treatment/Behavior, drop
    "MouseID", standardise, reduce to 3 t-SNE components, then classify.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file with "MouseID", "Genotype", "Treatment" and "Behavior" columns plus
        the numeric feature columns. A "class" column, if present, is ignored.
        (Presumably the columns should match the training file — confirm.)
    classifier : object with a ``predict`` method, optional
        Fitted estimator to use. When omitted, the notebook-level ``model`` variable
        is used, i.e. whichever estimator the training loop bound last.

    Returns
    -------
    The array returned by ``classifier.predict``: one integer class code per input
    row. Each code is also printed together with its label from ``outcome_mapping``.

    Raises
    ------
    KeyError
        If the "MouseID", "Genotype", "Treatment" or "Behavior" column is missing.

    Notes
    -----
    NOTE(review): the encoders, the scaler and t-SNE are all re-fitted on the input
    file, so the 3-D coordinates produced here are not in the same space as the
    embedding the classifier was trained on (scikit-learn's TSNE has no ``transform``
    for unseen rows). Treat the predictions as unreliable until training uses a
    reduction that can transform new data (e.g. PCA) or the scaled features directly.
    t-SNE also presumably needs more rows than its perplexity (default 30) — confirm
    against the installed scikit-learn version.
    """
    if classifier is None:
        # Fall back to the notebook global left behind by the training loop.
        classifier = model

    read = pd.read_csv(input_csv)

    # Mirror the training-time imputation; t-SNE cannot handle missing values.
    read = read.fillna(read.select_dtypes(include='number').mean())

    for column in ('Genotype', 'Treatment', 'Behavior'):
        read[column] = LabelEncoder().fit_transform(read[column])

    # Bug fix: the features must be derived from `read`; `read_drop` did not exist yet.
    read_drop = read.drop(columns="MouseID")
    # The target is not a feature; tolerate files that still carry it.
    read_drop = read_drop.drop(columns="class", errors="ignore")

    scaled_data = StandardScaler().fit_transform(read_drop)
    read_tsne = TSNE(n_components=3, random_state=42).fit_transform(scaled_data)
    read_pred = classifier.predict(read_tsne)

    # Bug fix: report the predictions for this file, not the global test-split `y_pred`.
    for pred in read_pred:
        print(f"Prediction:{pred},   Meaning:{outcome_mapping[pred]}")

    return read_pred