# Software Projekt 
## 2023, Klaus Hartmann-Baruffi, Fabio Pfaehler

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 1) Load Data, Choose a Subset

Our aim is to use shallow learners, so using the whole dataset (38161 VOG groups/instances) is not feasible; we therefore work with only a subset.

In [None]:
# Read the tab-separated VOG members table; the first row holds the column names
df = pd.read_csv(
    "/home/dinglemittens/SoftwareProject/VOGDB/vog.members.tsv",
    header=0,
    sep="\t",
)
print("dataset shape: {}\n".format(df.shape))

# Keep rows `start` through `end` of the table (1-based, both ends inclusive)
start, end = 5, 19
subset = df.iloc[start - 1:end]
# Preview the first three rows of the subset
print(subset.head(3))

## 2) Generate Feature- and Label-Vectors

Number of labels/classes (VOG groups) = size of the subset
Number of features/feature dimensions (protein IDs/sequences) = size of the subset * number of proteins per VOG * length of the protein sequence * 20,
where 20 reflects the number of amino acids in a one-hot encoding, since we can't feed the model with string characters.

In [None]:
# Convert the per-group labels (#GroupName) and the per-group protein ID
# collections (ProteinIDs) of the subset into plain Python lists.
group_names = subset["#GroupName"].tolist()
protein_ids = subset["ProteinIDs"].tolist()

# Generate flattened feature (X) and label (y) vectors of equal length:
# one entry per protein ID, labelled with the numeric part of its own group name.
# NOTE: each ProteinIDs cell is one comma-separated collection of protein IDs
# belonging to the group in the same row.
X = []
y = []
# zip pairs every group with the ID collection from its own row; nesting the
# two loops instead would pair every group with the IDs of all groups.
for group, per_group_ids in zip(group_names, protein_ids):
    label = int(group.replace("VOG", ""))
    members = per_group_ids.split(",")
    X.extend(members)
    y.extend([label] * len(members))

## 3) Generate Bio-Embeddings

As we highlighted in the previous step, the dimensionality (i.e. the complexity) of our feature space is extraordinarily high, so we need to reduce the feature size. For this purpose we will use so-called protein embeddings (bio-embeddings) to ...

In [None]:
# Generate protein embeddings using the bio-embeddings module
# TODO: placeholder only; the string below is a no-op, so no embeddings are
# computed in this cell yet.
"""Add code for the use of bio-embeddings"""

## o) Visualization of the Data

## o) Split the Data

In [None]:
# train_test_split is not part of the import cell at the top of this notebook,
# so it is brought into scope here to keep the cell runnable on its own.
from sklearn.model_selection import train_test_split

# Split the data: hold out 20% as a test set, stratified on the labels so both
# parts keep the label proportions of y; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## o) Train a Classifier on the Training Set

In [None]:
# Define the LDA classifier
# TODO: placeholder only; the string below is a no-op, no model object exists yet.
"""Add model object"""

# Train the classifier (model fitting)
# TODO: placeholder only; replace the string with a real fit call once the
# model object above is defined.
"""<model>.fit(X_train, y_train)"""

## o) Prediction on the Validation Set & Accuracy

In [None]:
# accuracy_score is not part of the import cell at the top of this notebook,
# so it is brought into scope here to keep the cell runnable on its own.
from sklearn.metrics import accuracy_score

# Use your model to make a prediction on the test data
"""y_pred = <model>.predict(X_test)"""
# NOTE: y_pred stays undefined until the placeholder string above is replaced
# by a real predict call.

# Compute accuracy of the model and report it rounded to two decimals
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {}".format(round(accuracy, 2)))

## o) Visualization/Plot of Decision Boundaries (?)

# Old Notebook

In [None]:
# Step 1: Import the necessary libraries
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Step 2: Read the tab-separated VOG members table into a pandas DataFrame
# (the first row of the file is used as the header)
df = pd.read_csv(
    "/home/dinglemittens/SoftwareProject/VOGDB/vog.members.tsv",
    header=0,
    sep="\t",
)
# df = pd.read_csv("VOGDB/test.tsv",sep='\t', header=0)
# Show the loaded table
print(df)

In [None]:
# Step 3: Preprocess your data
"""The next step is to pick out the relevant columns of the dataframe: the VOG numbers (labels) and their
corresponding collections of ProteinIDs (features). In addition, each ID must be converted to its sequence
by using the FASTA files.
For the scikit-learn split function we need a feature set X and a label set y (with redundant labels) of the
same size. So far the dataframe is ordered such that each label has a list of proteins,
but we need to resolve it into one big list of proteins, each paired with its label.
(Analogy: so far we have containers of balls (proteins/features) and we know their label (#VOG/container),
because they are separated from the other balls by the container. To continue, we need to merge all
the balls of all containers into one pool; before that, we label each ball with its container number. This pool
can then be split 2 : 8 into a test and a training set. By stratifying (passed as a parameter) we carry the
frequency distribution of the balls of a given container, relative to all balls, over into the two sets (if all
balls of container 1 make up 10% of the total number of balls, then they will also make up
10% of all balls in each of the test and training sets).)
Next, we do not want our features as single strings (sequences) but as numerical vectors, where each
dimension of the vector is an amino acid. The algorithm needs numerical values to learn patterns.
The most straightforward way would be a one-hot encoding, i.e. one feature would be a vector of vectors of
length 20, with 19 zeros and a single one (depending on which letter is considered). We won't use a one-hot
encoding but a different embedding."""

# Select the interval for the subset (from VOG a to VOG b), 1 - 38.161
end = df.shape[0]  # number of rows in the table, i.e. the last VOG

a = 1
b = 180

# Rows a..b (1-based, inclusive): each feature row is the list of protein IDs
# of one VOG, and the label row at the same position is that VOG's group name.
features = df['ProteinIDs'].str.split(',').iloc[a - 1:b]
labels = df['#GroupName'].iloc[a - 1:b]

print("features:\n",features, "\n")
print("labels:\n", labels, "\n")

# Flatten into one sample per protein ID, each carrying the label of its VOG.
X = []
y = []
# Iterate labels and features together by position. Looking labels up as
# labels[i] is a label-based access on the sliced Series and only lines up
# with position i when a == 1.
for vog_label, vog_protein_ids in zip(labels, features):  # for each VOG
    # id2seqvec = vog2fasta_dict(vog_label)
    n_proteins = len(vog_protein_ids)  # one sample per protein ID of this VOG
    y.extend([vog_label] * n_proteins)
    # Placeholder feature value until the ID -> embedding function exists
    X.extend(["add function here that turns ProteinID into sequence embedding"] * n_proteins)

print("X:\n",X[:8], "...\n")
print("y:\n",y[:8], "...\n")

In [None]:
# Step 4: Hold out 20% of the samples as a test set, stratified on the labels
# and with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Preview the first eight entries of each of the four parts
for _title, _part in (
    ("X_train:\n", X_train),
    ("y_train:\n", y_train),
    ("X_test:\n", X_test),
    ("y_test:\n", y_test),
):
    print(_title, _part[:8], "...\n")







In [None]:
# Step 5: Choose a machine learning algorithm to use
# A logistic regression classifier is created without arguments, i.e. with
# the library's default settings.
model = LogisticRegression()

In [None]:

# Step 6: Train the model on the training data
# NOTE(review): Step 3 fills X with a placeholder string per protein, so
# X_train presumably still holds strings here; the fit most likely needs
# numeric feature vectors. Confirm once the embedding function is in place.
model.fit(X_train, y_train)

In [None]:
# Write the 'ProteinIDs' column of the training features to an Excel file,
# then expand the comma-separated IDs into one column per ID and print them.
# NOTE(review): Step 4 builds X_train from the Python list X, so it presumably
# has no `.loc` accessor at this point; this cell looks like it expects
# X_train to be a DataFrame with a 'ProteinIDs' column. Confirm before running.
proteinids = X_train.loc[:, 'ProteinIDs']
new_df = pd.DataFrame({'ProteinIDs': proteinids})
new_df.to_excel('./vog_proteins.xlsx', index=False)

# Split each comma-separated ID string into separate columns
new_df = new_df['ProteinIDs'].str.split(',', expand=True)
print(new_df)


In [None]:

# Step 7: Evaluate the model's performance on the testing data
# Predict labels for the held-out test features
y_pred = model.predict(X_test)
# Compare the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# Step 8: Tune the model's hyperparameters to improve its performance
# For example, you could use GridSearchCV to search over a range of hyperparameters

In [None]:

# Step 9: Use the model to make predictions on new data
# For example, you could use model.predict(new_data) to make predictions on new data