<h4> Zadanie1 (2pkt, RNN/LSTM vs klasyczna reprezentacja)

- Rozważ plik all_chem_df.csv, zawierający pewne leki (w formacie SMILES) i informację o obszarze działania. Wybierz 4 najczęstsze targety. W oparciu o reprezentację One-hot-encoding oraz sieci rekurencyjne zbuduj klasyfikator. Wydziel zbiór treningowy i testowy (a być może także walidacyjny).

Cześć I

In [5]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import numpy as np
from sklearn.model_selection import train_test_split

In [4]:
import pandas as pd

df = pd.read_csv("all_chem_df.csv", sep = ",")

from collections import Counter
print(Counter(df["tags"]).most_common(4))

df2 = df[df['tags'].isin(['antiinfective', 'antineoplastic', 'cns','cardio'])]

X = list(df2["smiles"])
y = list(df2["tags"])
# zlaczone = "".join(list(df2["smiles"]))

print(X[:5])
print(y[:5])

[('antiinfective', 2412), ('antineoplastic', 1175), ('cns', 1149), ('cardio', 797)]
['CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(NS(=O)(=O)c3ccc(C(F)(F)F)cn3)c2)C(=O)O1', 'CCCCC(C)C(=O)OC1C(C)C(CC)OC2(CC3CC(C/C=C(\\C)CC(C)/C=C/C=C4\\COC5C(O)C(C)=CC(C(=O)O3)C45O)O2)C1O', 'COc1cc2c(c(OC)c1OC)-c1c(cc3c(c1OC)OCO3)C[C@H](C)[C@@](C)(O)C2', 'CC(=O)OCC(=O)C1CCC2C3CCC4CC(O)CCC4(C)C3C(=O)CC12C', 'CC(=O)Nc1nnc(S(N)(=O)=O)s1']
['antiinfective', 'antiinfective', 'antineoplastic', 'cns', 'cns']


In [5]:
all_letters = sorted(list(set("".join(X))))
char_to_idx = {ch: i for i, ch in enumerate(all_letters)} #enumerate(all_letters) daje pary (index, char)
n_letters = len(all_letters)

def text_to_onehot(text):
    seq = torch.zeros(len(text), n_letters) #(rows, cols)
    for i, ch in enumerate(text):
        seq[i, char_to_idx[ch]] = 1.0
    return seq

In [6]:
class ListDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        return text_to_onehot(self.data[idx]), torch.tensor(self.targets[idx], dtype=torch.long)

#do obslugi sekwencji o roznej dlugosci
def collate_fn(batch):
    # batch = [(seq1, label1), (seq2, label2), ...]
    sequences, labels = zip(*batch) # rozpakowujemy na dwie listy
    lengths = torch.tensor([len(seq) for seq in sequences]) #wymiar bez paddingu
    padded = pad_sequence(sequences, batch_first=True)  #wymiar: (batch, max_len, vocab_size)
    labels = torch.stack(labels)
    return padded, labels, lengths

In [7]:
indices = np.arange(len(X)) #dzielimy na podstawie indeksów zeby przyśpieszyć podział i stratyfikować
train_ind, test_ind = train_test_split(indices, test_size=0.3, random_state=42, stratify=y) #stratify=y - podział z zachowaniem proporcji klas w y

#zmiana etykiet na numery bo Pytorch wymaga etykiet numerycznych
uni_classes = sorted(list(set(y)))
class_to_idx = {c: i for i, c in enumerate(uni_classes)}

y_idx = [class_to_idx[c] for c in y]

train_targets = [y_idx[i] for i in train_ind]
test_targets = [y_idx[i] for i in test_ind]


In [8]:
uni, counts = np.unique(train_targets, return_counts=True)
weight_per_class = {u: len(train_targets)/c for u, c in zip(uni, counts)}
weights = [weight_per_class[c] for c in train_targets]
weights = torch.DoubleTensor(weights)  # wymagane przez WeightedRandomSampler, bez tego miałam błąd
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)

train_dataset = ListDataset([X[i] for i in train_ind], train_targets)
test_dataset = ListDataset([X[i] for i in test_ind], test_targets)

train_loader = DataLoader(train_dataset, batch_size=4, sampler=sampler, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

In [9]:
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, hidden = self.rnn(packed)
        out = self.fc(hidden.squeeze(0))
        return out
#budujemy model
num_classes = len(set(y))
model = RNNClassifier(input_size=n_letters, hidden_size=16, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

#Trening
for epoch in range(4):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

def evaluate(model, loader, topk=(1,3)):
    model.eval()
    correct = {k: 0 for k in topk}
    total = 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in loader:
            outputs = model(X_batch, lengths)
            total += y_batch.size(0)
            for k in topk:
                _, pred = outputs.topk(k, dim=1)
                correct[k] += (pred == y_batch.view(-1, 1)).any(dim=1).sum().item()

    for k in topk:
        acc = 100 * correct[k] / total
        print(f"Top-{k} accuracy: {acc:.2f}%")

evaluate(model, test_loader)

Epoch 1, Loss: 1.3703
Epoch 2, Loss: 1.2926
Epoch 3, Loss: 1.2604
Epoch 4, Loss: 1.3086
Top-1 accuracy: 32.05%
Top-3 accuracy: 77.17%


Część II

- W pliku drugs_prop.txt występują te same leki, ale tym razem podano ich wybrane własności fizykochemiczne. Zbuduj klasyfikator w oparciu o regresję logistyczną lub/oraz SVM. Wydziel zbiór treningowy i testowy.
- Porównaj efektywność zbudowanych modeli. Skomentuj otrzymane wyniki. Wskaż plusy i minusy obydwu rozwiązań.

In [10]:
df_drugs = pd.read_csv("drugs_prop.txt", sep = ',')
print(df_drugs.columns)

y = df_drugs[' target']
X = df_drugs.drop(columns=[' target'])

print(X.head())
print(y.head())

Index(['mass', ' logp', ' h_d', ' h_a', ' rot_b', ' tpsa', ' target'], dtype='object')
     mass   logp   h_d   h_a   rot_b    tpsa
0  602.68   7.33     2     6      11  105.59
1  686.88   5.24     3    10       6  140.98
2  416.47   3.60     1     7       4   75.61
3  390.52   3.32     1     5       3   80.67
4  222.25  -0.86     2     6       2  115.04
0      antiinfective
1      antiinfective
2     antineoplastic
3                cns
4                cns
Name:  target, dtype: object


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train) #parametry standaryzacji wyznaczane są na zbiorze treningowym
X_test_std = stdsc.transform(X_test)

lr = LogisticRegression(solver='lbfgs', class_weight='balanced')
lr.fit(X_train_std, y_train)

print(classification_report(y_test, lr.predict(X_test_std)))

                 precision    recall  f1-score   support

  antiinfective       0.71      0.48      0.57       739
 antineoplastic       0.29      0.23      0.26       340
         cardio       0.33      0.36      0.35       247
            cns       0.40      0.75      0.52       334

       accuracy                           0.47      1660
      macro avg       0.44      0.46      0.43      1660
   weighted avg       0.51      0.47      0.47      1660



In [12]:
#top 3 accuracy
probs = lr.predict_proba(X_test_std)  #shape: (n_samples, n_classes)
top3_preds = np.argsort(probs, axis=1)[:, -3:]  #shape: (n_samples, 3)
top3_correct = sum(y_test.iloc[i] in lr.classes_[top3_preds[i]] for i in range(len(y_test)))
top3_accuracy = top3_correct / len(y_test)
print(f"Top-3 accuracy: {top3_accuracy:.2%}")

Top-3 accuracy: 85.90%


Regresja logistyczna wyszła lepiej w obu dokładnościach (top1, top3). Oznacza to, że nie zawsze warto korzystać z bardziej skomplikowanego modelu.

SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# SVM
svm = SVC(kernel='linear', C=100, random_state=42, class_weight='balanced', probability=True)
svm.fit(X_train_std, y_train)            #dopasowanie modelu do danych

#dokładność na zbiorze testowym
accuracy = svm.score(X_test_std, y_test)
print(f"Test accuracy: {accuracy:.2%}")

Test accuracy: 45.42%


In [23]:
#przewidywane prawdopodobieństwa
probs = svm.predict_proba(X_test_std)# shape: (n_samples, n_classes)

#top-3 predykcje
top3_preds = np.argsort(probs, axis=1)[:, -3:]  #ostatnie 3 klasy z największym prawdopodobieństwem

y_test_np = y_test.to_numpy()

#zamiana y_test na indeksy w svm.classes_
y_test_idx = np.array([np.where(svm.classes_ == y)[0][0] for y in y_test_np])

# Top-3 accuracy
top3_correct = np.any(top3_preds == y_test_idx[:, None], axis=1)
top3_accuracy = top3_correct.mean() * 100

print(f"SVM Top-3 accuracy: {top3_accuracy:.2f}%")


SVM Top-3 accuracy: 89.52%


| Model | Top-1 Accuracy | Top-3 Accuracy |
|-------|----------------|----------------|
| RNN   | 32.05%         | 77.17%         |
| Logistic Regression (LR) | 47%          | 85.90%         |
| SVM   | 45.42%         | 89.52%         |


Wbrew intuicji, najbardziej skomplikowany model (RNN) nie dał najleszych wyników. Wskazuje to na to, że właściwosci fizykochemiczne dają lepsze przewidywania, niż stuktura leku w formacie smiles, LR i SVM dały porównywalne wyniki.

<h4> Zadanie2 (Jak zakodować aminokwasy? Embeddingi)
    
- Rozważ dane dotyczącece lokalizacji komórkowej wybranych białek (peptydów) - te same co w ćw2 (z regresji logistycznej). Tym razem, zbuduj model w oparciu o sieci rekurencyjne i zaproponowaną przez siebie reprezentację aminokwasów - tzw. embeddingi. Potestuj różne topologie sieci oraz reprezentacje dla danych. Skomentuj otrzymane wyniki

Dane

In [3]:
import pandas as pd
import numpy as np
df = pd.read_csv('swissprot_annotated_proteins.tab', sep = "\t", header = None)

from Bio import SeqIO

plik_fasta = "targetp.fasta"

ids, sqs = [], []

for rekord in SeqIO.parse(plik_fasta, "fasta"):
    ids.append(str(rekord.id))
    sqs.append(str(rekord.seq))

labs_df = pd.DataFrame(columns=['location', 'sequence'])

for id_val, seq_val in zip(ids, sqs):
    #filtracja df po id
    filtered = df[df[0] == id_val]
    if not filtered.empty: 
        loc = filtered[1].values[0]
        if loc != 'Other':
            # +nowy wiersz do df
            labs_df = pd.concat([labs_df, pd.DataFrame({'location': [loc], 'sequence': [seq_val]})], ignore_index=True)

#liczba sekwencji dla każdej lokalizacji
print(labs_df.groupby('location')['sequence'].count())

X = list(labs_df['sequence'])
y = list(labs_df['location'])

location
CH     227
MT     499
SP    2697
TH      45
Name: sequence, dtype: int64


Podstawowy model RNN - one hot encoding 


In [6]:
all_letters = sorted(list(set("".join(X))))
char_to_idx = {ch: i for i, ch in enumerate(all_letters)} #enumerate(all_letters) daje pary (index, char)
n_letters = len(all_letters)

def text_to_onehot(text):
    seq = torch.zeros(len(text), n_letters) #(rows, cols)
    for i, ch in enumerate(text):
        seq[i, char_to_idx[ch]] = 1.0
    return seq

class ListDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        return text_to_onehot(self.data[idx]), torch.tensor(self.targets[idx], dtype=torch.long)

#do obslugi sekwencji o roznej dlugosci
def collate_fn(batch):
    # batch = [(seq1, label1), (seq2, label2), ...]
    sequences, labels = zip(*batch) # rozpakowujemy na dwie listy
    lengths = torch.tensor([len(seq) for seq in sequences]) #wymiar bez paddingu
    padded = pad_sequence(sequences, batch_first=True)  #wymiar: (batch, max_len, vocab_size)
    labels = torch.stack(labels)
    return padded, labels, lengths

indices = np.arange(len(X)) #dzielimy na podstawie indeksów zeby przyśpieszyć podział i stratyfikować
train_ind, test_ind = train_test_split(indices, test_size=0.3, random_state=42, stratify=y) #stratify=y - podział z zachowaniem proporcji klas w y

#zmiana etykiet na numery bo Pytorch wymaga etykiet numerycznych
uni_classes = sorted(list(set(y)))
class_to_idx = {c: i for i, c in enumerate(uni_classes)}

y_idx = [class_to_idx[c] for c in y]

train_targets = [y_idx[i] for i in train_ind]
test_targets = [y_idx[i] for i in test_ind]


uni, counts = np.unique(train_targets, return_counts=True)
weight_per_class = {u: len(train_targets)/c for u, c in zip(uni, counts)}
weights = [weight_per_class[c] for c in train_targets]
weights = torch.DoubleTensor(weights)  # wymagane przez WeightedRandomSampler, bez tego miałam błąd
sampler = WeightedRandomSampler(weights=weights, num_samples=len(weights), replacement=True)

train_dataset = ListDataset([X[i] for i in train_ind], train_targets)
test_dataset = ListDataset([X[i] for i in test_ind], test_targets)

train_loader = DataLoader(train_dataset, batch_size=4, sampler=sampler, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, hidden = self.rnn(packed)
        out = self.fc(hidden.squeeze(0))
        return out
    
#budujemy model
num_classes = len(set(y))
model = RNNClassifier(input_size=n_letters, hidden_size=16, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

#Trening
for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in loader:
            outputs = model(X_batch, lengths)
            _, pred = torch.max(outputs, dim=1)  # wybiera klasę z najwyższym prawdopodobieństwem
            correct += (pred == y_batch).sum().item()
            total += y_batch.size(0)

    acc = 100 * correct / total
    print(f"Accuracy: {acc:.2f}%")

evaluate(model, test_loader)

Epoch 1, Loss: 1.1721
Epoch 2, Loss: 1.0086
Epoch 3, Loss: 0.9217
Epoch 4, Loss: 0.9750
Epoch 5, Loss: 0.8767
Accuracy: 38.90%


2. Model: class weight zamiast sampler


In [7]:
all_letters = sorted(list(set("".join(X))))
char_to_idx = {ch: i for i, ch in enumerate(all_letters)}
n_letters = len(all_letters)

def text_to_onehot(text):
    seq = torch.zeros(len(text), n_letters)
    for i, ch in enumerate(text):
        seq[i, char_to_idx[ch]] = 1.0
    return seq

class ListDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        return text_to_onehot(self.data[idx]), torch.tensor(self.targets[idx], dtype=torch.long)

def collate_fn(batch):
    sequences, labels = zip(*batch)
    lengths = torch.tensor([len(seq) for seq in sequences])
    padded = pad_sequence(sequences, batch_first=True)
    labels = torch.stack(labels)
    return padded, labels, lengths

indices = np.arange(len(X))
train_ind, test_ind = train_test_split(indices, test_size=0.3, random_state=42, stratify=y)

uni_classes = sorted(list(set(y)))
class_to_idx = {c: i for i, c in enumerate(uni_classes)}
y_idx = [class_to_idx[c] for c in y]

train_targets = [y_idx[i] for i in train_ind]
test_targets = [y_idx[i] for i in test_ind]

# class weights zamiast sampler
class_counts = np.bincount(train_targets)
weights = 1.0 / class_counts
weights = torch.tensor(weights, dtype=torch.float)

train_dataset = ListDataset([X[i] for i in train_ind], train_targets)
test_dataset = ListDataset([X[i] for i in test_ind], test_targets)

# sampler usunięty — zwykłe shuffle=True
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, hidden = self.rnn(packed)
        out = self.fc(hidden.squeeze(0))
        return out

num_classes = len(set(y))
model = RNNClassifier(input_size=n_letters, hidden_size=16, num_classes=num_classes)

#dodanie class weights
criterion = nn.CrossEntropyLoss(weight=weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in loader:
            outputs = model(X_batch, lengths)
            _, pred = torch.max(outputs, dim=1)
            correct += (pred == y_batch).sum().item()
            total += y_batch.size(0)
    acc = 100 * correct / total
    print(f"Accuracy: {acc:.2f}%")

evaluate(model, test_loader)

Epoch 1, Loss: 1.1752
Epoch 2, Loss: 1.2070
Epoch 3, Loss: 1.1564
Epoch 4, Loss: 1.1369
Epoch 5, Loss: 1.1295
Accuracy: 54.56%


LSTM

In [8]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=1, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        packed = pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, (hidden, cell) = self.lstm(packed)
        out = self.dropout(hidden[-1])  # ostatnia warstwa, ostatni stan
        out = self.fc(out)
        return out

#Budujemy model
num_classes = len(set(y))
model = LSTMClassifier(input_size=n_letters, hidden_size=16, num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

#Trenujemy model
for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch, lengths in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, lengths)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1:02d} | Loss: {total_loss / len(train_loader):.4f}")

#Ocena modelu
def evaluate(model, loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_batch, y_batch, lengths in loader:
            outputs = model(X_batch, lengths)
            _, pred = torch.max(outputs, dim=1)  # wybiera klasę z najwyższym prawdopodobieństwem
            correct += (pred == y_batch).sum().item()
            total += y_batch.size(0)

    acc = 100 * correct / total
    print(f"Accuracy: {acc:.2f}%")

evaluate(model, test_loader)

Epoch 01 | Loss: 0.7415
Epoch 02 | Loss: 0.6946
Epoch 03 | Loss: 0.6448
Epoch 04 | Loss: 0.6048
Epoch 05 | Loss: 0.5810
Accuracy: 77.71%
