In [3]:
import torch
from torch import nn
from torch import optim

from sklearn.model_selection import train_test_split

#import tqdm
import numpy as np
import pandas as pd

In [4]:
RANDOM_SEED = 42
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()

    acc = (correct/len(y_pred)) * 100
    return acc

In [60]:
df = pd.read_csv("archive/2018-06-06-ss.cleaned.csv")

df = df[df["seq"].str.contains("[*]") == False]

df = df.loc[df["seq"].str.len() < 101]

In [61]:
df

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa
0,1A30,C,EDL,CBC,CEC,3,False
1,1B05,B,KCK,CBC,CEC,3,False
2,1B0H,B,KAK,CBC,CEC,3,False
3,1B1H,B,KFK,CBC,CEC,3,False
4,1B2H,B,KAK,CBC,CEC,3,False
...,...,...,...,...,...,...,...
61919,6EM4,i,MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCSCCCCCHHHHHHHHHHHH...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...,100,False
61920,6EM5,i,MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCTTCCCHHHHHHHHHHHH...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...,100,False
61921,6F7J,A,RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...,CCCCHHHHHHHHHHHHHHHHTSSCHHHHHHHCHHHHHHHHHHHHHH...,CCCCHHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...,100,False
61922,6F8D,A,RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...,CCCCCHHHHHHHHHHHHHHHTSSCHHHHHHHCHHHHHHHHHHHHHH...,CCCCCHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...,100,False


In [18]:
class ProteinStructure(nn.Module):
    def __init__(self, input_dim, hidden_units):
        super().__init__()
        
        self.layer = nn.Sequential(
            nn.Linear(in_features=input_dim, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=2*hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=2*hidden_units, out_features=hidden_units),
            nn.ReLU(),
            nn.Linear(in_features=hidden_units, out_features=3)      
        )
        
    def forward(self, x):
        return self.layer(x)
    
torch.manual_seed(RANDOM_SEED)
model = ProteinStructure(input_dim=2, hidden_units=64).to(device)

In [62]:
df["seq"]

0                                                      EDL
1                                                      KCK
2                                                      KAK
3                                                      KFK
4                                                      KAK
                               ...                        
61919    MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...
61920    MAVKTGIAIGLNKGKKVTQMTPAPKISYKKGAASNRTKFVRSLVRE...
61921    RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...
61922    RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...
61923    RYNDYKLDFRRQQMQDFFLAHKDEEWFRSKYHPDEVGKRRQEARGA...
Name: seq, Length: 57067, dtype: object

In [64]:
df["sst3"]

0                                                      CEC
1                                                      CEC
2                                                      CEC
3                                                      CEC
4                                                      CEC
                               ...                        
61919    CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...
61920    CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCHHHHHHHHHHHH...
61921    CCCCHHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...
61922    CCCCCHHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...
61923    CCCCCCHHHHHHHHHHHHHHCCCCHHHHHHHCHHHHHHHHHHHHHH...
Name: sst3, Length: 57067, dtype: object

In [65]:
labels = ["C", "E", "H"]

In [66]:
import collections

def most_frequent_char(string):
    char_frequency = {}
    for char in string:
        if char in char_frequency:
            char_frequency[char] += 1
        else:
            char_frequency[char] = 1

    max_char = None
    max_frequency = 0
    for char, frequency in char_frequency.items():
        if frequency > max_frequency:
            max_frequency = frequency
            max_char = char

    return max_char

def value(x):
    return most_frequent_char(x[1])


In [67]:
def numberize(x):
    labels = np.zeros(shape=100)
    #labels = [0] * 5037
    
    for index, char in enumerate(x):
        labels[index] = ord(char)
    
    return labels
    

In [71]:
#df["label"] = labels.index(value(df["sst3"]))

df["label"] = df.apply(lambda row: labels.index(value(row["sst3"])), axis=1)

x = np.array([np.zeros(shape=100)])

#for row in df["seq"]:#
#    x_blæ = np.append(x_blæ, [numberize(row)], axis=0)

df["x"] = df.apply(lambda row: numberize(row["seq"]), axis=1)

for row in df["x"]:
    x = np.append(x, [row], axis=0)

df.groupby("label").count()

Unnamed: 0_level_0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa,x
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,47023,47023,47023,47023,47023,47023,47023,47023
1,6654,6654,6654,6654,6654,6654,6654,6654
2,3390,3390,3390,3390,3390,3390,3390,3390


In [48]:
df.head()

Unnamed: 0,pdb_id,chain_code,seq,sst8,sst3,len,has_nonstd_aa,label,x
0,1A30,C,EDL,CBC,CEC,3,False,1,"[69.0, 68.0, 76.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
1,1B05,B,KCK,CBC,CEC,3,False,1,"[75.0, 67.0, 75.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
2,1B0H,B,KAK,CBC,CEC,3,False,1,"[75.0, 65.0, 75.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
3,1B1H,B,KFK,CBC,CEC,3,False,1,"[75.0, 70.0, 75.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."
4,1B2H,B,KAK,CBC,CEC,3,False,1,"[75.0, 65.0, 75.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...."


In [85]:
arr = np.array([[1,2], [2,3]])
print(arr)
arr = np.append(arr, [[3,4]], axis=0)
arr

[[1 2]
 [2 3]]


array([[1, 2],
       [2, 3],
       [3, 4]])

In [70]:
x

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]])

In [69]:
X = torch.from_numpy(x)
y = torch.tensor(df["label"].values)

print(X.shape, y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

torch.Size([1, 100]) torch.Size([57067])


ValueError: Found input variables with inconsistent numbers of samples: [1, 57067]

### Training loop

In [25]:
epochs = 10
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)


for epoch in range(epochs):
    model.train()

    y_logits = model(X_train)
    y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1)

    loss = loss_fn(y_logits, y_train)
    acc = accuracy_fn(y_true=y_train, y_pred=y_pred)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.inference_mode():
        y_test_logits = model(X_test)
        y_test_pred = torch.softmax(y_test_logits, dim=1).argmax(dim=1)

        test_loss = loss_fn(y_test_logits, y_blob_test)
        test_acc = accuracy_fn(y_true=y_test, y_pred=y_test_pred)

    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Acc: {acc:.2f}% | Test Loss: {test_loss:.5f} | Test Acc: {test_acc:.2f}%")



NameError: name 'X_train' is not defined

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# Load the dataset and preprocess the data
data = pd.read_csv("protein_data.csv")
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Encode the target variable
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = to_categorical(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the model architecture
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32)

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)