In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [5]:
df = pd.read_csv('/content/normal_tissue.tsv', sep='\t')
df = df.head(2000)

In [6]:
df.head()

Unnamed: 0,Gene,Gene name,Tissue,Cell type,Level,Reliability
0,ENSG00000000003,TSPAN6,adipose tissue,adipocytes,Not detected,Approved
1,ENSG00000000003,TSPAN6,adrenal gland,glandular cells,Not detected,Approved
2,ENSG00000000003,TSPAN6,appendix,glandular cells,Medium,Approved
3,ENSG00000000003,TSPAN6,appendix,lymphoid tissue,Not detected,Approved
4,ENSG00000000003,TSPAN6,bone marrow,hematopoietic cells,Not detected,Approved


In [7]:

df.isnull().sum()

Unnamed: 0,0
Gene,0
Gene name,0
Tissue,1
Cell type,1
Level,1
Reliability,0


In [8]:
for column in df.columns:
    # Calculate the mode for the column
    mode_value = df[column].mode()
    # Fill NaN values with the mode (taking the first mode if multiple)
    if not mode_value.empty:
        df[column].fillna(mode_value[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value[0], inplace=True)


In [9]:
df.isnull().sum()

Unnamed: 0,0
Gene,0
Gene name,0
Tissue,0
Cell type,0
Level,0
Reliability,0


In [10]:
df['Reliability'].replace({'Approved': 0, 'Enhanced': 1, 'Uncertain': 2, 'Supported': 3}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Reliability'].replace({'Approved': 0, 'Enhanced': 1, 'Uncertain': 2, 'Supported': 3}, inplace=True)
  df['Reliability'].replace({'Approved': 0, 'Enhanced': 1, 'Uncertain': 2, 'Supported': 3}, inplace=True)


In [11]:

X = df.drop('Reliability', axis=1)
y = df['Reliability']

In [12]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(1600, 5)
(400, 5)
(1600,)
(400,)


In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
X_train['text'] = X_train['Tissue'] + ' ' + X_train['Cell type'] + ' ' + X_train['Gene'] + ' ' + X_train['Level']
X_test['text'] = X_test['Tissue'] + ' ' + X_test['Cell type'] + ' ' + X_test['Gene'] + ' ' + X_test['Level']


In [16]:
print(X_train['text'].head())
print('\n')
print(X_test['text'].head())

968    cerebral cortex neuropil ENSG00000001561 Not d...
240     adipose tissue adipocytes ENSG00000000460 Medium
819            skin 1 keratinocytes ENSG00000001461 High
692    cerebral cortex endothelial cells ENSG00000001...
420    cerebral cortex neuronal cells ENSG00000000971...
Name: text, dtype: object


1860       hippocampus glial cells ENSG00000002587 Medium
353     kidney cells in glomeruli ENSG00000000938 Not ...
1333    nasopharynx respiratory epithelial cells ENSG0...
905     lymph node germinal center cells ENSG000000014...
1289    vagina squamous epithelial cells ENSG000000016...
Name: text, dtype: object


In [17]:
train_encodings = tokenizer(X_train['text'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test['text'].tolist(), truncation=True, padding=True, max_length=128)

In [18]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [19]:
Y_train.head()

Unnamed: 0,Reliability
968,0
240,2
819,0
692,1
420,3


In [20]:
Y_test.head()

Unnamed: 0,Reliability
1860,0
353,1
1333,0
905,3
1289,2


In [21]:
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'],
                                    'attention_mask': train_encodings['attention_mask'],
                                    'labels': Y_train})

test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'],
                                   'attention_mask': test_encodings['attention_mask'],
                                   'labels': Y_test})

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [23]:

trainer.train()

# Evaluate the model
trainer.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,0.882034
2,No log,0.05035
3,No log,0.016277


{'eval_loss': 0.01627664640545845,
 'eval_runtime': 0.6632,
 'eval_samples_per_second': 603.154,
 'eval_steps_per_second': 37.697,
 'epoch': 3.0}

Now Freezing the layers in the model

In [24]:
for param in model.bert.embeddings.parameters():
    param.requires_grad = False
for i in range(6):  # Freezing the first 6 layers (you can change this number)
    for param in model.bert.encoder.layer[i].parameters():
        param.requires_grad = False

In [25]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluate each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',            # directory for logs
    logging_steps=10,
    save_strategy="epoch",
)



In [27]:
trainer2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [28]:
trainer2.train()

# Evaluate the model
results = trainer2.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss
1,0.0476,0.00157
2,0.0078,0.000856
3,0.001,0.000707


Evaluation results: {'eval_loss': 0.0007073960732668638, 'eval_runtime': 0.7961, 'eval_samples_per_second': 502.47, 'eval_steps_per_second': 31.404, 'epoch': 3.0}


In [31]:
pip install peft

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2


In [32]:
from peft import LoraConfig, get_peft_model

Using LoRa Technique

In [34]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['query', 'key', 'value'],
    lora_dropout=0.1,
    bias="none"
)


In [35]:
model3 = get_peft_model(model, lora_config)

In [37]:
from transformers import AdamW

In [38]:
optimizer = AdamW(model.parameters(), lr=2e-5)



In [39]:
trainer3 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optimizer, None),        # Custom optimizer
)

In [40]:
trainer3.train()

# Evaluate the model
results = trainer3.evaluate()
print("Evaluation results:", results)

Epoch,Training Loss,Validation Loss
1,0.0419,0.000705
2,0.0011,0.000707
3,0.001,0.000707


Evaluation results: {'eval_loss': 0.0007074868772178888, 'eval_runtime': 0.837, 'eval_samples_per_second': 477.902, 'eval_steps_per_second': 29.869, 'epoch': 3.0}


In [41]:
predictions = trainer.predict(test_dataset)
y_predicted = np.argmax(predictions.predictions, axis=1)


In [42]:
print("Predicted classes:", y_predicted)


Predicted classes: [0 1 0 3 2 2 3 3 0 0 0 0 1 0 1 2 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 3 0 3 0 3 1
 0 0 2 0 0 2 0 1 0 3 3 0 1 0 2 3 0 3 3 2 3 1 1 3 0 1 0 0 0 0 0 0 0 3 1 3 0
 1 1 1 0 1 3 0 3 0 0 3 3 3 3 2 1 0 3 0 2 0 0 0 2 0 0 3 0 3 0 0 3 0 2 0 0 0
 3 0 3 1 1 0 0 1 1 3 0 0 3 1 0 3 3 1 1 0 0 1 0 0 2 0 0 3 0 0 2 1 3 1 0 0 2
 0 0 3 3 0 1 2 0 0 3 0 1 0 0 0 1 0 3 3 2 0 3 1 1 0 0 3 0 0 0 3 1 2 0 3 1 0
 1 2 1 1 2 0 2 3 0 0 3 0 3 1 0 0 0 0 1 3 0 3 0 0 0 3 0 2 0 1 3 1 1 3 2 3 0
 0 1 3 0 3 0 1 1 0 0 3 0 3 1 0 3 0 0 1 0 0 3 3 0 0 0 0 0 0 2 0 3 3 0 0 0 0
 2 0 0 0 0 0 1 1 0 0 0 0 0 3 1 0 0 0 3 0 0 0 3 0 3 0 0 2 3 0 0 2 1 3 1 0 2
 0 3 0 3 0 0 0 3 0 0 1 0 3 0 0 1 2 0 0 3 0 1 3 1 0 3 0 2 0 0 0 0 1 1 3 0 0
 0 1 3 1 0 1 3 3 2 0 0 0 1 1 0 0 0 0 3 3 1 0 0 3 3 3 1 3 1 0 0 0 3 3 1 1 2
 0 3 0 3 1 1 3 3 0 3 3 3 3 3 3 0 0 3 0 0 3 1 3 1 0 0 2 3 2 0]


In [45]:
predictions2 = trainer2.predict(test_dataset)
y_predicted2 = np.argmax(predictions2.predictions, axis=1)


In [46]:
print("Predicted classes:", y_predicted2)


Predicted classes: [0 1 0 3 2 2 3 3 0 0 0 0 1 0 1 2 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 3 0 3 0 3 1
 0 0 2 0 0 2 0 1 0 3 3 0 1 0 2 3 0 3 3 2 3 1 1 3 0 1 0 0 0 0 0 0 0 3 1 3 0
 1 1 1 0 1 3 0 3 0 0 3 3 3 3 2 1 0 3 0 2 0 0 0 2 0 0 3 0 3 0 0 3 0 2 0 0 0
 3 0 3 1 1 0 0 1 1 3 0 0 3 1 0 3 3 1 1 0 0 1 0 0 2 0 0 3 0 0 2 1 3 1 0 0 2
 0 0 3 3 0 1 2 0 0 3 0 1 0 0 0 1 0 3 3 2 0 3 1 1 0 0 3 0 0 0 3 1 2 0 3 1 0
 1 2 1 1 2 0 2 3 0 0 3 0 3 1 0 0 0 0 1 3 0 3 0 0 0 3 0 2 0 1 3 1 1 3 2 3 0
 0 1 3 0 3 0 1 1 0 0 3 0 3 1 0 3 0 0 1 0 0 3 3 0 0 0 0 0 0 2 0 3 3 0 0 0 0
 2 0 0 0 0 0 1 1 0 0 0 0 0 3 1 0 0 0 3 0 0 0 3 0 3 0 0 2 3 0 0 2 1 3 1 0 2
 0 3 0 3 0 0 0 3 0 0 1 0 3 0 0 1 2 0 0 3 0 1 3 1 0 3 0 2 0 0 0 0 1 1 3 0 0
 0 1 3 1 0 1 3 3 2 0 0 0 1 1 0 0 0 0 3 3 1 0 0 3 3 3 1 3 1 0 0 0 3 3 1 1 2
 0 3 0 3 1 1 3 3 0 3 3 3 3 3 3 0 0 3 0 0 3 1 3 1 0 0 2 3 2 0]


In [47]:
predictions3 = trainer3.predict(test_dataset)
y_predicted3 = np.argmax(predictions3.predictions, axis=1)


In [48]:
print("Predicted classes:", y_predicted3)


Predicted classes: [0 1 0 3 2 2 3 3 0 0 0 0 1 0 1 2 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 3 0 3 0 3 1
 0 0 2 0 0 2 0 1 0 3 3 0 1 0 2 3 0 3 3 2 3 1 1 3 0 1 0 0 0 0 0 0 0 3 1 3 0
 1 1 1 0 1 3 0 3 0 0 3 3 3 3 2 1 0 3 0 2 0 0 0 2 0 0 3 0 3 0 0 3 0 2 0 0 0
 3 0 3 1 1 0 0 1 1 3 0 0 3 1 0 3 3 1 1 0 0 1 0 0 2 0 0 3 0 0 2 1 3 1 0 0 2
 0 0 3 3 0 1 2 0 0 3 0 1 0 0 0 1 0 3 3 2 0 3 1 1 0 0 3 0 0 0 3 1 2 0 3 1 0
 1 2 1 1 2 0 2 3 0 0 3 0 3 1 0 0 0 0 1 3 0 3 0 0 0 3 0 2 0 1 3 1 1 3 2 3 0
 0 1 3 0 3 0 1 1 0 0 3 0 3 1 0 3 0 0 1 0 0 3 3 0 0 0 0 0 0 2 0 3 3 0 0 0 0
 2 0 0 0 0 0 1 1 0 0 0 0 0 3 1 0 0 0 3 0 0 0 3 0 3 0 0 2 3 0 0 2 1 3 1 0 2
 0 3 0 3 0 0 0 3 0 0 1 0 3 0 0 1 2 0 0 3 0 1 3 1 0 3 0 2 0 0 0 0 1 1 3 0 0
 0 1 3 1 0 1 3 3 2 0 0 0 1 1 0 0 0 0 3 3 1 0 0 3 3 3 1 3 1 0 0 0 3 3 1 1 2
 0 3 0 3 1 1 3 3 0 3 3 3 3 3 3 0 0 3 0 0 3 1 3 1 0 0 2 3 2 0]


In [49]:

input_text = ['kidney cells in glomeruli ENSG00000005175 High']
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
dataset = Dataset.from_dict(inputs)
predictions = trainer.predict(dataset)

In [50]:
y_predicted_value = np.argmax(predictions.predictions, axis=1)


In [53]:
reverse_mapping = {0: 'Approved', 1: 'Enhanced', 2: 'Uncertain', 3: 'Supported'}

predicted_label = reverse_mapping[y_predicted_value[0]]  # Access the first element if it's an array
print(predicted_label)

Approved


In [54]:

input_text = ['kidney cells in glomeruli ENSG00000005175 High']
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
dataset = Dataset.from_dict(inputs)
predictions2 = trainer2.predict(dataset)

In [55]:
y_predicted_value2 = np.argmax(predictions2.predictions, axis=1)


In [56]:
reverse_mapping = {0: 'Approved', 1: 'Enhanced', 2: 'Uncertain', 3: 'Supported'}

predicted_label2 = reverse_mapping[y_predicted_value2[0]]  # Access the first element if it's an array
print(predicted_label2)

Approved


In [57]:

input_text = ['kidney cells in glomeruli ENSG00000005175 High']
inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")
dataset = Dataset.from_dict(inputs)
predictions3 = trainer3.predict(dataset)

In [58]:
y_predicted_value3 = np.argmax(predictions3.predictions, axis=1)


In [59]:
reverse_mapping = {0: 'Approved', 1: 'Enhanced', 2: 'Uncertain', 3: 'Supported'}

predicted_label3 = reverse_mapping[y_predicted_value3[0]]  # Access the first element if it's an array
print(predicted_label3)

Approved
