# Protein Classifier

## Importing required packages

In [1]:
from Bio import SeqIO
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Configuration of temperature labels and input files

In [2]:
# Class labels: one temperature value per proteome file, listed in the same
# order as `files` (presumably growth temperatures in degrees Celsius -- confirm).
temperature_labels_full = [
    37,
    80,
]

# Input proteome FASTA files; files[i] is labelled with temperature_labels_full[i].
files = [
    'data/proteomes/UP000000625_83333.fasta',
    'data/proteomes/UP000001974_273057.fasta',
]

## Splitting the dataset for training, validation and testing

The proportions for splitting were chosen to be 70%, 15% and 15% for training, validation and testing respectively.

In [14]:
# Split parameters, shared by every proteome so the 70/15/15 split is reproducible.
HOLDOUT_FRACTION = 0.3  # share of each proteome held out of training
TEST_FRACTION = 0.5     # share of the held-out part that becomes the test set
SPLIT_SEED = 1          # random_state for every split and shuffle below

# Each input file is paired with exactly one temperature label by position;
# fail loudly instead of silently ignoring a file or hitting an IndexError.
if len(files) != len(temperature_labels_full):
    raise ValueError(
        'files and temperature_labels_full must have the same length, got '
        + str(len(files)) + ' and ' + str(len(temperature_labels_full))
    )

# Initialisation of the dataset container.
# For every split: 'X' holds the SeqRecord objects, 'Y' the matching labels,
# and the *_prefix entries are the output path prefixes used when writing files.
data = {
    'train': {
        'X': [],
        'Y': [],
        'FASTA_prefix': 'data/FASTA/training_',
        'CSV_prefix': 'data/CSV/training_',
    },
    'validate': {
        'X': [],
        'Y': [],
        'FASTA_prefix': 'data/FASTA/validation_',
        'CSV_prefix': 'data/CSV/validation_',
    },
    'test': {
        'X': [],
        'Y': [],
        'FASTA_prefix': 'data/FASTA/testing_',
        'CSV_prefix': 'data/CSV/testing_',
    },
}

# Sequences already accepted, used to drop duplicates within and across files.
# Plain strings are stored (not Seq objects) so that membership never depends
# on how the installed Biopython version hashes or compares Seq instances.
seen = set()

for file_name, temperature in zip(files, temperature_labels_full):
    records = []
    # Parsing sequences (X dataset) from one proteome, keeping only the first
    # occurrence of every distinct sequence
    for record in SeqIO.parse(file_name, "fasta"):
        sequence = str(record.seq)
        if sequence not in seen:
            seen.add(sequence)
            records.append(record)

    # Creating Y dataset: every record of this proteome gets the same label
    temperature_labels = [temperature] * len(records)

    # Splitting the dataset into 70% (training) and 30% (held out)
    X_train, X_try, Y_train, Y_try = train_test_split(
        records, temperature_labels,
        test_size=HOLDOUT_FRACTION, shuffle=True, random_state=SPLIT_SEED,
    )

    # Splitting the held-out 30% in half for validation and testing
    X_validate, X_test, Y_validate, Y_test = train_test_split(
        X_try, Y_try,
        test_size=TEST_FRACTION, shuffle=True, random_state=SPLIT_SEED,
    )

    data['train']['X'].extend(X_train)
    data['train']['Y'].extend(Y_train)
    data['validate']['X'].extend(X_validate)
    data['validate']['Y'].extend(Y_validate)
    data['test']['X'].extend(X_test)
    data['test']['Y'].extend(Y_test)

# Shuffling each split so the proteomes are interleaved; X and Y are shuffled
# together, which keeps every record aligned with its label.
for split in data.values():
    split['X'], split['Y'] = shuffle(split['X'], split['Y'], random_state=SPLIT_SEED)



## Checking the success of parsing

In [15]:
# Number of unique sequences kept after de-duplication ...
print(len(seen))

# ... followed by the number of records in each split, in the order
# train, validate, test.
for split_name in ('train', 'validate', 'test'):
    split_records = data[split_name]['X']
    print(len(split_records))

7288
5101
1093
1094


## Creating files 

`[training|validation|testing]_sequences.fasta`

In [16]:
# Creating data/FASTA/*_sequences.fasta files
#
# One FASTA file is written per split. Each record becomes a header line
# ('>' + identifier) followed by the sequence on a single line.

for element in data.keys():
    file_name = data[element]['FASTA_prefix'] + 'sequences.fasta'
    # `with` closes the file even if a record fails part-way through
    # (e.g. a name without a '|' separator raises IndexError below).
    with open(file_name, 'w') as file_handle:
        for record in data[element]['X']:
            # The identifier is the second '|'-separated field of the record name
            identifier = record.name.split('|')[1]
            file_handle.write('>' + identifier + "\n")
            file_handle.write(str(record.seq) + "\n")

  `[training|validation|testing]_temperature_annotations.csv`

In [17]:
# Creating data/CSV/*_temperature_annotations.csv files
#
# One CSV file is written per split, with an 'identifier,label' header and one
# row per record. Identifiers are derived exactly as in the FASTA files, so
# the two outputs can be joined on that column.

for element in data.keys():
    file_name = data[element]['CSV_prefix'] + 'temperature_annotations.csv'
    # `with` closes the file even if writing a row raises.
    with open(file_name, 'w') as file_handle:
        file_handle.write('identifier,label' + "\n")
        # X and Y are index-aligned, so zip pairs every record with its label
        for record, label in zip(data[element]['X'], data[element]['Y']):
            identifier = record.name.split('|')[1]
            file_handle.write(identifier + ',' + str(label) + "\n")

## Installing the `bio_embeddings` tool

In [16]:
!pip install bio_embeddings[all]

Found existing installation: bio-embeddings 0.2.2
Uninstalling bio-embeddings-0.2.2:
  Would remove:
    /Users/ieva/.local/bin/bio_embeddings
    /Users/ieva/.local/lib/python3.7/site-packages/LICENSE.md
    /Users/ieva/.local/lib/python3.7/site-packages/README.md
    /Users/ieva/.local/lib/python3.7/site-packages/bio_embeddings-0.2.2.dist-info/*
    /Users/ieva/.local/lib/python3.7/site-packages/bio_embeddings/*
    /Users/ieva/.local/lib/python3.7/site-packages/pyproject.toml
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m


In [19]:
# Run the bio_embeddings command-line pipeline using the settings in config.yml.
# NOTE(review): according to the recorded log, the run writes its results under
# the `temperature_sampled` prefix directory and downloads the
# prottrans_bert_bfd model when it is not yet available locally, so the first
# run can take a long time -- confirm config.yml still matches this.
!bio_embeddings config.yml

2021-10-29 21:23:33,107 INFO Created the prefix directory temperature_sampled
2021-10-29 21:23:33,107 INFO Created the file temperature_sampled/input_parameters_file.yml
2021-10-29 21:23:33,644 INFO Created the file temperature_sampled/sequences_file.fasta
2021-10-29 21:23:33,723 INFO Created the file temperature_sampled/mapping_file.csv
2021-10-29 21:23:33,723 INFO Created the file temperature_sampled/remapped_sequences_file.fasta
2021-10-29 21:23:33,778 INFO Created the stage directory temperature_sampled/protbert_embeddings
2021-10-29 21:23:33,779 INFO Created the file temperature_sampled/protbert_embeddings/input_parameters_file.yml
2021-10-29 21:23:33,783 INFO Downloading model_folder_zip for prottrans_bert_bfd and storing in '/var/folders/yv/yvlj08j53rx4_7w0gg8zlbcc0000gp/T/tmppgw23qfi'.
^C
