# Plant resistance gene prediction - GPT embeddings
---

## Libraries

In [20]:
# Data manipulation
import pandas as pd

# Request for GPT embeddings
from openai import OpenAI

## Data loading

In [21]:
# Read the training table (columns shown in the preview below: ID, Sequence, Label).
molecules = pd.read_csv(filepath_or_buffer='Data/train.csv')

In [22]:
# Show six randomly chosen rows as a quick sanity check of the loaded table.
molecules.sample(n=6)

Unnamed: 0,ID,Sequence,Label
309,433,MRFKQPSRRDRPVMFKRSKNVSVDVGVDSISDLPDAVLQHIFSYIP...,0
10402,13931,MTGKRSKTNCRSASHKLFKDKAKNRVDDLQGMLLDLQFARKESRPT...,0
12573,16794,METLVSSIFWTLAPWKNMLLLKHGRIEILDQNTMYGWYELPKQEFL...,0
2461,3297,MEHKETGCQQPEGPILCINNCGFFGSAATMNMCSKCHKEMIMKQEQ...,0
14081,18789,MDQNGHNDEAETVSCGNGNCKSKIVPGDDHGGDESSGTKRRKKRKT...,0
11775,15751,MGLCWGSPSDSPPTTTPSSTGNISSVGTFKSSNNTTTTGTSRGSNI...,1


In [23]:
# Full text of the first row's sequence (row 0, column 1); `.iat` is the scalar accessor.
molecules.iat[0, 1]

'MDSRMDQYEVMEQIGRGAFGAAILVNHKTEKKKYVLKKIRLARQTERCRKSAHQEMALIARLQHPYIVEFKEAWVEKGCYVCIVTGYCEGGDMAELMKKANGTYFPEEKLLKWFAQLALAVDYLHSNFVLHRDLKCSNIFLTKDQDIRLGDFGLAKTLKADDLTSSVVGTPNYMCPELLADIPYGFKSDIWSLGCCMYEMAAHRPAFKAFDMAGLISKINRSSIGPLPPCYSPSMKSLIKSMLRKSPEHRPTASEILKSPYLQPYVNQYRPFADISHPIHSLEKPITSSRSSQKSMSGSQCSSISGSDIDSIQSSERNTSGPSTSSNNTIDTEGAEATDHVSVKNCSRSDDVKSNKETVGPELERQDSSKSIHVDQRPRNEIKQPKIIKKILTTLREESKLRQNNSPIRASRVKLNSPSNREQLSDDSKHSSDISSSSKSSEVTSRESAKVICEPVKRAQASPPLKHLSPIVEHSPKAKIKQDEPLQPDPAKQAMEDVDAAVGKVKNRTPPSYSRRLSIPPRRPLGAESPLHADTKRAHNKVIKERAKSPCRPVHGPDNDIIEPPGFPMAPPSPLGGVQMKVGNARAKSAPPRAVSIKEDSSDCSSSTIAYAENTELSEPSKQDSSAQLVSSCKCSIPDAAIQKHDLTAMPSSELNTTNFQKSMASNDDVCENLALEPSSDISEQVSIFKDNVPCSKISQSTANAIVQNDEDKFTVQELLSSVADIAPFVSTKNFALEKGSPPIQSLERTSSPHLNPPIEDVIHVIRHSSFRVCGEQAVAENAEMGVQSSDVGKLLNVVREEVDSRSIPSNNLVPHRLPDCAAPKPNISETNTISSKTACSDVVKFLTVPEVNSTTTAINNGFKEEASPTKEILDVKSFRQRAEALEGLLELSADLLQHNRLEELAVVLKPFGKDKVSPRETAIWLAKSFKGMMNDEASRSSM'

## GPT embeddings (OpenAI API)

In [24]:
# Load the OpenAI credentials from a text file one directory above the working directory.
# The file is parsed as CSV and only its header row (`.columns`) is kept, so the values
# are presumably stored comma-separated on the first line — TODO confirm the file layout.
# NOTE(review): pandas may rename duplicate or empty header fields, which would silently
# alter a credential; consider environment variables or a secrets manager instead.
gpt_keys = pd.read_csv('../ChatGPT API Keys.txt').columns

In [25]:
# First field is used as the API key, second as the organization ID.
api_key, org_key = gpt_keys[0], gpt_keys[1]

In [26]:
# Create the API client used by the cells below, authenticating with the API key
# and organization ID extracted from `gpt_keys`.
client = OpenAI(api_key=api_key, organization=org_key)

In [27]:
# Print every model returned by the API, e.g. to check that the embedding model
# used below is listed.
available_models = client.models.list()
for model in available_models:
    print(model)

Model(id='whisper-1', created=1677532384, object='model', owned_by='openai-internal')
Model(id='tts-1', created=1681940951, object='model', owned_by='openai-internal')
Model(id='dall-e-2', created=1698798177, object='model', owned_by='system')
Model(id='tts-1-hd-1106', created=1699053533, object='model', owned_by='system')
Model(id='tts-1-hd', created=1699046015, object='model', owned_by='system')
Model(id='gpt-4-turbo-2024-04-09', created=1712601677, object='model', owned_by='system')
Model(id='gpt-4-turbo', created=1712361441, object='model', owned_by='system')
Model(id='gpt-3.5-turbo-1106', created=1698959748, object='model', owned_by='system')
Model(id='dall-e-3', created=1698785189, object='model', owned_by='system')
Model(id='gpt-4-0125-preview', created=1706037612, object='model', owned_by='system')
Model(id='gpt-4-turbo-preview', created=1706037777, object='model', owned_by='system')
Model(id='text-embedding-3-small', created=1705948997, object='model', owned_by='system')
Model

In [35]:
def get_gpt_embedding(client, text, model="text-embedding-3-small"):
    """Request an embedding for a single text and return its vector.

    Parameters
    ----------
    client
        Object exposing ``embeddings.create(input=..., model=...)``; in this
        notebook, the ``OpenAI`` client.
    text
        The text to embed. It is sent to the API as a one-element list.
    model
        Identifier of the embedding model, passed through to the request.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's ``data``.
    """
    payload = [text]
    result = client.embeddings.create(input=payload, model=model)
    first_item = result.data[0]
    return first_item.embedding

In [40]:
def process_sequences_in_batches(molecules, batch_size, client, model="text-embedding-3-small"):
    """Embed every value of ``molecules['Sequence']``, one API request per batch.

    Each batch of up to ``batch_size`` sequences is sent in a single
    ``client.embeddings.create`` call (the endpoint accepts a list of inputs),
    so the number of requests is ``ceil(len(molecules) / batch_size)`` rather
    than one per sequence.

    Parameters
    ----------
    molecules
        DataFrame with a ``'Sequence'`` column holding the texts to embed.
    batch_size
        Number of sequences sent per request; must be at least 1.
        NOTE(review): the API limits how much text one request may carry, so
        long sequences may require a small ``batch_size`` — confirm against
        the embeddings API reference.
    client
        Object exposing ``embeddings.create(input=..., model=...)``; in this
        notebook, the ``OpenAI`` client.
    model
        Identifier of the embedding model, passed through to each request.

    Returns
    -------
    list
        One embedding per processed sequence, in row order. If a request
        fails, the error is printed and the embeddings of the batches
        completed so far are returned (best effort), so the result may be
        shorter than ``molecules``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    KeyError
        If ``molecules`` has no ``'Sequence'`` column.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Resolved outside the try block so a missing column surfaces as a KeyError
    # instead of being reported as an API failure.
    sequences = molecules['Sequence'].tolist()

    embeddings = []
    try:
        for start in range(0, len(sequences), batch_size):
            batch = sequences[start:start + batch_size]
            response = client.embeddings.create(input=batch, model=model)
            # Each returned item carries the index of its input; sort on it so
            # the output order always matches the input order.
            ordered = sorted(response.data, key=lambda item: item.index)
            embeddings.extend(item.embedding for item in ordered)
    except Exception as ex:
        # Best effort: keep what has already been embedded, but say clearly
        # that the result is partial.
        print(ex.args)
        print(
            f"Stopped after embedding {len(embeddings)} of {len(sequences)} sequences; "
            "returning partial results."
        )
        return embeddings
    return embeddings

In [43]:
# First three rows of column 1 (Sequence), kept as a one-column DataFrame.
molecules.iloc[:, [1]].head(3)

Unnamed: 0,Sequence
0,MDSRMDQYEVMEQIGRGAFGAAILVNHKTEKKKYVLKKIRLARQTE...
1,MAPKAEKKPAEKKPAEEKAGEKAPAAGKKPKAEKRLPASKGEKGGE...
2,MLELHFEFIDLNQPKMYKFVVCLLTLSFLLLSGLSNTALARVHHES...


In [41]:
# Trial run on the first three sequences only, two per batch.
batch_size = 2
embeddings = process_sequences_in_batches(
    molecules=molecules.iloc[:3, [1]],
    batch_size=batch_size,
    client=client,
)

In [42]:
# One row per embedded sequence, one column per embedding dimension.
pd.DataFrame(data=embeddings)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.04215,-0.008936,0.017308,0.014519,0.009914,0.033132,0.014831,-0.011494,-0.037107,0.000292,...,-0.024293,-0.029721,-0.061163,0.005347,-0.011709,0.029009,-0.008431,0.016863,0.00647,0.014646
1,0.02572,-0.014849,0.021008,0.00883,0.004268,0.002514,0.027056,-0.014842,-0.015635,0.020377,...,-0.021977,-0.001784,-0.048916,0.005652,-0.029655,0.034441,0.011025,0.019789,-0.025852,0.016824
2,0.021083,-0.009498,0.034506,0.021068,0.008089,0.014645,0.024011,0.026144,-0.032419,0.009428,...,-0.017471,-0.012192,-0.030083,0.004212,-0.015197,-0.000833,0.007334,-0.004959,-0.007283,0.041077
