In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import log_evaluation

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Analisando o banco de dados

In [None]:
def read_fasta_file(file_path):
    """Parse a FASTA file into a DataFrame indexed by entry identifier.

    Each record starts with a header line beginning with ``>``. The entry
    identifier is the first whitespace-delimited token after the ``>``; the
    rest of the header is ignored. All following lines up to the next header
    are stripped and concatenated into the sequence. Lines that appear before
    the first header are ignored.

    The file is streamed line by line, so it is never held in memory as a
    whole.

    Parameters
    ----------
    file_path : str or path-like
        Location of the FASTA file.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by ``EntryID``, with a single ``seq``
        column. A file without any record yields an empty frame with the
        same index name and column.

    Raises
    ------
    ValueError
        If a header line contains no identifier after ``>``.
    """
    entries = []
    current_id = None      # identifier of the record being read, if any
    sequence_lines = []    # stripped sequence lines of that record

    with open(file_path, "r") as fasta_file:
        for line_number, line in enumerate(fasta_file, start=1):
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if current_id is not None:
                    entries.append({'EntryID': current_id,
                                    'seq': "".join(sequence_lines)})
                header_tokens = line[1:].split()
                if not header_tokens:
                    raise ValueError(
                        "FASTA header without identifier at line {} of {!r}".format(
                            line_number, file_path))
                current_id = header_tokens[0]
                sequence_lines = []
            elif current_id is not None:
                sequence_lines.append(line.strip())

    # The last record has no following header to close it.
    if current_id is not None:
        entries.append({'EntryID': current_id, 'seq': "".join(sequence_lines)})

    # Explicit columns keep the schema stable even when no record was found.
    df = pd.DataFrame(entries, columns=['EntryID', 'seq'])
    return df.set_index('EntryID')

In [None]:
# Input: training sequences parsed from the FASTA file.
trainFasta = read_fasta_file(
    '/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta'
)

# Output: term annotations for the training entries (tab-separated file).
trainTermsID = pd.read_csv(
    '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv',
    delimiter='\t',
)

# Test: sequences that predictions are generated for.
testFasta = read_fasta_file(
    '/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta'
)

# Share of annotation rows per term (in percent), then the number of distinct terms.
term_share = trainTermsID['term'].value_counts(normalize=True)
print(term_share * 100)
print(trainTermsID['term'].nunique())

In [None]:
# Number of annotation rows for each value of the 'aspect' column.
trainTermsID['aspect'].value_counts()

In [None]:
# Display the parsed training sequences (indexed by EntryID, one 'seq' column).
trainFasta

In [None]:
# Display the parsed test sequences (indexed by EntryID, one 'seq' column).
testFasta

In [None]:
# Display the raw term annotation table read from train_terms.tsv.
trainTermsID

In [None]:
# Average number of annotation rows per EntryID.
trainTermsID['EntryID'].value_counts().mean()

In [None]:
# Attach the sequence to every annotation row. With a left join, annotation
# rows whose EntryID has no sequence are kept.
trainFull = pd.merge(trainTermsID, trainFasta, how='left', on='EntryID')
trainFull

In [None]:
def keep_most_frequent_outputs(df, percentage, column='term'):
    """Keep only the rows whose label is among the most frequent labels.

    The distinct values of ``column`` are ranked by how many rows carry them;
    the top ``int(n_distinct * percentage)`` values are retained and every
    row with any other value is discarded. The share of rows removed is
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified.
    percentage : float
        Fraction of the distinct labels to keep (e.g. ``0.1`` keeps the top
        10% of labels). The resulting count is truncated toward zero, so a
        small fraction can select no label at all.
    column : str, default 'term'
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows.
    """
    # Frequency of every distinct label, most frequent first.
    output_counts = df[column].value_counts()

    # Number of distinct labels to keep.
    n = int(len(output_counts) * percentage)
    most_frequent_outputs = output_counts.head(n).index

    filtered_df = df[df[column].isin(most_frequent_outputs)].copy()

    # An empty input has nothing to remove; avoid dividing by zero.
    total_rows = len(df)
    if total_rows:
        removed_percentage = (1 - len(filtered_df) / total_rows) * 100
    else:
        removed_percentage = 0.0
    print("Percentage of database removed: {:.2f}%".format(removed_percentage))

    return filtered_df

# Keep the top 10% most frequent terms, then discard the identifier column.
trainFullD = keep_most_frequent_outputs(trainFull, 0.1).drop(columns='EntryID')

In [None]:
# Display the filtered annotation rows (EntryID column already dropped).
trainFullD

# Separando em 3 grupos

In [None]:
# One independent frame per aspect value; the 'aspect' column is constant
# inside each frame, so it is dropped.
trainBPO = trainFullD.loc[trainFullD['aspect'] == 'BPO'].drop(columns='aspect')
trainCCO = trainFullD.loc[trainFullD['aspect'] == 'CCO'].drop(columns='aspect')
trainMFO = trainFullD.loc[trainFullD['aspect'] == 'MFO'].drop(columns='aspect')

In [None]:
# Row count per term in the MFO subset, then its number of distinct sequences.
print(trainMFO['term'].value_counts())
print(trainMFO['seq'].nunique())

In [None]:
def get_most_common(df, rank=3):
    """Reduce a (seq, term) table to a single representative term per sequence.

    Every term is scored by how many rows of ``df`` carry it. For each
    sequence, its rows are ordered by that score (highest first, ties kept in
    their original row order) and the term at position ``rank`` is selected.
    A sequence with fewer than ``rank`` rows gets its last, i.e. lowest
    scored, term.

    Despite the function name, the default does NOT pick the single most
    frequent term: it picks the third-ranked one, which is what the original
    implementation did via ``nlargest(3, ...).iloc[-1]``.
    NOTE(review): presumably rank 3 is deliberate because the top-ranked
    terms are shared by almost every sequence - confirm before changing it.

    The selection is done with one stable sort and a group-wise position
    instead of filtering the whole frame once per sequence. The input frame
    is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``seq`` and a ``term`` column.
    rank : int, default 3
        1-based position, within a sequence's terms ordered by descending
        global frequency, of the term to select.

    Returns
    -------
    pandas.DataFrame
        Columns ``seq`` and ``term``, one row per distinct sequence, in order
        of first appearance in ``df``. Rows whose ``seq`` is missing are
        skipped.

    Raises
    ------
    ValueError
        If ``rank`` is smaller than 1.
    """
    if rank < 1:
        raise ValueError("rank must be >= 1, got {!r}".format(rank))

    # Global frequency of each row's term, kept in a private frame so the
    # caller's DataFrame does not gain a helper column.
    term_count = df['term'].map(df['term'].value_counts())
    work = pd.DataFrame({
        'seq': df['seq'].to_numpy(),
        'term': df['term'].to_numpy(),
        'term_count': term_count.to_numpy(),
    })
    # A missing sequence cannot be matched to any row; skip it.
    work = work[work['seq'].notna()].reset_index(drop=True)

    # Stable sort on the negated count: descending frequency, ties in
    # original row order, missing counts last.
    order = np.argsort(-work['term_count'].to_numpy(dtype='float64'), kind='stable')
    ranked = work.iloc[order]

    # Position of each row inside its sequence after ranking, and the number
    # of rows that sequence has.
    by_seq = ranked.groupby('seq', sort=False)
    position = by_seq.cumcount().to_numpy()
    group_size = by_seq['term'].transform('size').to_numpy()

    # Exactly one row per sequence: the rank-th, or the last one when the
    # sequence has fewer rows than rank.
    chosen = ranked[position == np.minimum(group_size, rank) - 1]
    term_by_seq = chosen.set_index('seq')['term']

    unique_seqs = work['seq'].unique()
    output_df = pd.DataFrame({
        'seq': unique_seqs,
        'term': term_by_seq.reindex(unique_seqs).to_numpy(),
    })
    return output_df

# Collapse each aspect's annotation rows to one term per distinct sequence,
# reporting progress after each aspect finishes.
trainBPOs = get_most_common(df=trainBPO)
print('BPO pronto')

trainCCOs = get_most_common(df=trainCCO)
print('CCO pronto')

trainMFOs = get_most_common(df=trainMFO)
print('MFO pronto')

# Modelo

In [None]:
# Work on copies so the reduced per-aspect frames stay available unchanged.
trainBPO, trainCCO, trainMFO = (
    reduced.copy() for reduced in (trainBPOs, trainCCOs, trainMFOs)
)

In [None]:
# Display the BPO training frame (one selected term per distinct sequence).
trainBPO

In [None]:
def create_model(df):
    """Fit a LightGBM classifier predicting ``term`` from letter counts of ``seq``.

    Steps:
      1. Discard every term that labels exactly one row.
      2. Split 80/20 into train and validation sets, stratified on ``term``
         with a fixed random state.
      3. Turn each sequence into a 26-column vector of per-letter counts
         (fixed a-z vocabulary, character analyzer).
      4. Train ``LGBMClassifier`` with the validation set as evaluation set,
         early stopping and periodic evaluation logging.
      5. Print ``model.score`` on the validation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``seq`` and a ``term`` column.

    Returns
    -------
    tuple
        ``(vectorizer, model)``: the fitted ``CountVectorizer`` and the
        trained ``LGBMClassifier``.
    """
    # Terms with a single row are removed, presumably because a stratified
    # split needs at least two samples per class.
    term_freq = df['term'].value_counts()
    singleton_terms = term_freq.loc[term_freq.eq(1)].index
    if len(singleton_terms) > 0:
        df = df[~df['term'].isin(singleton_terms)]

    gc.collect()
    Xtrain, Xval, ytrain, yval = train_test_split(
        df['seq'],
        df['term'],
        stratify=df['term'],
        test_size=0.2,
        random_state=42,
    )

    # Character counts over a fixed alphabet, converted to float64 matrices.
    vectorizer = CountVectorizer(vocabulary='abcdefghijklmnopqrstuvwxyz',
                                 analyzer='char')
    Xtrain = vectorizer.fit_transform(Xtrain).astype('float64')
    Xval = vectorizer.transform(Xval).astype('float64')
    gc.collect()

    model = LGBMClassifier(random_state=42)
    gc.collect()

    # Validation data drives both the logged metric and early stopping.
    training_callbacks = [early_stopping(100), log_evaluation(100)]
    model.fit(
        Xtrain,
        ytrain,
        eval_set=[(Xval, yval)],
        eval_metric='logloss',
        callbacks=training_callbacks,
    )
    gc.collect()

    validation_score = model.score(Xval, yval)
    print(validation_score)
    return vectorizer, model

# Train one vectorizer/classifier pair per aspect.
vectBPO, modelBPO = create_model(df=trainBPO)
vectCCO, modelCCO = create_model(df=trainCCO)
vectMFO, modelMFO = create_model(df=trainMFO)

In [None]:
# Force a garbage-collection pass after training, before building the test features.
gc.collect()

# Geração da submissão

In [None]:
def _predict_top_class(model, features):
    """Return (labels, confidences) for the most probable class of each row.

    The class probabilities are computed once. The label is the entry of
    ``model.classes_`` at the arg-max column and the confidence is that
    column's probability. This matches calling ``model.predict`` and
    ``model.predict_proba(...).max(axis=1)`` separately, without evaluating
    the model twice. The full probability matrix is local to this function,
    so it is released as soon as the two result vectors are built.
    """
    proba = model.predict_proba(features)
    return model.classes_[proba.argmax(axis=1)], proba.max(axis=1)


# Letter-count features of the test sequences, one matrix per fitted vectorizer.
XBPO = vectBPO.transform(testFasta['seq']).astype('float64')
XCCO = vectCCO.transform(testFasta['seq']).astype('float64')
XMFO = vectMFO.transform(testFasta['seq']).astype('float64')
print('Transformação feita')

# Predicted term and its probability for every test sequence, per aspect.
predBPO, probBPO = _predict_top_class(modelBPO, XBPO)
predCCO, probCCO = _predict_top_class(modelCCO, XCCO)
predMFO, probMFO = _predict_top_class(modelMFO, XMFO)
print('Predição feita')
print('Probabilidade feita')

In [None]:
# Combine the per-aspect predictions into one submission table.

# Every aspect is predicted for the same test entries, in the same order.
test_ids = testFasta.index.values

df_bpo = pd.DataFrame(dict(EntryID=test_ids, Prediction=predBPO, Probability=probBPO))
df_cco = pd.DataFrame(dict(EntryID=test_ids, Prediction=predCCO, Probability=probCCO))
df_mfo = pd.DataFrame(dict(EntryID=test_ids, Prediction=predMFO, Probability=probMFO))

# Stack the three frames on top of each other with a fresh row index.
submission_df = pd.concat((df_bpo, df_cco, df_mfo), axis=0, ignore_index=True)

# Tab-separated output without header row or index column.
submission_df.to_csv('submission.tsv', sep='\t', index=False, header=False)

# How often each term was predicted, followed by the table itself.
print(submission_df['Prediction'].value_counts())
submission_df