In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import matplotlib.pyplot as plt
import gc
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import log_evaluation

import scipy.sparse as sp
from sklearn.metrics import accuracy_score, precision_score, recall_score

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Carregando o banco de dados

In [None]:
def read_fasta_file(file_path):
    """Parse a FASTA file into a DataFrame of sequences indexed by entry ID.

    Each record starts with a header line beginning with ``>``. The entry ID
    is the first whitespace-delimited token after the ``>``; the rest of the
    header is ignored. All following lines up to the next header are stripped
    and concatenated into the record's sequence. Lines that appear before the
    first header are skipped.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A DataFrame with a single ``'seq'`` column and an index named
        ``'EntryID'``, one row per record in file order. A file without any
        record yields an empty frame with that same layout.

    Raises:
        ValueError: If a header line carries no entry ID.
    """
    entry_ids = []
    sequences = []
    current_chunks = None  # stays None until the first header has been seen

    with open(file_path, "r") as fasta_file:
        # Stream the file line by line instead of loading it whole.
        for line_number, line in enumerate(fasta_file, start=1):
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if current_chunks is not None:
                    sequences.append("".join(current_chunks))
                header_tokens = line[1:].split()
                if not header_tokens:
                    raise ValueError(
                        "FASTA header without an entry ID at line {} of {}".format(
                            line_number, file_path
                        )
                    )
                entry_ids.append(header_tokens[0])
                current_chunks = []
            elif current_chunks is not None:
                current_chunks.append(line.strip())

    # Close the last record, which has no following header to terminate it.
    if current_chunks is not None:
        sequences.append("".join(current_chunks))

    # Build the frame directly so an empty file still has the 'seq' column
    # and the named index.
    return pd.DataFrame(
        {'seq': sequences},
        index=pd.Index(entry_ids, name='EntryID'),
    )

In [None]:
# Pre-processed training tables, one CSV each for BPO, CCO and MFO
# (presumably the three GO sub-ontologies — confirm against the dataset).
# NOTE(review): train_model() uses 'seq' as the input and every other column
# as a label, so these CSVs must not carry extra columns such as an ID or a
# saved index — verify the CSV schema.
trainBPO = pd.read_csv('/kaggle/input/cafa-5-ohe-wm/trainBPO.csv')
trainCCO = pd.read_csv('/kaggle/input/cafa-5-ohe-wm/trainCCO.csv')
trainMFO = pd.read_csv('/kaggle/input/cafa-5-ohe-wm/trainMFO.csv')

# Test set: sequences parsed from the FASTA file, indexed by EntryID
testFasta = read_fasta_file('/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta')

# Show the parsed test frame as this cell's output
testFasta

In [None]:
# Inspect the BPO training table (rendered as this cell's output)
trainBPO

In [None]:
# Inspect the CCO training table (rendered as this cell's output)
trainCCO

In [None]:
# Inspect the MFO training table (rendered as this cell's output)
trainMFO

# Treinamento

In [None]:
def get_model():
    """Build an unfitted multi-label classifier.

    Returns:
        A ClassifierChain whose base estimator is an LGBMClassifier configured
        with a binary objective and GBDT boosting. Both the chain and the base
        estimator are seeded with ``random_state=42``.
    """
    binary_learner = LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        random_state=42,
    )
    chain = ClassifierChain(base_estimator=binary_learner, random_state=42)
    return chain

def train_model(df):
    """Fit a character-count vectorizer and a classifier chain on one table.

    Args:
        df: DataFrame with a 'seq' column of sequence strings. Every other
            column is used as a label column.

    Returns:
        Tuple ``(vectorizer, model)``: the fitted CountVectorizer and the
        fitted model obtained from ``get_model()``.

    Side effects:
        Prints accuracy, micro-averaged precision and micro-averaged recall
        measured on a 20% hold-out split (``random_state=42``).
    """
    # Character-level counts restricted to a fixed vocabulary of the 26
    # lowercase ASCII letters.
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    char_counter = CountVectorizer(analyzer='char', vocabulary=alphabet)

    # Inputs are the sequences; labels are all remaining columns.
    sequences = df['seq']
    labels = df.drop('seq', axis=1)
    seq_fit, seq_holdout, y_fit, y_holdout = train_test_split(
        sequences, labels, test_size=0.2, random_state=42
    )

    # Vectorize both splits and cast features and labels to float32.
    features_fit = char_counter.fit_transform(seq_fit).astype(np.float32)
    features_holdout = char_counter.transform(seq_holdout).astype(np.float32)
    y_fit = y_fit.astype(np.float32)
    y_holdout = y_holdout.astype(np.float32)

    # Train the chain on the fitting split.
    chain = get_model()
    chain.fit(features_fit, y_fit)

    # Score the hold-out split, then report the three metrics in order.
    holdout_pred = chain.predict(features_holdout)
    scores = (
        ("Accuracy: {:.4f}", accuracy_score(y_holdout, holdout_pred)),
        ("Precision: {:.4f}", precision_score(y_holdout, holdout_pred, average='micro')),
        ("Recall: {:.4f}", recall_score(y_holdout, holdout_pred, average='micro')),
    )
    for template, value in scores:
        print(template.format(value))

    return char_counter, chain

# Train one vectorizer/model pair per training table; each train_model() call
# also prints its hold-out metrics under the heading printed just before it.
print('Training BPO')
vecBPO, modBPO = train_model(trainBPO)
print('\nTraining CCO')
vecCCO, modCCO = train_model(trainCCO)
print('\nTraining MFO')
vecMFO, modMFO = train_model(trainMFO)

# Geração da submissão

In [None]:
# Turn the test sequences into float32 character-count features with each
# fitted vectorizer.
# NOTE(review): train_model() builds every vectorizer with the same fixed a-z
# vocabulary, so these three matrices look identical — a single transform
# could probably be shared; confirm before changing.
XBPO = vecBPO.transform(testFasta['seq']).astype(np.float32)
XCCO = vecCCO.transform(testFasta['seq']).astype(np.float32)
XMFO = vecMFO.transform(testFasta['seq']).astype(np.float32)
gc.collect()
print('Transformação feita')

In [None]:
# Predict per-label probabilities for the test features with each trained
# chain, running a garbage-collection pass and printing a marker after each one.
predBPO = modBPO.predict_proba(XBPO)
gc.collect()
print('BPO completo')
predCCO = modCCO.predict_proba(XCCO)
gc.collect()
print('CCO completo')
predMFO = modMFO.predict_proba(XMFO)
gc.collect()
print('MFO completo')

In [None]:
def process_matrix(matrix, threshold=0.1):
    """Zero out entries below ``threshold`` and return the result as CSR.

    The input is never modified; a new sparse matrix is always returned.

    Args:
        matrix: Dense array-like or any SciPy sparse matrix of scores.
            One-dimensional dense input is treated as a single row.
        threshold: Entries strictly less than this value are dropped
            (become implicit zeros). Defaults to 0.1.

    Returns:
        A ``scipy.sparse.csr_matrix`` with the same shape and dtype as the
        input, storing only the non-zero entries that are not below
        ``threshold``.
    """
    if sp.issparse(matrix):
        # Work on the stored values only: no dense mask is built and the
        # caller's matrix is left untouched.
        result = sp.csr_matrix(matrix, copy=True)
        result.data[result.data < threshold] = 0
        result.eliminate_zeros()  # drop the entries that were just zeroed
        return result

    dense = np.atleast_2d(np.asarray(matrix))
    # Keep what the comparison does not flag as "below threshold" (this also
    # keeps NaN, which compares False) and skip exact zeros so they are never
    # stored explicitly.
    keep = ~(dense < threshold) & (dense != 0)
    rows, cols = np.nonzero(keep)
    # Build the CSR matrix from the surviving entries without writing into
    # the input array.
    return sp.csr_matrix((dense[rows, cols], (rows, cols)), shape=dense.shape)

# Zero out low probabilities and convert each prediction matrix to sparse CSR,
# rebinding the original names to the results and collecting garbage after each.
predBPO = process_matrix(predBPO)
print('BPO completo')
gc.collect()
predCCO = process_matrix(predCCO)
print('CCO completo')
gc.collect()
predMFO = process_matrix(predMFO)
print('MFO completo')
gc.collect()

In [None]:
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``.

    Args:
        text: String to trim.
        prefix: Prefix to strip when ``text`` starts with it.

    Returns:
        ``text`` with ``prefix`` removed from the front, or ``text`` unchanged
        when it does not start with ``prefix``.
    """
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

def reorganize_output(predictions, id_column, classes):
    """Convert a prediction matrix into a long-format table of non-zero scores.

    Args:
        predictions: Matrix of scores with one row per id and one column per
            class. Dense arrays and SciPy sparse matrices are both accepted;
            the input is not modified.
        id_column: Sequence of row identifiers, one per row of
            ``predictions``.
        classes: Sequence of class labels, one per column. A leading
            ``'term_'`` is stripped from each label.

    Returns:
        A DataFrame with columns ``'id'``, ``'term'`` and ``'proba'``, holding
        one row per non-zero score, ordered by matrix row and then by column.
        The three columns are present even when there are no non-zero scores.

    Raises:
        ValueError: If the number of ids differs from the number of rows.
    """
    # Private CSR copy so the clean-up below never touches the caller's data.
    scores = sp.csr_matrix(predictions, copy=True)
    if scores.shape[0] != len(id_column):
        raise ValueError(
            "predictions has {} rows but id_column has {} entries".format(
                scores.shape[0], len(id_column)
            )
        )

    # Drop explicitly stored zeros and sort column indices so the COO triplets
    # come out in row-major, ascending-column order.
    scores.eliminate_zeros()
    scores.sort_indices()
    entries = scores.tocoo()

    ids = np.asarray(id_column, dtype=object)
    terms = np.array(
        [remove_prefix(label, 'term_') for label in classes], dtype=object
    )

    # Expand every stored entry to (id, term, proba) in one vectorized step.
    return pd.DataFrame({
        'id': ids[entries.row],
        'term': terms[entries.col],
        'proba': entries.data,
    })

# Reorganize each prediction matrix into long format, using the test EntryIDs
# as row ids and the training label columns (everything except 'seq') as terms
outBPO = reorganize_output(predBPO, testFasta.index,
                           trainBPO.drop('seq', axis=1).columns)
print('BPO completo')
outCCO = reorganize_output(predCCO, testFasta.index,
                           trainCCO.drop('seq', axis=1).columns)
print('CCO completo')
outMFO = reorganize_output(predMFO, testFasta.index,
                           trainMFO.drop('seq', axis=1).columns)
print('MFO completo')

# Stack the three long tables into one frame, print how many rows each term
# has, and display the combined frame as the cell output
submission = pd.concat([outBPO, outCCO, outMFO], ignore_index=True)
print(submission['term'].value_counts())
submission

In [None]:
# Write the submission to the working directory as a tab-separated file with
# no header row and no index column
submission.to_csv('submission.tsv', sep='\t', header=False, index=False)

In [None]:
# Final garbage-collection pass after the submission has been written
gc.collect()