In [None]:
# Bibs gerais
import gc
import numpy as np
import pandas as pd
import seaborn as sns

# Abordagens 1 e 4
import re
import unicodedata
from tkinter.tix import TCL_WINDOW_EVENTS
from sklearn.feature_extraction.text import TfidfVectorizer

from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from lightgbm import early_stopping
from lightgbm import log_evaluation
from tqdm import tqdm

# Abordagens 2 e 3
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import AutoModel

# Load the competition files; every CSV is indexed by its `id` column.
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv', index_col='id')
x_sub = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv', index_col='id')
y_sub = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/sample_submission.csv', index_col='id')

# Preview the first training rows
train.head()

In [None]:
# Instalando e importando tradução
!pip install googletrans==3.1.0a0

from googletrans import Translator

# Observando o dataset

In [None]:
train.isnull().sum()

In [None]:
train.describe()

In [None]:
sns.displot(train.label);

In [None]:
print(len(train.language.unique()))
print(train.language.value_counts())

In [None]:
def count_values_by_category(df, count_column_name, category_column_name):
    """
    Cross-tabulate two columns of a pandas dataframe.

    The result has one row per distinct value of ``category_column_name`` and
    one column per distinct value of ``count_column_name``; each cell holds the
    number of rows with that pair of values, with 0 for pairs that never occur.
    """
    # Per-category view of the column whose values are counted
    values_per_category = df.groupby(category_column_name)[count_column_name]

    # Long-format counts indexed by (category, value)
    pair_counts = values_per_category.value_counts()

    # Pivot the counted values into columns, filling absent pairs with 0
    return pair_counts.unstack(fill_value=0)

# Label counts for each language code
counts_df = count_values_by_category(
    df=train,
    count_column_name='label',
    category_column_name='lang_abv',
)
counts_df

# Abordagem 1: tradução

Copiando para processar

In [None]:
# Work on an independent copy so the translation/cleaning steps leave `train` untouched
trainC = train.copy(deep=True)

In [None]:
# The shared translation client is created lazily inside translate_to_english,
# the first time a non-English text has to be translated.

def translate_to_english(text, lang):
    """
    Translate ``text`` to English, returning English rows unchanged.

    Args:
        text (str): Text to translate.
        lang (str): Language code of ``text``; compared case-insensitively.

    Returns:
        ``text`` itself when ``lang`` is 'en'; otherwise the ``.text``
        attribute of the object returned by ``translator.translate``.

    Notes:
        Uses the module-level ``translator``. If no such object exists yet,
        one is created with ``Translator()`` and kept for later calls, so the
        function no longer fails with NameError on a fresh kernel.
    """
    global translator

    code = lang.lower()
    if code == 'en':
        # Return the original text for English rows
        return text

    # Create the shared client on first use
    try:
        translator
    except NameError:
        translator = Translator()

    if code == 'zh':
        # Chinese rows are translated without passing `src`
        # (presumably because 'zh' is not a code the client accepts -- TODO confirm)
        return translator.translate(text, dest='en').text

    # Every other language is translated with its own code as the source
    return translator.translate(text, src=lang, dest='en').text

# Register `progress_apply` on pandas objects
tqdm.pandas()

# Translate both text columns row by row: premise first, then hypothesis
for text_col in ('premise', 'hypothesis'):
    trainC[text_col] = trainC.progress_apply(
        lambda row, col=text_col: translate_to_english(row[col], row['lang_abv']),
        axis=1,
    )

Funções para remoção de complexidade

In [None]:
def remove_url_func(text):
    '''
    Strips URL addresses out of a string

    Args:
        text (str): Input string

    Returns:
        The string with every token starting with http://, https:// or www.
        (up to the next whitespace) deleted
    '''
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_punctuation_func(text):
    '''
    Replaces every character that is not an ASCII letter or digit with a space

    This covers punctuation, but also whitespace characters and any
    non-ASCII character such as accented letters.

    Args:
        text (str): Input string

    Returns:
        String of the same length containing only ASCII letters, digits and spaces
    '''
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    return non_alphanumeric.sub(' ', text)

def remove_extra_whitespaces_func(text):
    '''
    Collapses whitespace in a string

    Args:
        text (str): Input string

    Returns:
        String in which every run of whitespace is a single space and
        leading/trailing whitespace is gone
    '''
    # split() with no argument cuts on whitespace runs and drops empty pieces
    words = text.split()
    return ' '.join(words)

def remove_accented_chars_func(text):
    '''
    Reduces a string to its ASCII characters, turning accented letters into
    their unaccented base letter

    Args:
        text (str): Input string

    Returns:
        String where characters are NFKD-decomposed and everything that
        cannot be encoded as ASCII is dropped
    '''
    # NFKD splits an accented letter into base letter + combining mark
    decomposed = unicodedata.normalize('NFKD', text)
    # Encoding with 'ignore' discards the marks and any other non-ASCII character
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# Cleaning steps, listed in the order in which they are applied to each column.
# NOTE(review): remove_accented_chars_func runs after remove_punctuation_func,
# which has already replaced every non-ASCII character with a space, so the
# accent step cannot change anything here. The order is kept to preserve behaviour.
cleaning_steps = (
    str.lower,
    remove_url_func,
    remove_punctuation_func,
    remove_accented_chars_func,
    remove_extra_whitespaces_func,
)

for text_col in ('premise', 'hypothesis'):
    for step in cleaning_steps:
        trainC[text_col] = trainC[text_col].apply(step)

Aplicando TFIDF e subtraindo as colunas de texto

In [None]:
def tfidf_transform(df, col1, col2):
    """
    Fit one TF-IDF vectorizer on two text columns of a pandas dataframe and
    return it together with the difference of the two vectorized columns.

    :param df: pandas dataframe
    :param col1: name of the first text column
    :param col2: name of the second text column
    :return: tuple ``(tfidf, X)`` where ``tfidf`` is the fitted
             TfidfVectorizer and ``X`` is ``transform(col1) - transform(col2)``
    """
    # The vectorizer is fitted on "<col1> <col2>" joined row by row
    combined_text = df[col1].str.cat(df[col2], sep=' ')
    tfidf = TfidfVectorizer().fit(combined_text)

    # Each column is vectorized separately with the shared vectorizer
    col1_vectors, col2_vectors = (tfidf.transform(df[name]) for name in (col1, col2))

    return tfidf, col1_vectors - col2_vectors

# Fit on the cleaned training text and build the feature matrix
tfidf, X = tfidf_transform(trainC, 'premise', 'hypothesis')

Aplicando modelo

In [None]:
y = trainC['label']
# Dense dataframe copy of the TF-IDF difference matrix
Xpd = pd.DataFrame(X.toarray())

# Hold out 15% of the rows for validation
X_train, X_val, y_train, y_val = train_test_split(Xpd, y, test_size=0.15, random_state=42)

# Create the LGBMClassifier model
model = LGBMClassifier(objective='multiclass', num_class=3, n_jobs=-1, random_state=42)

# Early-stopping settings.
# `early_stopping_rounds` used to be set to 10 but was never passed anywhere,
# while the callback hard-coded 100. The variable now feeds the callback and
# holds the value that was actually in effect, so training is unchanged.
eval_set = [(X_val, y_val)]
early_stopping_rounds = 100
log_period = 100

# Train the model with early stopping on the validation set
model.fit(X_train, y_train,
          callbacks=[early_stopping(early_stopping_rounds), log_evaluation(log_period)],
          eval_metric='logloss',
          eval_set=eval_set)

## Gerando submissão

In [None]:
x_subC = x_sub.copy()

In [None]:
tqdm.pandas(desc="Translation Progress")
x_subC['premise'] = x_subC.progress_apply(lambda row: translate_to_english(row['premise'],
                                                                           row['lang_abv']), axis=1)
x_subC['hypothesis'] = x_subC.progress_apply(lambda row: translate_to_english(row['hypothesis'],
                                                                              row['lang_abv']), axis=1)

In [None]:
# Cleaning steps, listed in the order in which they are applied to each column.
# NOTE(review): remove_accented_chars_func runs after remove_punctuation_func,
# which has already replaced every non-ASCII character with a space, so the
# accent step cannot change anything here. The order is kept to preserve behaviour.
submission_cleaning_steps = (
    str.lower,
    remove_url_func,
    remove_punctuation_func,
    remove_accented_chars_func,
    remove_extra_whitespaces_func,
)

for text_col in ('premise', 'hypothesis'):
    for step in submission_cleaning_steps:
        x_subC[text_col] = x_subC[text_col].apply(step)

In [None]:
x1 = tfidf.transform(x_subC['premise'])
x2 = tfidf.transform(x_subC['hypothesis'])

Xsub = x1 - x2
Xsub = pd.DataFrame(Xsub.toarray())

In [None]:
ID = x_sub.index.values
prevt = model.predict(Xsub)
submission = pd.DataFrame({'id' : ID,'prediction' : prevt })
submission.to_csv('submission.csv',index = False)
submission.describe()

# Abordagem 2: Pré-treinado sem alteração

In [None]:
checkpoint = "symanto/xlm-roberta-base-snli-mnli-anli-xnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
trainer = Trainer(model, tokenizer=tokenizer)

In [None]:
train_ds = Dataset.from_pandas(train)
train_ds

In [None]:
def tokenize(batch):
    """
    Tokenize premise/hypothesis pairs with the module-level ``tokenizer``.

    Args:
        batch: Mapping with 'premise' and 'hypothesis' entries, each passed
            to the tokenizer as given.

    Returns:
        Whatever ``tokenizer`` returns for the pair of inputs.
    """
    # truncation=True asks the tokenizer to clip pairs that exceed its maximum
    # length instead of handing over-long sequences to the model
    return tokenizer(batch['premise'], batch['hypothesis'], truncation=True)

In [None]:
train_ds = train_ds.map(tokenize, batched=True, remove_columns=list(train.columns))
train_ds

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(checkpoint).to(device)

Geração da Submissão

In [None]:
# Build the tokenized test dataset, dropping the original dataframe columns
test_ds = Dataset.from_pandas(x_sub).map(
    tokenize,
    batched=True,
    remove_columns=list(x_sub.columns),
)

# Predicted class = index of the largest value along the last axis of the predictions
test_output = trainer.predict(test_ds)
preds_test = test_output.predictions.argmax(axis=-1)
preds_test.shape

In [None]:
submission = pd.DataFrame({'id': ID, 'prediction': preds_test})
submission

In [None]:
submission.to_csv('submission.csv', index=False)