In [None]:
import pandas as pd
import spacy

df_train= pd.read_csv("politicES_phase_2_train_public.csv", on_bad_lines='skip')
df_test= pd.read_csv("politicES_phase_2_test_codalab.csv", on_bad_lines='skip')

In [None]:
# Agrupar los tweets por 'label' y 'ideology_multiclass' y luego concatenar los tweets
df_train= df_train.groupby(['label', 'ideology_multiclass'])['tweet'].apply(' '.join).reset_index()
df_test= df_test.groupby(['label', 'ideology_multiclass'])['tweet'].apply(' '.join).reset_index()

In [None]:
df_train.drop('label', axis=1, inplace=True)
df_test.drop('label', axis=1, inplace=True)

In [None]:
!pip install -U spacy



In [None]:
!python -m spacy download es_core_news_lg

Collecting es-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_lg-3.7.0/es_core_news_lg-3.7.0-py3-none-any.whl (568.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.0/568.0 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: es-core-news-lg
Successfully installed es-core-news-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
nlp = spacy.load('es_core_news_lg')

def preprocess_text(tweet):
    # Procesa el tweet usando el modelo de spaCy
    doc = nlp(tweet)
    # Lematiza y elimina stopwords
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

In [None]:
df_train['clean_tweet'] = df_train['tweet'].apply(preprocess_text)
df_test['clean_tweet'] = df_test['tweet'].apply(preprocess_text)

In [None]:
df_train.to_csv('train_tokenized.csv')
df_test.to_csv('test_tokenized.csv')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df_train= pd.read_csv('/content/drive/My Drive/TFM/train_tokenized.csv')
df_test= pd.read_csv('/content/drive/My Drive/TFM/test_tokenized.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_train['ideology_multiclass'] = label_encoder.fit_transform(df_train['ideology_multiclass'])
df_test['ideology_multiclass'] = label_encoder.fit_transform(df_test['ideology_multiclass'])

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model.eval()

LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
    

In [7]:
import numpy as np

encodings = tokenizer(df_train['clean_tweet'].tolist(), return_tensors="pt", padding='max_length', max_length=4096, truncation=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Ajuste el tamaño del lote
batch_size = 2  # Ajusta según la capacidad de tu GPU

# Función para procesar los textos en lotes
def batch_encode(model, inputs, batch_size=2):
    batch_start = 0
    batch_end = batch_size
    all_embeddings = []

    while batch_start < len(inputs['input_ids']):
        batch_inputs = {k: v[batch_start:batch_end] for k, v in inputs.items()}
        batch_inputs = {k: v.to(device) for k, v in batch_inputs.items()}
        with torch.no_grad():
            outputs = model(**batch_inputs)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings.cpu().numpy())
        batch_start += batch_size
        batch_end += batch_size

    return np.vstack(all_embeddings)

# Llama a la función
embeddings = batch_encode(model, encodings, batch_size)


In [10]:

y = df_train['ideology_multiclass']

# Dividir el conjunto de entrenamiento para validación (10% para test)
X_train, X_val, y_train, y_val = train_test_split(embeddings, y, test_size=0.3, random_state=42)

# Definir varios modelos para probar
modelos = {
    'XGB': xgb.XGBClassifier(),
    'Regresión Logística': LogisticRegression(),
    'SVM': SVC(),

}

# Entrenar y evaluar cada modelo
for nombre, modelo in modelos.items():
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_val)
    macro_f1 = f1_score(y_val, y_pred, average='macro')
    print(f"{nombre} - Macro F1 Score: {macro_f1}")

XGB - Macro F1 Score: 0.841227224572884


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Regresión Logística - Macro F1 Score: 0.49501625794711374
SVM - Macro F1 Score: 0.12583148558758314


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score


# Definir el modelo y los parámetros para GridSearchCV
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Definir los parámetros para el Grid Search
param_grid = {
    'max_depth': [3, 5, 7, None],  # Profundidades del árbol
    'learning_rate': [0.01, 0.1, 0.2],  # Tasa de aprendizaje
    'n_estimators': [100, 200],  # Número de árboles
    'subsample': [0.8, 1]  # Porcentaje de muestras usadas por árbol
}
# Crear un GridSearchCV
clf = GridSearchCV(model, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)

# Ajustar el modelo
clf.fit(X_train, y_train)

# Mejores parámetros y mejor score
print("Mejores parámetros:", clf.best_params_)
print("Mejor puntuación de cross-validation (macro F1):", clf.best_score_)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


  pid = os.fork()
  pid = os.fork()


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

# Definir el modelo y los parámetros para GridSearchCV
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

In [10]:
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model2 = LongformerModel.from_pretrained('allenai/longformer-base-4096')
model2.eval()

encodings_test = tokenizer(df_test['clean_tweet'].tolist(), return_tensors="pt", padding='max_length', max_length=4096, truncation=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2 = model2.to(device)


In [None]:
X_test = batch_encode(model2, encodings_test, batch_size)
y_test = df_test['ideology_multiclass']

In [13]:
y_pred = model.predict(X_test)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Macro F1 en el conjunto de prueba:", macro_f1)
print(classification_report(y_test, y_pred))

Macro F1 en el conjunto de prueba: 0.40615058976933327
              precision    recall  f1-score   support

           0       0.43      0.37      0.40       117
           1       0.50      0.69      0.58       210
           2       0.45      0.42      0.43       153
           3       0.56      0.13      0.22        67

    accuracy                           0.48       547
   macro avg       0.49      0.40      0.41       547
weighted avg       0.48      0.48      0.45       547

