In [49]:
import pandas as pd
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [50]:
df = pd.read_csv("../../data/procesed.csv", index_col=0)

In [51]:
df

Unnamed: 0,tweets,labels
0,"@user Escribió un libro resultón, con gracejo,...",2
1,@user Lo prometido es deuda. Aquí la foto: .,2
2,@user Bastante ñoña. Me jarté a llorar. De lo ...,2
3,"@user No sé nada acerca de eso, pero está clar...",2
4,"@user ¿En qué medio tienen su podcast esos, di...",2
...,...,...
14395,Lo que está ocurriendo hoy es una forma curios...,1
14396,"@user Gracias, bellísima! Nos debemos un café-...",1
14397,@user Es un análisis muy precipitado ese de qu...,1
14398,Hace días veo en redes cómo algunos se burlan ...,1


In [60]:
import spacy
nlp = spacy.load("es_core_news_md")
def clear_stopwords(tweet):
    return " ".join([token.lemma_ for token in nlp(tweet) 
                     if not token.is_stop
                     and not token.is_punct
                     and not token.text.lower() in ["@user","political_party", "politician", "hashtag", "user"]
                    ])

In [61]:
df["tweet_clean"] = df["tweets"].apply(clear_stopwords)

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='word',
    max_features = 50_000,
    lowercase=True
)
X = vectorizer.fit_transform(df["tweet_clean"])

In [63]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweets, labels, vectorizer):
        self.tweets = tweets
        self.labels = labels
        self.vectorizer = vectorizer
    def __len__(self):
        return len(self.tweets)
    def __getitem__(self, item):
        return {
            'tweet': torch.tensor(self.vectorizer.transform([self.tweets[item]]).toarray()).to(torch.float32),
            'label': self.labels[item]
        }

def create_data_loader(df, vectorizer, batch_size = 16):
    return DataLoader(
        PoliticESDataset(
            tweets = df.tweets.to_numpy(),
            labels = df.labels.to_numpy(),
            vectorizer = vectorizer
        ),
        batch_size = batch_size,
        shuffle=True
    )

In [64]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

class TfIdfNetwork(torch.nn.Module):
    def __init__(self, input_size):
        super(TfIdfNetwork, self).__init__()
        self.linear = nn.Linear(input_size,1024).to(device)
        self.dropout = nn.Dropout(0.15).to(device)
        self.linear1 = nn.Linear(1024, 4).to(device)
    
    def forward(self, x):
        t = self.linear(x.to(torch.float32).to(device))
        t = self.dropout(t)
        t = self.linear1(t)
        return F.softmax(t, dim=1)


In [65]:
from tqdm import tqdm
def fit(model, loader, criterion, optimizer, total_steps):
    model = model.train()
    running_loss = 0.
    for d in tqdm(loader, total = total_steps/16):
        optimizer.zero_grad()
        pred = model(d["tweet"].squeeze())
        loss = criterion(pred.to(device), d["label"].to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss

def train(model, x, vectorizer, epochs=3, lr=1e-3):
    criterion = nn.CrossEntropyLoss()
    loader = create_data_loader(x, vectorizer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(epochs):
        loss = fit(model, loader, criterion, optimizer, len(x))
        print(f'Loss: {loss}')

In [66]:
df_train = pd.DataFrame({
    "tweets": df["tweet_clean"],
    "labels": df["labels"]
})
model = TfIdfNetwork(X.shape[1])
train(model, df_train, vectorizer, epochs=8, lr=1e-4)

100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:20<00:00, 44.32it/s]


Loss: 1201.3125187158585


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 54.96it/s]


Loss: 1141.0328485965729


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 54.69it/s]


Loss: 1074.6670212745667


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 54.15it/s]


Loss: 1012.0784994363785


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 53.91it/s]


Loss: 942.3421971797943


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 54.76it/s]


Loss: 886.6241592168808


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 53.60it/s]


Loss: 835.1191130876541


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [00:16<00:00, 54.87it/s]

Loss: 794.7936396598816





In [73]:
def test(model, df_test, vectorizer):
    model = model.eval()
    criterion = nn.CrossEntropyLoss()
    preds = np.array([])
    loader = create_data_loader(df_test, vectorizer)
    running_loss = .0
    with torch.no_grad():
        for d in loader:
            pred = model(d["tweet"].squeeze())
            loss = criterion(pred, d["label"].to(device))
            running_loss += loss.item()
            preds = np.concatenate((preds, torch.argmax(pred.cpu(), dim=1).numpy()))
    return preds, running_loss

In [74]:
df_test = pd.read_csv("../../data/procesed_test.csv")
preds, test_loss = test(model, df_test, vectorizer)

In [75]:
from sklearn.metrics import classification_report

print(classification_report(preds, df_test["labels"]))

              precision    recall  f1-score   support

         0.0       0.43      0.38      0.41      1636
         1.0       0.42      0.30      0.35      1501
         2.0       0.01      0.18      0.02        38
         3.0       0.12      0.12      0.12       425

    accuracy                           0.32      3600
   macro avg       0.25      0.25      0.22      3600
weighted avg       0.39      0.32      0.35      3600

