In [1]:
import pandas as pd

In [6]:
df = pd.read_csv("../../data/procesed.csv", index_col=0)

In [7]:
df

Unnamed: 0,tweets,labels
0,"@user Escribió un libro resultón, con gracejo,...",2
1,@user Lo prometido es deuda. Aquí la foto: .,2
2,@user Bastante ñoña. Me jarté a llorar. De lo ...,2
3,"@user No sé nada acerca de eso, pero está clar...",2
4,"@user ¿En qué medio tienen su podcast esos, di...",2
...,...,...
14395,Lo que está ocurriendo hoy es una forma curios...,1
14396,"@user Gracias, bellísima! Nos debemos un café-...",1
14397,@user Es un análisis muy precipitado ese de qu...,1
14398,Hace días veo en redes cómo algunos se burlan ...,1


In [3]:
import spacy
nlp = spacy.load("es_core_news_md")
def clear_stopwords(tweet):
    return " ".join([token.text.lower() for token in nlp(tweet) 
                     if not token.is_stop
                     and not token.is_punct
                     and not token.text.lower() in ["@user","political_party", "politician", "hashtag", "user"]
                    ])

In [8]:
df["tweet_clean"] = df["tweets"].apply(clear_stopwords)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer='word',
    max_features = 50_000,
    lowercase=True
)
X = vectorizer.fit_transform(df["tweet_clean"])
#X_test

In [43]:
X

<14400x33584 sparse matrix of type '<class 'numpy.float64'>'
	with 165522 stored elements in Compressed Sparse Row format>

In [163]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweets, labels, vectorizer):
        self.tweets = tweets
        self.labels = labels
        self.vectorizer = vectorizer
    def __len__(self):
        return len(self.tweets)
    def __getitem__(self, item):
        return {
            'tweet': torch.tensor(self.vectorizer.transform([self.tweets[item]]).toarray()).to(torch.float32),
            'label': self.labels[item]
        }

def create_data_loader(df, vectorizer, batch_size = 16):
    return DataLoader(
        PoliticESDataset(
            tweets = df.tweets.to_numpy(),
            labels = df.labels.to_numpy(),
            vectorizer = vectorizer
        ),
        batch_size = batch_size,
        shuffle=True
    )

In [167]:
import numpy as np
from torch import nn
import torch.nn.functional as F

class TfIdfNetwork(torch.nn.Module):
    def __init__(self, input_size):
        super(TfIdfNetwork, self).__init__()
        self.linear = nn.Linear(input_size,1024)
        self.dropout = nn.Dropout(0.15)
        self.linear1 = nn.Linear(1024, 4)
    
    def forward(self, x):
        t = self.linear(x.to(torch.float32))
        t = self.dropout(t)
        t = self.linear1(t)
        return F.softmax(t, dim=1)


In [179]:
from tqdm import tqdm
def fit(model, loader, criterion, optimizer, total_steps):
    model = model.train()
    running_loss = 0.
    for d in tqdm(loader, total = total_steps/16):
        optimizer.zero_grad()
        pred = model(d["tweet"].squeeze())
        loss = criterion(pred, d["label"])
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss

def train(model, x, vectorizer, epochs=3, lr=5e-3):
    criterion = nn.CrossEntropyLoss()
    loader = create_data_loader(x, vectorizer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(epochs):
        loss = fit(model, loader, criterion, optimizer, len(x))
        print(f'Loss: {loss}')

In [180]:
model = TfIdfNetwork(X.shape[1])
train(model, df_train, vectorizer)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 900/900.0 [01:05<00:00, 13.80it/s]


Loss: 1166.0590681433678


  0%|                                                                                                                                                            | 0/900.0 [00:00<?, ?it/s]


TypeError: 'float' object is not callable

In [91]:
df_train = pd.DataFrame({
    "tweets": df["tweet_clean"],
    "labels": df["labels"]
})