In [122]:
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

In [113]:
df_tweets = pd.read_json('../../data/embedded_tweets.jl', lines=True, dtype=False)
df_train = pd.read_json('../../data/embedded_train_150.jl', lines=True)

In [114]:
df_train.loc[df_train["target"] == "NA", ["target"]] = 0  # added

df_train['target'] = df_train['target'].astype(str)
df_train = df_train.rename(columns={'id':'id_str'})
df_train['id_str'] = df_train['id_str'].astype(str)

In [115]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

split_df = pd.DataFrame(df_train["embedding"].tolist())
df_train = pd.concat([split_df, df_train], axis=1)

le = LabelEncoder()
le.fit(df_train["target"])
df_train["target"] = df_train["target"].apply(lambda x: le.transform([x])[0])

additional = ["id_str", "created_at", "full_text", "embedding", "target"]
x = df_train.drop(additional, axis=1).values
y = df_train.loc[:, ["target"]].values

x = StandardScaler().fit_transform(x)

pca = PCA(213)
pca.fit(x)

PCA(n_components=213)

In [116]:
split_df = pd.DataFrame(df_tweets["embedding"].tolist())
temp_df = pd.concat([split_df, df_tweets], axis=1)

additional = ["id_str", "created_at", "download_datetime", "full_text", "favorite_count", "in_reply_to_screen_name", "lang", "quote_count", "reply_count", "retweet_count", "user_id_str", "embedding"]

x = temp_df.drop(additional, axis=1).values
# x = pca.transform(x)

df_tweets["embedding"] = pd.DataFrame(x).values.tolist()

## Model Section

In [117]:
import torch
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
from typing import Optional, Tuple
from sklearn.metrics import f1_score, accuracy_score, classification_report
import torch.nn as nn


class MLP(pl.LightningModule):
  
  def __init__(self, input_dim, hidden_dim, output_dim, lr=1e-4):
    super().__init__()
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.lr = lr
    
    self.layers = nn.Sequential(
      nn.Linear(input_dim, hidden_dim),
      nn.ReLU(),
      # nn.Linear(hidden_dim, hidden_dim),
      # nn.ReLU(),
      nn.Linear(hidden_dim, out_features=output_dim),
      nn.Softmax(dim=1)
    )
    self.loss = nn.CrossEntropyLoss()

  def forward(self, x):
    return self.layers(x)
  
  def training_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten().type(torch.LongTensor)
    y_hat = self(x)
    loss = self.loss(y_hat, y)
    self.log('train_loss', loss, on_epoch=True, on_step=False)
    return {"loss": loss, "logits": y_hat.detach(), "gold": y, "batch_idx": batch_idx}

  def validation_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten().type(torch.LongTensor)
    y_hat = self(x)
    loss = self.loss(y_hat, y)
    y_hat = torch.argmax(self(x), 1)
    f1 = f1_score(y.cpu(), y_hat.cpu(), average='micro')
    acc = accuracy_score(y.cpu(), y_hat.cpu())
    self.log("val_loss", loss, prog_bar=True)
    self.log("val_f1_micro", f1, prog_bar=True)
    self.log("val_acc", acc, prog_bar=True)
    return loss

  def test_step(self, batch, batch_idx):
    x, y = batch
    y = y.flatten().type(torch.LongTensor)
    y_hat = self(x)
    y_hat = torch.argmax(self(x), 1)

    report = classification_report(y, y_hat, output_dict=True)
    self.log_dict(report)
    return report

  def training_epoch_end(self, outputs):
      batch_size = self.trainer.datamodule.get_batch_size()
      sample_map = self.trainer.datamodule.get_sample_map()

      data = {"guid": [], f"logits_epoch_{self.current_epoch}": [], "gold": []}
      for batch in outputs:
        batch_idx = batch["batch_idx"]
        curr_batch_size = len(batch["logits"])
        data["guid"] += [sample_map[batch_size*batch_idx + idx] for idx in range(curr_batch_size)]
        data[f"logits_epoch_{self.current_epoch}"] += batch["logits"].tolist()
        data["gold"] += batch["gold"].tolist()

      df = pd.DataFrame(data)
      df.to_json(f"./training_dynamics/dynamics_epoch_{self.current_epoch}.jsonl", lines=True, orient='records')

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.lr)

  def predict_step(
    self,
    batch,
    batch_idx: int,
    dataloader_idx: Optional[int] = None,
  ) -> Tuple[torch.Tensor, torch.Tensor]:
    x, y = batch
    z = self(x)
    return z, y

In [118]:
# mlp = MLP.load_from_checkpoint(checkpoint_path=f'../../data/mlp/mlp_newdata_3c_pca_213_213_bs=32_lr-4.ckpt', input_dim=213, hidden_dim=213, output_dim=3)
mlp = MLP.load_from_checkpoint(checkpoint_path=f'../../data/mlp/test_no_pca.ckpt', input_dim=768, hidden_dim=768, output_dim=3)

In [119]:
def predict(model, X, le):
    model.eval()
    with torch.no_grad():
        preds = model(X)
    preds = torch.argmax(preds, 1)
    return le.inverse_transform(preds)

In [120]:
def label_predict(x):
    xt = torch.tensor(x).unsqueeze(0)
    return predict(mlp, xt, le)[0]

In [123]:
df_tweets["target"] = df_tweets["embedding"].progress_apply(label_predict)

100%|██████████| 92668/92668 [07:39<00:00, 201.53it/s] 


In [124]:
df_tweets.drop("embedding", inplace=True, axis=1)
# df_tweets.to_csv("../../data/classified_tweets.tsv", sep="\t")
df_tweets.to_csv("../../data/classified_tweets_no_pca.tsv", sep="\t")

In [127]:
df_tweets["target"].value_counts()

1     60302
-1    17417
0     14949
Name: target, dtype: int64