<a href="https://colab.research.google.com/github/klordo/nlp_homeworks/blob/hw4/nlp_hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Библиотеки и установки

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download ru_core_news_sm
!pip install fasttext
!pip install torchmetrics

In [3]:
import pandas as pd
import numpy as np
import spacy
import nltk
import torch
import torch.nn as nn

from collections import Counter
from tqdm import tqdm
from pathlib import Path
from torchmetrics import F1Score
from torch.utils.data import DataLoader, Dataset
from torch.optim import Optimizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [None]:
nltk.download("punkt")

In [None]:
%%bash
wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz .
gunzip cc.ru.300.bin.gz

In [6]:
RANDOM_STATE = 1000 - 7

# Подготовка датасета, train и test данных


In [23]:
df_true = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

In [24]:
df_true.sample(2)

Unnamed: 0,title,text,subject,date
51,"House gives final approval to tax bill, delive...",WASHINGTON (Reuters) - The Republican-controll...,politicsNews,"December 19, 2017"
5303,U.S. education secretary calls Obama transgend...,WASHINGTON (Reuters) - U.S. Education Secretar...,politicsNews,"February 23, 2017"


In [25]:
df_fake.sample(2)

Unnamed: 0,title,text,subject,date
6466,Conservative Christians Sob Into Their Coffee...,Right after Ted Cruz suspended his presidentia...,News,"May 9, 2016"
5450,There’s A New Majority Religious Voting Bloc ...,A new report from the Pew Research Center sho...,News,"July 16, 2016"


In [None]:
df_fake = df_fake[['text']]
df_fake['label'] = 0

df_true = df_true[['text']]
df_true['label'] = 1

In [27]:
df_true['text'] = df_true.text.apply(lambda text: ' '.join(text.split()[3:]))

In [28]:
df_true.sample(2)

Unnamed: 0,text,label
10486,German Economy Minister Sigmar Gabriel has bra...,1
3211,U.S. President Donald Trump has given Defense ...,1


In [None]:
df = pd.concat((df_true, df_fake))

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
df['cleaned_text'] = df['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)
df.sample(5)

Unnamed: 0,text,label,cleaned_text
12094,Caddell s remarks came after hearing Michelle ...,0,caddell s remark come hear michelle obama s in...
18312,Wary of German Chancellor Angela Merkel s ambi...,1,wary german chancellor angela merkel s ambitio...
20140,"In NBC s attempt to discredit Trump, they ve s...",0,nbc s attempt discredit trump ve shine bright ...
17107,European Union leaders will promise during tal...,1,european union leader promise talk brussels su...
19193,A car bomber attacked a Danish convoy belongin...,1,car bomber attack danish convoy belong nato le...


In [None]:
df = df[df['cleaned_text'] != '']

In [None]:
cleaned_text, label =  df['cleaned_text'], df['label']

In [None]:
def get_tokens(text_corpus):
    tokens = []
    for text in tqdm(text_corpus):
        tokens.extend(word_tokenize(text))
    tokens_filtered = [word for word in tokens if word.isalnum()]
    return tokens_filtered

In [None]:
tokens_filtered = get_tokens(cleaned_text.values)

100%|██████████| 44267/44267 [00:45<00:00, 979.42it/s]


In [None]:
max_words = 2000
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words - 1)]

In [None]:
vocabulary = {id: token for token, id in dict(enumerate(tokens_filtered_top, 1)).items()}

In [None]:
def find_max_len(counter, threshold):
    sum_count = sum(counter.values())
    cum_count = 0
    for i in range(max(counter)):
        cum_count += counter[i]
        if cum_count > sum_count * threshold:
            return i
    return max(counter)

text_len_counter = Counter()
for text in cleaned_text:
    text_len_counter[len(text)] += 1

threshold = 0.80
MAX_WORD_LEN = find_max_len(text_len_counter, threshold)

print('Max text length for {:.0%} of words is {}'.format(threshold, MAX_WORD_LEN))

Max text length for 80% of words is 2170


In [None]:
max_len = 2170

In [None]:
def text_to_sequence(text, maxlen, vocabulary):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return result[-maxlen:] + padding

In [None]:
text_tokenized = np.array([text_to_sequence(text, max_len, vocabulary) for text in cleaned_text.values], dtype=np.int32)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
                                      text_tokenized, label,
                                      train_size=0.8,
                                      random_state=RANDOM_STATE,
                                      stratify=label
                                  )

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
                                      X_train, y_train,
                                      train_size=0.8,
                                      random_state=RANDOM_STATE,
                                      stratify=y_train
                                  )

In [None]:
y_train, y_val, y_test = y_train.values, y_val.values, y_test.values

In [None]:
class TextDataWrapper(Dataset):
    def __init__(self, data, target=None, transform=None):
        self.data = torch.from_numpy(data).long()
        if target is not None:
            self.target = torch.from_numpy(target).long()
        else:
          self.target = None
        self.transform = transform

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index] if self.target is not None else -1

        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.data)

# Обучение моделей

In [None]:
def train(model: nn.Module,
          data_loader: DataLoader,
          optimizer: Optimizer,
          loss_fn, f1
          ) -> tuple[float, float, float]:

    model.to(device)
    model.train()
    temp_loss = []
    temp_metrics = []
    acc_metrics = []

    for x, y in tqdm(train_loader, desc='Train'):
        optimizer.zero_grad()

        x, y = x.to(device), y.to(device)
        output = model(x)

        loss = loss_fn(output, y)

        temp_loss.append(loss.item())
        temp_metrics.append(f1(output.argmax(1), y).item())
        acc_metrics.append((output.argmax(1) == y).sum().item()/output.shape[0])

        loss.backward()
        optimizer.step()

    epoch_acc = np.array(acc_metrics).mean()
    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()

    return epoch_acc, epoch_loss, epoch_f1

@torch.inference_mode()
def evaluate(model: nn.Module,
             data_loader: DataLoader,
             loss_fn,
             f1
             ) -> tuple[float, float]:

    model.to(device)
    model.eval()

    temp_loss = []
    temp_metrics = []
    acc_metrics = []

    for x, y in tqdm(data_loader, desc='Evaluation'):
        x, y = x.to(device), y.to(device)
        output = model(x)

        loss = loss_fn(output, y)

        temp_loss.append(loss.float().item())
        temp_metrics.append(f1(output.argmax(1), y).item())
        acc_metrics.append((output.argmax(1) == y).sum().item()/output.shape[0])

    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()
    epoch_acc = np.array(acc_metrics).mean()

    return epoch_acc, epoch_loss, epoch_f1

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
pad_idx = 0

## CNN

In [None]:
class ConvTextClassifier(nn.Module):
    def __init__(self, vocab_size=2000, embedding_dim=128, out_channel=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(out_channel, num_classes)

    def forward(self, x):
        output = self.embedding(x)       # bs, len, emb_dim
        output = output.permute(0, 2, 1) # bs, emb_dim, len
        output = self.conv(output)
        output = self.relu(output)
        output = torch.max(output, axis=2).values
        output = self.linear(output)
        return output

In [None]:
batch_size = 256

train_dataset = TextDataWrapper(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = TextDataWrapper(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TextDataWrapper(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
model_cnn = ConvTextClassifier()
model_cnn = model_cnn.to(device)
print(model_cnn)
print("Parameters:", sum([param.nelement() for param in model_cnn.parameters()]))

ConvTextClassifier(
  (embedding): Embedding(2000, 128)
  (conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Parameters: 305538


In [None]:
f1 = F1Score(task="binary")
f1 = f1.to(device)
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=10e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
epochs = 10

for epoch in range(1,epochs+1):

    print(f"\nTrain epoch {epoch}/{epochs}\n")

    epoch_acc, epoch_loss, epoch_f1 = train(model=model_cnn,
                                            data_loader=train_loader,
                                            optimizer=optimizer,
                                            loss_fn=criterion,
                                            f1=f1)
    print(f'\nTRAIN: Acc: {epoch_acc:}, Loss: {epoch_loss}, f1 score: {epoch_f1:}\n')

    epoch_acc, epoch_loss, epoch_f1 = evaluate(model=model_cnn,
                                               data_loader=val_loader,
                                               loss_fn=criterion,
                                               f1=f1)
    print(f'\nTRAIN: Acc: {epoch_acc:}, Loss: {epoch_loss}, f1 score: {epoch_f1}\n')


Train epoch 1/10



Train: 100%|██████████| 111/111 [00:14<00:00,  7.45it/s]



TRAIN: Acc: 0.4840744236883943, Loss: 0.014630547951926186, f1 score: 0.6456700434555879



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 38.78it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 2/10



Train: 100%|██████████| 111/111 [00:07<00:00, 14.24it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 38.56it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 3/10



Train: 100%|██████████| 111/111 [00:07<00:00, 13.90it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 37.79it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 4/10



Train: 100%|██████████| 111/111 [00:07<00:00, 13.91it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 35.47it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 5/10



Train: 100%|██████████| 111/111 [00:08<00:00, 13.65it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 37.30it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 6/10



Train: 100%|██████████| 111/111 [00:08<00:00, 13.46it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 35.02it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 7/10



Train: 100%|██████████| 111/111 [00:08<00:00, 13.38it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 36.54it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 8/10



Train: 100%|██████████| 111/111 [00:08<00:00, 13.08it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 36.08it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 9/10



Train: 100%|██████████| 111/111 [00:08<00:00, 12.98it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 34.06it/s]



TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981


Train epoch 10/10



Train: 100%|██████████| 111/111 [00:08<00:00, 12.77it/s]



TRAIN: Acc: 0.48372250927397986, Loss: 0.0, f1 score: 0.6515175796843864



Evaluation: 100%|██████████| 28/28 [00:00<00:00, 35.27it/s]


TRAIN: Acc: 0.48378351477652465, Loss: 0.0, f1 score: 0.6516479998826981






## LSTM

In [None]:
class RNNLanguageModel(nn.Module):
    def __init__(self, n_tokens=2000, emb_size=128, hid_size=256, num_classes=2):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=n_tokens, embedding_dim=emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hid_size, batch_first=True)
        self.linear = nn.Linear(hid_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, (hn, cn) = self.lstm(x)
        x = self.linear(x)
        x = x[:, -1, :]
        return x

In [None]:
batch_size = 256

train_dataset = TextDataWrapper(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = TextDataWrapper(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TextDataWrapper(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
model_lstm = RNNLanguageModel()
model_lstm = model_lstm.to(device)
print(model_lstm)
print("Parameters:", sum([param.nelement() for param in model_lstm.parameters()]))

RNNLanguageModel(
  (embedding): Embedding(2000, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)
Parameters: 651778


In [None]:
f1 = F1Score(task="binary")
f1 = f1.to(device)
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=10e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
epochs = 10

for epoch in range(1,epochs+1):

    print(f"\nTrain epoch {epoch}/{epochs}\n")

    epoch_acc, epoch_loss, epoch_f1 = train(model=model_lstm,
                                            data_loader=train_loader,
                                            optimizer=optimizer,
                                            loss_fn=criterion,
                                            f1=f1)
    print(f'\nTRAIN: Acc: {epoch_acc:.2}, Loss: {epoch_loss}, f1 score: {epoch_f1:.3}\n')

    epoch_acc, epoch_loss, epoch_f1 = evaluate(model=model_lstm,
                                               data_loader=val_loader,
                                               loss_fn=criterion,
                                               f1=f1)
    print(f'\nTRAIN: Acc: {epoch_acc:.2}, Loss: {epoch_loss}, f1 score: {epoch_f1:.3}\n')


Train epoch 1/10



Train: 100%|██████████| 111/111 [00:53<00:00,  2.07it/s]



TRAIN: Acc: 0.48, Loss: 0.006188033071348779, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.25it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 2/10



Train: 100%|██████████| 111/111 [00:51<00:00,  2.15it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.22it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 3/10



Train: 100%|██████████| 111/111 [00:51<00:00,  2.13it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.21it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 4/10



Train: 100%|██████████| 111/111 [00:51<00:00,  2.14it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.21it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 5/10



Train: 100%|██████████| 111/111 [00:52<00:00,  2.13it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.23it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 6/10



Train: 100%|██████████| 111/111 [00:52<00:00,  2.11it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.21it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 7/10



Train: 100%|██████████| 111/111 [00:51<00:00,  2.14it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.22it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 8/10



Train: 100%|██████████| 111/111 [00:52<00:00,  2.13it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.24it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 9/10



Train: 100%|██████████| 111/111 [00:51<00:00,  2.14it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.16it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


Train epoch 10/10



Train: 100%|██████████| 111/111 [00:52<00:00,  2.12it/s]



TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652



Evaluation: 100%|██████████| 28/28 [00:04<00:00,  6.21it/s]


TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652






### Сравнение

CNN

In [11]:
cnn_acc, cnn_loss, cnn_f1 = evaluate(model=model_cnn,
                                     data_loader=test_loader,
                                     loss_fn=criterion,
                                     f1=f1)
print(f'\nTRAIN: Acc: {cnn_acc}, Loss: {cnn_loss}, f1 score: {cnn_f1:.3}\n')

Evaluation: 100%|██████████| 35/35 [00:01<00:00, 34.48it/s]
TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652


LSTM

In [None]:
lstm_acc, lstm_loss, lstm_f1 = evaluate(model=model_lstm,
                                        data_loader=test_loader,
                                        loss_fn=criterion,
                                        f1=f1)
print(f'\nTRAIN: Acc: {lstm_acc:.2}, Loss: {lstm_loss}, f1 score: {lstm_f1:.3}\n')

Evaluation: 100%|██████████| 35/35 [00:05<00:00,  6.24it/s]


TRAIN: Acc: 0.48, Loss: 0.0, f1 score: 0.652






Вывод:

Точность вышла очень похожая, но lstm работает медленнее