<a href="https://colab.research.google.com/github/klordo/nlp_homeworks/blob/hw4/nlp_hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Библиотеки и установки

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download ru_core_news_sm
!pip install fasttext
!pip install torchmetrics

In [2]:
import pandas as pd
import numpy as np
import spacy
import nltk
import torch
import torch.nn as nn

from collections import Counter
from tqdm import tqdm
from pathlib import Path
from torchmetrics import F1Score
from torch.utils.data import DataLoader, Dataset
from torch.optim import Optimizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [None]:
nltk.download("punkt")

In [4]:
RANDOM_STATE = 1000 - 7

# Подготовка датасета, train и test данных


In [5]:
df_true = pd.read_csv('True.csv')
df_fake = pd.read_csv('Fake.csv')

In [6]:
df_true.shape[0]

21417

In [7]:
df_fake.shape[0]

23481

In [8]:
df_true = df_true.sample(500, random_state=RANDOM_STATE)
df_fake = df_fake.sample(500, random_state=RANDOM_STATE)

In [9]:
df_true.sample(2)

Unnamed: 0,title,text,subject,date
15271,Hilltop tribe's bitterness a challenge for Lib...,"BANI WALID, Libya (Reuters) - Elders of a powe...",worldnews,"November 9, 2017"
18655,German police arrest suspect over alleged supe...,BERLIN (Reuters) - German police said on Satur...,worldnews,"September 30, 2017"


In [10]:
df_fake.sample(2)

Unnamed: 0,title,text,subject,date
18698,WIFE OF BRITISH MEDIC Calls Radio Show…Says Hu...,A caller to British radio host Katie Hopkins w...,left-news,"May 3, 2017"
19065,PRESIDENT TRUMP: Nancy Pelosi Is Helping To El...,"In a sit-down interview with Fox and Friends, ...",left-news,"Feb 28, 2017"


In [11]:
df_fake = df_fake[['text']]
df_fake['label'] = 0

df_true = df_true[['text']]
df_true['label'] = 1

In [12]:
df_true['text'] = df_true.text.apply(lambda text: ' '.join(text.split()[3:]))

In [13]:
df_true.sample(2)

Unnamed: 0,text,label
11655,The U.N. independent investigator into human r...,1
9949,Legislation to help Puerto Rico climb out of a...,1


In [14]:
df = pd.concat((df_true, df_fake))

In [None]:
nlp = spacy.load("en_core_web_sm")

In [16]:
%%time
df['cleaned_text'] = df['text'].apply(
    lambda x: ' '.join(
        token.lemma_.lower() for token in nlp(x) if
        not token.is_stop
        and not token.is_punct
        and not token.is_digit
        and not token.like_email
        and not token.like_num
        and not token.is_space
    )
)
df.sample(5)

CPU times: user 1min 11s, sys: 258 ms, total: 1min 11s
Wall time: 1min 12s


Unnamed: 0,text,label,cleaned_text
9000,Republicans cheered after the U.S. Supreme Cou...,1,republicans cheer u.s. supreme court thursday ...
11355,An Egyptian court sentenced a British woman to...,1,egyptian court sentence british woman year pri...
14994,Thank goodness these Muslim women have a frien...,0,thank goodness muslim woman friend socialist r...
11454,Peru s former President Alberto Fujimori was r...,1,peru s president alberto fujimori rush prison ...
14428,"There never seems to be a shortage of smug, kn...",0,shortage smug know self righteous effeminate m...


In [17]:
df = df[df['cleaned_text'] != '']

In [18]:
cleaned_text, label =  df['cleaned_text'], df['label']

In [19]:
def get_tokens(text_corpus):
    tokens = []
    for text in tqdm(text_corpus):
        tokens.extend(word_tokenize(text))
    tokens_filtered = [word for word in tokens if word.isalnum()]
    return tokens_filtered

In [None]:
tokens_filtered = get_tokens(cleaned_text.values)

In [21]:
max_words = 2000
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words - 1)]

In [22]:
vocabulary = {id: token for token, id in dict(enumerate(tokens_filtered_top, 1)).items()}

In [23]:
def find_max_len(counter, threshold):
    sum_count = sum(counter.values())
    cum_count = 0
    for i in range(max(counter)):
        cum_count += counter[i]
        if cum_count > sum_count * threshold:
            return i
    return max(counter)

text_len_counter = Counter()
for text in cleaned_text:
    text_len_counter[len(text)] += 1

threshold = 0.80
MAX_WORD_LEN = find_max_len(text_len_counter, threshold)

print('Max text length for {:.0%} of words is {}'.format(threshold, MAX_WORD_LEN))

Max text length for 80% of words is 2190


In [24]:
max_len = 2170

In [25]:
def text_to_sequence(text, maxlen, vocabulary):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return result[-maxlen:] + padding

In [26]:
text_tokenized = np.array([text_to_sequence(text, max_len, vocabulary) for text in cleaned_text.values], dtype=np.int32)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(
                                      text_tokenized, label,
                                      train_size=0.8,
                                      random_state=RANDOM_STATE,
                                      stratify=label
                                  )

In [28]:
X_train, X_val, y_train, y_val = train_test_split(
                                      X_train, y_train,
                                      train_size=0.8,
                                      random_state=RANDOM_STATE,
                                      stratify=y_train
                                  )

In [29]:
y_train, y_val, y_test = y_train.values, y_val.values, y_test.values

In [30]:
class TextDataWrapper(Dataset):
    def __init__(self, data, target=None, transform=None):
        self.data = torch.from_numpy(data).long()
        if target is not None:
            self.target = torch.from_numpy(target).long()
        else:
          self.target = None
        self.transform = transform

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index] if self.target is not None else -1

        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return len(self.data)

# Обучение моделей

In [31]:
def train(model: nn.Module,
          data_loader: DataLoader,
          optimizer: Optimizer,
          loss_fn, f1
          ) -> tuple[float, float, float]:

    model.to(device)
    model.train()
    temp_loss = []
    temp_metrics = []
    acc_metrics = []

    for x, y in tqdm(train_loader, desc='Train'):
        optimizer.zero_grad()

        x, y = x.to(device), y.to(device)
        output = model(x)

        loss = loss_fn(output, y)

        temp_loss.append(loss.item())
        temp_metrics.append(f1(output.argmax(1), y).item())
        acc_metrics.append((output.argmax(1) == y).sum().item()/output.shape[0])

        loss.backward()
        optimizer.step()

    epoch_acc = np.array(acc_metrics).mean()
    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()

    return epoch_acc, epoch_loss, epoch_f1

@torch.inference_mode()
def evaluate(model: nn.Module,
             data_loader: DataLoader,
             loss_fn,
             f1
             ) -> tuple[float, float]:

    model.to(device)
    model.eval()

    temp_loss = []
    temp_metrics = []
    acc_metrics = []

    for x, y in tqdm(data_loader, desc='Evaluation'):
        x, y = x.to(device), y.to(device)
        output = model(x)

        loss = loss_fn(output, y)

        temp_loss.append(loss.float().item())
        temp_metrics.append(f1(output.argmax(1), y).item())
        acc_metrics.append((output.argmax(1) == y).sum().item()/output.shape[0])

    epoch_loss = np.array(temp_loss).mean()
    epoch_f1 = np.array(temp_metrics).mean()
    epoch_acc = np.array(acc_metrics).mean()

    return epoch_acc, epoch_loss, epoch_f1

In [32]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## CNN

In [33]:
class ConvTextClassifier(nn.Module):
    def __init__(self, vocab_size=2000, embedding_dim=128, out_channel=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, out_channel, kernel_size=3)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(out_channel, num_classes)

    def forward(self, x):
        output = self.embedding(x)       # bs, len, emb_dim
        output = output.permute(0, 2, 1) # bs, emb_dim, len
        output = self.conv(output)
        output = self.relu(output)
        output = torch.max(output, axis=2).values
        output = self.linear(output)
        return output

In [34]:
batch_size = 256

train_dataset = TextDataWrapper(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = TextDataWrapper(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TextDataWrapper(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [35]:
model_cnn = ConvTextClassifier()
model_cnn = model_cnn.to(device)
print(model_cnn)
print("Parameters:", sum([param.nelement() for param in model_cnn.parameters()]))

ConvTextClassifier(
  (embedding): Embedding(2000, 128)
  (conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,))
  (relu): ReLU()
  (linear): Linear(in_features=128, out_features=2, bias=True)
)
Parameters: 305538


In [36]:
f1 = F1Score(task="binary")
f1 = f1.to(device)
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

In [37]:
epochs = 10

for epoch in range(1,epochs+1):

    print(f"\nTrain epoch {epoch}/{epochs}\n")

    epoch_acc, epoch_loss, epoch_f1 = train(model=model_cnn,
                                            data_loader=train_loader,
                                            optimizer=optimizer,
                                            loss_fn=criterion,
                                            f1=f1)
    print(f'\nAcc: {epoch_acc:.5}, Loss: {epoch_loss:.5}, f1 score: {epoch_f1:.5}\n')

    epoch_acc, epoch_loss, epoch_f1 = evaluate(model=model_cnn,
                                               data_loader=val_loader,
                                               loss_fn=criterion,
                                               f1=f1)
    print(f'\nAcc: {epoch_acc:.5}, Loss: {epoch_loss:.5}, f1 score: {epoch_f1:.5}\n')


Train epoch 1/10



Train: 100%|██████████| 3/3 [00:07<00:00,  2.66s/it]



Acc: 0.47717, Loss: 1.6233, f1 score: 0.21693



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 43.71it/s]



Acc: 0.51572, Loss: 1.4698, f1 score: 0.67511


Train epoch 2/10



Train: 100%|██████████| 3/3 [00:00<00:00, 15.82it/s]



Acc: 0.5428, Loss: 1.1514, f1 score: 0.69123



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 53.10it/s]



Acc: 0.79874, Loss: 0.44774, f1 score: 0.76119


Train epoch 3/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.63it/s]



Acc: 0.71988, Loss: 0.50662, f1 score: 0.56736



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 50.34it/s]



Acc: 0.81132, Loss: 0.43667, f1 score: 0.77612


Train epoch 4/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.84it/s]



Acc: 0.90443, Loss: 0.24786, f1 score: 0.91014



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 51.94it/s]



Acc: 0.71698, Loss: 0.5999, f1 score: 0.78049


Train epoch 5/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.69it/s]



Acc: 0.88785, Loss: 0.2449, f1 score: 0.9033



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 54.41it/s]



Acc: 0.89937, Loss: 0.26853, f1 score: 0.89744


Train epoch 6/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.49it/s]



Acc: 0.94627, Loss: 0.13349, f1 score: 0.94326



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 50.46it/s]



Acc: 0.87421, Loss: 0.35106, f1 score: 0.86111


Train epoch 7/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.36it/s]



Acc: 0.97135, Loss: 0.088278, f1 score: 0.96955



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 53.18it/s]



Acc: 0.91195, Loss: 0.21204, f1 score: 0.91667


Train epoch 8/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.57it/s]



Acc: 0.99349, Loss: 0.050999, f1 score: 0.99359



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 57.84it/s]



Acc: 0.86164, Loss: 0.28201, f1 score: 0.87778


Train epoch 9/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.75it/s]



Acc: 0.99219, Loss: 0.044949, f1 score: 0.99206



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 56.94it/s]



Acc: 0.95597, Loss: 0.15809, f1 score: 0.95706


Train epoch 10/10



Train: 100%|██████████| 3/3 [00:00<00:00, 17.97it/s]



Acc: 1.0, Loss: 0.012101, f1 score: 1.0



Evaluation: 100%|██████████| 1/1 [00:00<00:00, 56.94it/s]


Acc: 0.9434, Loss: 0.17351, f1 score: 0.94194






## LSTM

In [38]:
class RNNLanguageModel(nn.Module):
    def __init__(self, n_tokens=2000, emb_size=128, hid_size=256, num_classes=2):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=n_tokens, embedding_dim=emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=hid_size, batch_first=True)
        self.linear = nn.Linear(hid_size, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x, (hn, cn) = self.lstm(x)
        x = x.mean(dim=1)
        x = self.linear(x)
        return x

In [39]:
batch_size = 256

train_dataset = TextDataWrapper(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

val_dataset = TextDataWrapper(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

test_dataset = TextDataWrapper(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [40]:
model_lstm = RNNLanguageModel()
model_lstm = model_lstm.to(device)
print(model_lstm)
print("Parameters:", sum([param.nelement() for param in model_lstm.parameters()]))

RNNLanguageModel(
  (embedding): Embedding(2000, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (linear): Linear(in_features=256, out_features=2, bias=True)
)
Parameters: 651778


In [41]:
f1 = F1Score(task="binary")
f1 = f1.to(device)
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

In [42]:
epochs = 10

for epoch in range(1,epochs+1):

    print(f"\nTrain epoch {epoch}/{epochs}\n")

    epoch_acc, epoch_loss, epoch_f1 = train(model=model_lstm,
                                            data_loader=train_loader,
                                            optimizer=optimizer,
                                            loss_fn=criterion,
                                            f1=f1)
    print(f'\nAcc: {epoch_acc:.5}, Loss: {epoch_loss:.5}, f1 score: {epoch_f1:.5}\n')

    epoch_acc, epoch_loss, epoch_f1 = evaluate(model=model_lstm,
                                               data_loader=val_loader,
                                               loss_fn=criterion,
                                               f1=f1)
    print(f'\nAcc: {epoch_acc:.5}, Loss: {epoch_loss:.5}, f1 score: {epoch_f1:.5}\n')


Train epoch 1/10



Train: 100%|██████████| 3/3 [00:02<00:00,  1.44it/s]



Acc: 0.47587, Loss: 1.7611, f1 score: 0.21636



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.50it/s]



Acc: 0.50314, Loss: 0.7252, f1 score: 0.66946


Train epoch 2/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.61it/s]



Acc: 0.51111, Loss: 0.755, f1 score: 0.67618



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.54it/s]



Acc: 0.50314, Loss: 0.73089, f1 score: 0.66946


Train epoch 3/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.62it/s]



Acc: 0.46233, Loss: 0.71371, f1 score: 0.59452



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  9.21it/s]



Acc: 0.49686, Loss: 0.69218, f1 score: 0.45946


Train epoch 4/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.59it/s]



Acc: 0.50156, Loss: 0.69546, f1 score: 0.27258



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.94it/s]



Acc: 0.49686, Loss: 0.69282, f1 score: 0.0


Train epoch 5/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.53it/s]



Acc: 0.48889, Loss: 0.69368, f1 score: 0.0



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  7.86it/s]



Acc: 0.49686, Loss: 0.69043, f1 score: 0.0


Train epoch 6/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.60it/s]



Acc: 0.48889, Loss: 0.68624, f1 score: 0.0



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.94it/s]



Acc: 0.55975, Loss: 0.674, f1 score: 0.31373


Train epoch 7/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.60it/s]



Acc: 0.67951, Loss: 0.66127, f1 score: 0.52961



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.68it/s]



Acc: 0.72327, Loss: 0.6538, f1 score: 0.78


Train epoch 8/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.59it/s]



Acc: 0.84097, Loss: 0.60278, f1 score: 0.85823



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.94it/s]



Acc: 0.7044, Loss: 0.60936, f1 score: 0.70807


Train epoch 9/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.57it/s]



Acc: 0.80365, Loss: 0.56153, f1 score: 0.81695



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.92it/s]



Acc: 0.65409, Loss: 0.65537, f1 score: 0.66667


Train epoch 10/10



Train: 100%|██████████| 3/3 [00:01<00:00,  2.58it/s]



Acc: 0.79922, Loss: 0.52923, f1 score: 0.80862



Evaluation: 100%|██████████| 1/1 [00:00<00:00,  8.94it/s]


Acc: 0.67925, Loss: 0.65089, f1 score: 0.64828






### Сравнение

CNN

In [45]:
cnn_acc, cnn_loss, cnn_f1 = evaluate(model=model_cnn,
                                     data_loader=test_loader,
                                     loss_fn=criterion,
                                     f1=f1)
print(f'\nAcc: {cnn_acc:.2}, Loss: {cnn_loss:.2}, f1 score: {cnn_f1:.2}\n')

Evaluation: 100%|██████████| 1/1 [00:00<00:00, 26.92it/s]


Acc: 0.97, Loss: 0.12, f1 score: 0.97






LSTM

In [46]:
lstm_acc, lstm_loss, lstm_f1 = evaluate(model=model_lstm,
                                        data_loader=test_loader,
                                        loss_fn=criterion,
                                        f1=f1)
print(f'\nAcc: {lstm_acc:.2}, Loss: {lstm_loss:.2}, f1 score: {lstm_f1:.2}\n')

Evaluation: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]


Acc: 0.75, Loss: 0.62, f1 score: 0.74






Модель CNN выдает заметно лучший результат по всем показателям.

Такой результат возможен из-за паддингов. Их может быть слишком много и в усреднении в LSTM их не стоит учитывать.