In [1]:
! pip install ipdb

In [2]:
! pip install torchmetrics

In [3]:
!wget https://www.dropbox.com/s/tn6x5f4ybaj34zf/Fake.csv?dl=0 -O data.csv

In [4]:
import pandas as pd
import numpy as np
from string import punctuation
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import ipdb

import re
import torchmetrics
from torchmetrics import F1
from torchmetrics.functional import f1, recall
import matplotlib.pyplot as plt

In [5]:
# Read the downloaded CSV file into a DataFrame.
df = pd.read_csv('data.csv')

In [6]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [7]:
def preprocess(text):
    """Replace every non-letter character with a space and lower-case the result.

    Args:
        text: input string.

    Returns:
        A string of the same length containing only ASCII lower-case letters
        and spaces.
    """
    # The character class must be A-Za-z: the range A-z also spans the ASCII
    # symbols between 'Z' and 'a' ([ \ ] ^ _ `), which would survive cleaning.
    res = re.sub(r'[^A-Za-z]', ' ', text)
    return res.lower()

In [8]:
# Clean every article body with preprocess() and store it back in the 'text' column.
df['text'] = df['text'].apply(preprocess)

In [9]:
# Show how many rows carry each value of the 'subject' column.
df["subject"].value_counts()

In [10]:
# Nested mapping in the form DataFrame.replace accepts: column name -> {old value: new value}.
# Each subject label is assigned an integer code from 0 to 5.
coded = {
    "subject": dict(zip(
        ["News", "politics", "left-news", "Government News", "US_News", "Middle-east"],
        range(6),
    ))
}

In [11]:
# Swap the subject labels for their integer codes (only the 'subject' column is listed in `coded`).
df = df.replace(to_replace=coded)

In [12]:
# Preview the first three rows again after the replacement step.
df.head(3)

In [13]:
# Hold out 20% of the rows for validation. A fixed random_state makes the split
# identical after every kernel restart, so reported metrics can be reproduced.
train_sentences, val_sentences = train_test_split(df, test_size=0.2, random_state=42)

In [14]:
# Count how often each whitespace-separated token occurs across all texts.
vocab = Counter(token for text in df['text'] for token in text.split())
# print('total unique tokens:', len(vocab))

# Keep only the tokens seen more than 5 times.
filtered_vocab = {word for word, count in vocab.items() if count > 5}
# print('unique tokens seen more than 5 times:', len(filtered_vocab))

In [15]:
# Token -> integer id; id 0 is reserved for the 'PAD' entry.
word2id = {'PAD': 0}

# Iterate in sorted order: the iteration order of a set of strings depends on
# per-process hash randomisation, so without sorting the ids (and therefore any
# saved weights or cached tensors) would differ between kernel restarts.
for word in sorted(filtered_vocab):
    word2id[word] = len(word2id)

# Reverse mapping, used to decode an id sequence back into tokens.
id2word = {i: word for word, i in word2id.items()}

In [16]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

In [17]:
# Inherit from the fully qualified base class: `class Dataset(Dataset)` rebinds
# the imported name, so re-running the cell would make the class inherit from
# its own previous definition.
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns texts into id sequences with integer labels.

    Args:
        dataset: DataFrame holding the text and label columns.
        col: name of the text column.
        target_col: name of the column with integer class labels.
        word2id: token -> id mapping; must contain the key 'PAD'.
        max_len: sequences are truncated to this many tokens.
        DEVICE: device the collated batch tensors are moved to.
    """

    def __init__(self, dataset, col, target_col, word2id, max_len, DEVICE):
        self.dataset = dataset[col].values
        self.word2id = word2id
        self.length = dataset.shape[0]
        # Store labels as int64 up front instead of as floats that are cast
        # back to integers for every batch.
        self.target = torch.as_tensor(dataset[target_col].values.astype('int64'))
        self.max_len = max_len
        self.device = DEVICE

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.length

    def __getitem__(self, index):
        """Return (ids, [label]) for one row.

        Tokens missing from word2id are mapped to the 'PAD' id.
        """
        pad_id = self.word2id['PAD']
        tokens = self.dataset[index].split()[:self.max_len]
        ids = torch.LongTensor([self.word2id.get(token, pad_id) for token in tokens])
        y = [self.target[index]]
        return ids, y

    def collate_fn(self, batch):
        """Pad a list of (ids, [label]) pairs into batch tensors on self.device.

        Returns:
            padded_ids: LongTensor (batch, longest sequence in the batch),
                right-padded with zeros.
            labels: LongTensor (batch,).
        """
        ids, y = list(zip(*batch))
        # Pad only up to the longest sequence of this batch. The former F.pad
        # call to max_len was dead code: its result was overwritten right away.
        padded_ids = pad_sequence(ids, batch_first=True).to(self.device)
        labels = torch.stack([label[0] for label in y]).long().to(self.device)
        return padded_ids, labels

In [18]:
# Training split: texts truncated to 400 tokens, batches of 1024 drawn in random order.
train_dataset = Dataset(
    train_sentences,
    col='text',
    target_col='subject',
    word2id=word2id,
    max_len=400,
    DEVICE=DEVICE,
)
train_sampler = RandomSampler(train_dataset)
train_iterator = DataLoader(
    train_dataset,
    batch_size=1024,
    sampler=train_sampler,
    collate_fn=train_dataset.collate_fn,
)

In [19]:
# Validation split: same truncation and batch size, rows visited in their stored order.
val_dataset = Dataset(
    val_sentences,
    col='text',
    target_col='subject',
    word2id=word2id,
    max_len=400,
    DEVICE=DEVICE,
)
val_sampler = SequentialSampler(val_dataset)
val_iterator = DataLoader(
    val_dataset,
    batch_size=1024,
    sampler=val_sampler,
    collate_fn=val_dataset.collate_fn,
)

In [20]:
! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [21]:
import gensim

# Load the downloaded word2vec vectors (binary format, gzip-compressed file).
w2v = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [22]:
# Embedding matrix with one 300-dimensional row per entry of word2id.
weights = np.zeros((len(word2id), 300))

for word, i in word2id.items():
    # Copy the word2vec vector when the word is known; otherwise draw a
    # uniform random vector from [-0.25, 0.25).
    weights[i] = w2v[word] if word in w2v else np.random.uniform(-0.25, 0.25, 300)

# Convert to a float32 torch tensor.
weights = torch.FloatTensor(weights)

Архитектура модели перенесена на PyTorch из реализации C-LSTM на TensorFlow: https://github.com/KifayatMsd/C-LSTM-text-classification/blob/master/clstm_classifier.py

In [23]:
class clstm_clf(nn.Module):
  """C-LSTM text classifier: embedding -> 1-D convolutions -> LSTM -> linear layer.

  Args:
    max_length: maximum sequence length produced by the data pipeline; longer
      convolution outputs are cut to the length this implies.
    vocab_size: number of rows in the embedding table.
    filter_list: non-empty list of convolution kernel sizes; one Conv1d with
      150 output channels is created per entry.
    drop_first: truthy -> dropout is applied to the embeddings; falsy ->
      dropout is applied to the final LSTM hidden state instead.
    pretrained_weights: optional (vocab_size, 300) tensor/array of initial
      embeddings. When omitted, a module-level variable named `weights` is
      used if one exists; otherwise the embedding is randomly initialised.

  Raises:
    ValueError: if filter_list is empty or the pretrained matrix does not have
      shape (vocab_size, 300).

  forward() returns unnormalised class scores of shape (batch, 6); the softmax
  kept in `self.out` is not applied there.
  """

  def __init__(self, max_length, vocab_size, filter_list, drop_first, pretrained_weights=None):
    super().__init__()
    if not filter_list:
      raise ValueError('filter_list must contain at least one kernel size')
    self.vocab_size = vocab_size
    self.filter_list = list(filter_list)
    self.drop_first = drop_first
    self.max_length = max_length

    if pretrained_weights is None:
      pretrained_weights = globals().get('weights')
    if pretrained_weights is None:
      self.embedding = nn.Embedding(vocab_size, 300)
    else:
      # from_pretrained is a classmethod that RETURNS a new layer; calling it on
      # an existing instance and discarding the result loads nothing.
      # clone() keeps training from modifying the caller's matrix in place, and
      # freeze=False keeps the embedding trainable as it was before.
      table = torch.as_tensor(pretrained_weights, dtype=torch.float).clone()
      if tuple(table.shape) != (vocab_size, 300):
        raise ValueError(
          f'pretrained weights have shape {tuple(table.shape)}, expected {(vocab_size, 300)}')
      self.embedding = nn.Embedding.from_pretrained(table, freeze=False)

    # One convolution per requested kernel size, so filter_list really selects
    # the filters that are used.
    self.convs = nn.ModuleList(
      [nn.Conv1d(in_channels=300, out_channels=150, kernel_size=k) for k in self.filter_list]
    )

    # Feature maps of all filters are concatenated along the channel axis.
    self.lstm = nn.LSTM(input_size=150 * len(self.filter_list), hidden_size=150,
                        num_layers=1, batch_first=True)
    self.dropout = nn.Dropout(p=0.5)
    self.relu = nn.ReLU()
    self.out = nn.Softmax(dim=1)  # kept for callers that want probabilities
    self.linear = nn.Linear(150, 6)

  def forward(self, text):
    """Compute class scores for a (batch, seq_len) LongTensor of token ids."""
    embedded = self.embedding(text)
    if self.drop_first:
      embedded = self.dropout(embedded)
    embedded = embedded.transpose(1, 2)  # (batch, 300, seq_len) as Conv1d expects

    widest = max(self.filter_list)
    if embedded.size(-1) < widest:
      # Right-pad very short batches so every convolution has a valid window.
      embedded = F.pad(embedded, (0, widest - embedded.size(-1)))
    # Every feature map is cut to the length produced by the widest kernel so
    # the maps can be stacked along the channel axis.
    aligned_len = max(1, min(embedded.size(-1), self.max_length) - widest + 1)

    outputs = [self.relu(conv(embedded))[:, :, :aligned_len] for conv in self.convs]
    rnn_inputs = torch.cat(outputs, dim=1)

    _, (hidden_state, _) = self.lstm(rnn_inputs.transpose(1, 2))
    features = torch.squeeze(hidden_state, 0)
    if not self.drop_first:
      features = self.dropout(features)

    # Return raw scores: nn.CrossEntropyLoss applies log-softmax itself, so a
    # softmax here would be applied twice and flatten the gradients.
    return self.linear(features)

In [24]:
def train(model, iterator, optimizer, criterion):
    """Train the model for one pass over `iterator`.

    Args:
        model: module called as model(texts).
        iterator: sized iterable of (texts, ys) batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(preds, ys).

    Returns:
        Mean batch loss over the pass (float).
    """
    epoch_loss = 0  # running sum of the batch losses

    # Report about five times per pass. Clamp to 1 so that loaders with fewer
    # than five batches do not trigger a modulo-by-zero.
    log_every = max(1, len(iterator) // 5)

    model.train()  # switch the model to training mode

    for i, (texts, ys) in enumerate(iterator):  # iterate over batches
        optimizer.zero_grad()  # reset gradients
        preds = model(texts)  # forward pass
        loss = criterion(preds, ys)  # loss value
        loss.backward()  # compute gradients
        optimizer.step()  # update weights
        epoch_loss += loss.item()  # accumulate the loss
        if (i + 1) % log_every == 0:
            # i is zero-based, so i + 1 batches have been processed so far.
            print(f'Train loss: {epoch_loss/(i + 1)}')
    return epoch_loss / len(iterator)  # mean loss over all batches

In [25]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on `iterator` without computing gradients.

    Args:
        model: module called as model(texts).
        iterator: sized iterable of (texts, ys) batches.
        criterion: loss called as criterion(preds, ys).

    Returns:
        (mean per-batch f1, mean batch loss). The f1 value is the average of
        the per-batch scores, not a score computed over all samples at once.
    """
    epoch_loss = 0
    epoch_metric = 0

    # Report about five times per pass. Clamp to 1 so that loaders with fewer
    # than five batches do not trigger a modulo-by-zero.
    log_every = max(1, len(iterator) // 5)

    model.eval()
    with torch.no_grad():
        for i, (texts, ys) in enumerate(iterator):
            preds = model(texts)  # predictions for this batch
            loss = criterion(preds, ys)  # loss value, tracked for statistics
            epoch_loss += loss.item()
            # NOTE(review): ignore_index=0 is forwarded to f1, and 0 is also the
            # code given to the 'News' subject in this notebook - confirm that
            # leaving that class out of the metric is intended.
            batch_metric = f1(preds.argmax(1).long(), ys.long(), ignore_index=0)
            epoch_metric += batch_metric
            if (i + 1) % log_every == 0:
                # i is zero-based, so i + 1 batches have been processed so far.
                print(f'Val loss: {epoch_loss/(i + 1)}, Val f1: {epoch_metric/(i + 1)}')

    return epoch_metric / len(iterator), epoch_loss / len(iterator)  # means over all batches

In [26]:
def learning(n_epochs, model, optimizer, criterion):
    """Train for `n_epochs` epochs, evaluating on both loaders after each one.

    Uses the module-level `train_iterator` and `val_iterator`, and appends the
    per-epoch results to the module-level lists `losses`, `f1s`, `losses_eval`
    and `f1s_eval` (f1 values are moved to the CPU before being stored).
    """
    for i in range(n_epochs):
        print(f'\nstarting Epoch {i}')

        print('Training...')
        losses.append(train(model, train_iterator, optimizer, criterion))

        print('\nEvaluating on train...')
        train_f1 = evaluate(model, train_iterator, criterion)[0]
        f1s.append(train_f1.cpu())

        print('\nEvaluating on test...')
        val_f1, val_loss = evaluate(model, val_iterator, criterion)
        losses_eval.append(val_loss)
        f1s_eval.append(val_f1.cpu())

In [27]:
# Baseline configuration. The model has to live on the same device as the
# batches, which the collate function already moves to DEVICE.
model = clstm_clf(max_length=400, vocab_size=len(word2id), filter_list=[2], drop_first=0).to(DEVICE)
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0001)
criterion = nn.CrossEntropyLoss()
# Quick check of the criterion on a hand-written (1, 3, 1) input with target class 1.
criterion(torch.tensor([0.45, 0.22, 0.32]).unsqueeze(1).unsqueeze(0), torch.tensor([1]).unsqueeze(0).long())


In [28]:
# Per-epoch history that learning() appends to.
losses = []
losses_eval = []
f1s = []
f1s_eval = []

# Reuse learning() instead of a hand-copied loop: it runs the same
# train/evaluate steps and moves the stored f1 values to the CPU, so the
# histories can be plotted even when training ran on a GPU.
learning(5, model, optimizer, criterion)

## Подбор гиперпараметров

In [29]:
# Grid over convolution kernel sizes and the drop_first flag.
filters = [[2], [3]]
drop_first = [0, 1]
best_f1 = 0
best_params = None  # configuration that produced best_f1

for filter_list in filters:  # not named `filter`, which would shadow the builtin
    for d in drop_first:
        print(f'filter= {filter_list} drop_first= {d}')
        losses_2 = []
        losses_eval_2 = []
        f1s_2 = []
        f1s_eval_2 = []

        # Build the model and optimizer once per configuration. Creating them
        # inside the epoch loop would restart training from scratch each epoch.
        model = clstm_clf(max_length=400, vocab_size=len(word2id),
                          filter_list=filter_list, drop_first=d).to(DEVICE)
        optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001, weight_decay=0.0001)
        criterion = nn.CrossEntropyLoss()

        for i in range(5):
            print(f'\nstarting Epoch {i}')
            print('Training...')
            epoch_loss = train(model, train_iterator, optimizer, criterion)
            losses_2.append(epoch_loss)
            print('\nEvaluating on train...')
            f1_on_train, _ = evaluate(model, train_iterator, criterion)
            f1s_2.append(f1_on_train)
            print('\nEvaluating on test...')
            f1_on_test, epoch_loss_on_test = evaluate(model, val_iterator, criterion)
            # Goes into this search's own history, not the baseline run's list.
            losses_eval_2.append(epoch_loss_on_test)
            f1s_eval_2.append(f1_on_test)
            if f1_on_test > best_f1:
                best_f1 = f1_on_test
                best_params = {'filter_list': filter_list, 'drop_first': d}
                print('new best_f1 reached')

Наилучшее качество достигнуто с фильтром [2] и drop_first = 0. В целом качество модели не очень хорошее — возможно, потому что эпох обучения было мало (простите, большего Colab не тянул и отключался). Можно было бы ещё поэкспериментировать с гиперпараметрами, но памяти хватило только на такой перебор. Тем не менее видно, что модель чему-то учится (лосс уменьшается, качество растёт), хотя и не очень успешно (прямо как я на этом курсе).