In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import inflect
from tqdm import tqdm

# Fetch the NLTK resources used further down: the stop-word lists, the Punkt
# models behind word_tokenize, WordNet for the lemmatizer and the POS tagger.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ciril\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ciril\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ciril\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ciril\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Build the dataset paths with os.path.join instead of a hard-coded "\\"
# separator, so the notebook also runs on non-Windows systems.
DATA_DIR = os.path.join(os.getcwd(), "nlp-lab-dm23")
train_df = pd.read_csv(os.path.join(DATA_DIR, "train_processed.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [31]:
# Shuffle the rows and renumber the index. The fixed random_state keeps the
# row order (and everything derived from it) identical across kernel restarts.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [32]:
# Shared inflect engine; replace_nums() uses it to spell digits out as words.
q = inflect.engine()

def lowercase_text(text):
  """Return `text` with every cased character converted to lower case."""
  lowered = text.lower()
  return lowered

def remove_nums(text):
  """Strip every run of digits from `text`, leaving all other characters untouched."""
  digit_run = re.compile(r'\d+')
  return digit_run.sub('', text)

def replace_nums(text):
  """Spell out every all-digit token of `text` as words.

  The text is split on whitespace; tokens for which `str.isdigit()` is true
  are converted with the module-level inflect engine `q`, every other token
  is kept unchanged, and the tokens are re-joined with single spaces.
  """
  return ' '.join(
      q.number_to_words(token) if token.isdigit() else token
      for token in text.split()
  )

def remove_punct(text):
  """Delete every character listed in `string.punctuation` from `text`."""
  # A mapping whose values are all None tells str.translate to drop those keys.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletion_table)

# language -> frozenset of stop words, filled on first use so the corpus is
# read once instead of once per processed text.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language):
  """Return the NLTK stop-word set for `language`, loading it only once."""
  if language not in _STOP_WORDS_BY_LANGUAGE:
    _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
  return _STOP_WORDS_BY_LANGUAGE[language]


def remove_stopwords(text):
  """Tokenize `text` and drop English stop words.

  Args:
    text: string to tokenize with `word_tokenize`.

  Returns:
    List of tokens, in their original order, that are not in NLTK's English
    stop-word list. The comparison is case-sensitive, exactly as the tokens
    come out of the tokenizer.
  """
  stop_words = _get_stop_words("english")
  return [word for word in word_tokenize(text) if word not in stop_words]

def full_preprocessing_pipeline(df: DataFrame, text_column=2):
  """Clean one text column of `df` and return the resulting token lists.

  Each text goes through lowercase -> digit removal -> punctuation removal ->
  tokenization with stop-word removal.

  Args:
    df: frame holding the raw texts.
    text_column: column to preprocess. An int is treated as a column
      position (the default, 2, is the third column, i.e. what `row[2]`
      selected before); any other value is used as a column label.

  Returns:
    A list with one entry per row of `df`, each entry being the token list
    returned by `remove_stopwords`.
  """
  if isinstance(text_column, int):
    # Explicit positional access: `row[2]` on a label-indexed row relied on
    # pandas' deprecated integer-as-position fallback.
    texts = df.iloc[:, text_column]
  else:
    texts = df[text_column]

  # Iterating the column directly avoids building one Series per row with
  # iterrows(); `total` lets tqdm show a percentage and an ETA.
  return [
    remove_stopwords(remove_punct(remove_nums(lowercase_text(text))))
    for text in tqdm(texts, total=len(df))
  ]

In [33]:
# Run the cleaning pipeline over the training frame; the result is a list
# holding one token list per row.
preprocessed_train = full_preprocessing_pipeline(train_df)

120000it [00:45, 2625.90it/s]


In [34]:
# Spot-check the token list produced for the first row.
preprocessed_train[0]

['hard',
 'overstate',
 'sense',
 'shock',
 'across',
 'much',
 'europe',
 'popular',
 'mandate',
 'americans',
 'given',
 'george',
 'w',
 'bush',
 'even',
 'result',
 'great',
 'surprise']

In [35]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# word -> WordNet POS memo. Each word is tagged in isolation, so the result
# depends only on the word itself and can be reused for every later occurrence
# instead of re-running the POS tagger.
_WORDNET_POS_CACHE = {}


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts.

    Args:
        word: a single token; it is tagged on its own, without sentence context.

    Returns:
        One of wordnet.ADJ / NOUN / VERB / ADV, chosen from the first letter of
        the tag returned by `nltk.pos_tag`; wordnet.NOUN for any other tag.
    """
    cached = _WORDNET_POS_CACHE.get(word)
    if cached is not None:
        return cached

    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    pos = tag_dict.get(tag, wordnet.NOUN)
    _WORDNET_POS_CACHE[word] = pos
    return pos

# Single lemmatizer instance shared by lemmatize_words() and preprocess_sample().
lemmatizer = WordNetLemmatizer()

def lemmatize_words(texts):
  """Lemmatize every token of every token list in `texts`.

  Each word is passed to the module-level `lemmatizer` together with the
  WordNet POS that `get_wordnet_pos` returns for it. The result is a list of
  token lists in the same order as the input; progress is shown with tqdm.
  """
  return [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    for tokens in tqdm(texts)
  ]
# Lemmatize the token lists produced by the preprocessing pipeline.
texts_lem = lemmatize_words(preprocessed_train)

100%|██████████| 120000/120000 [15:25<00:00, 129.68it/s]


In [36]:
def preprocess_sample(text):
    """Clean one raw text and return it as a single space-joined string of lemmas.

    Steps, in order: lowercase, strip digits, strip punctuation, tokenize and
    drop stop words, then lemmatize each remaining token using the WordNet POS
    from `get_wordnet_pos`.
    """
    cleaned = remove_punct(remove_nums(lowercase_text(text)))
    tokens = remove_stopwords(cleaned)
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    return " ".join(lemmas)

# Overwrite each description with its cleaned, lemmatized form.
train_df["Description"] = train_df["Description"].apply(preprocess_sample)

In [37]:
# Inspect the frame after the Description column has been preprocessed.
train_df

Unnamed: 0,Class Index,Title,Description
0,1,We Have To Talk,hard overstate sense shock across much europe ...
1,2,"Boss, Pedro talk shop",sometimes george steinbrenner negotiate tuesda...
2,2,Crawford leads way as US goes 1-2-3 in 200m,finish partisan greek crowd want see shawn cra...
3,2,Ajax Amsterdam 0-1 Juventus: FT Report,amsterdam september champion league moment bri...
4,3,Krispy Kreme SEC probe widens,krispy kreme doughnut inc say friday inform se...
...,...,...,...
119995,2,A bit of a head-scratcher,many professional league recently defend champ...
119996,2,Fans Learning to Make Do Without NHL (AP),ap nhl disappear right eye reason simple
119997,2,Can Arroyo help Red Sox finally get a win?,curt schilling pedro martinez bring boston red...
119998,2,Woods May Stay No. 1 - for Now (AP),ap stewart cink atop leaderboard david tom lea...


In [92]:
# index=False: without it the row index is written as an extra unnamed
# column, which comes back as a spurious "Unnamed: 0" column when the file is
# loaded again with a plain pd.read_csv.
train_df.to_csv("train_processed.csv", index=False)

In [3]:
# Class labels as a pandas Series; the training loop subtracts 1 from them
# before they are passed to the loss.
train_labels = train_df["Class Index"]

In [4]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup, BertModel

In [5]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
# Tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint the
# BERT model is loaded from further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [7]:
# Description column as a NumPy array; these are the texts fed to the tokenizer.
texts = train_df.Description.values

In [40]:
# Show the first text at each stage: raw string, tokenizer output, token ids.
sample_text = texts[0]
sample_tokens = tokenizer.tokenize(sample_text)

print(' Original: ', sample_text)
print('Tokenized: ', sample_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sample_tokens))

 Original:  hard overstate sense shock across much europe popular mandate american give george w bush even result great surprise
Tokenized:  ['hard', 'overs', '##tate', 'sense', 'shock', 'across', 'much', 'europe', 'popular', 'mandate', 'american', 'give', 'george', 'w', 'bush', 'even', 'result', 'great', 'surprise']
Token IDs:  [2524, 15849, 12259, 3168, 5213, 2408, 2172, 2885, 2759, 11405, 2137, 2507, 2577, 1059, 5747, 2130, 2765, 2307, 4474]


In [8]:
# Encoded length of every text, counting the `[CLS]` and `[SEP]` tokens that
# `add_special_tokens=True` adds.
encoded_lengths = [
    len(tokenizer.encode(sent, add_special_tokens=True))
    for sent in texts
]

# Longest encoded text; 0 when there are no texts at all.
max_len = max(encoded_lengths, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  211


In [17]:
# Encode every description in one batched tokenizer call. The tokenizer will:
#   (1) Tokenize each text.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate each text to `max_length`.
#   (6) Create attention masks that separate real tokens from [PAD] tokens.
encoded = tokenizer(
    list(texts),                    # Texts to encode.
    add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
    max_length=max_len,             # Target length for padding/truncation.
    padding='max_length',           # Replaces the deprecated pad_to_max_length=True.
    truncation=True,                # Explicitly cut anything longer than max_length.
    return_attention_mask=True,     # Construct attention masks.
    return_tensors='pt',            # Return PyTorch tensors of shape (N, max_len).
)

input_ids = encoded['input_ids'].to(device)
attention_masks = encoded['attention_mask'].to(device)
# Go through the underlying NumPy array so the conversion does not depend on
# the Series index being 0..N-1.
labels = torch.tensor(train_labels.to_numpy()).to(device)

# Print text 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])



Original:  hard overstate sense shock across much europe popular mandate american give george w bush even result great surprise
Token IDs: tensor([  101,  2524, 15849, 12259,  3168,  5213,  2408,  2172,  2885,  2759,
        11405,  2137,  2507,  2577,  1059,  5747,  2130,  2765,  2307,  4474,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

In [28]:
# Check which container type holds the labels.
type(train_labels)

pandas.core.series.Series

In [18]:
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create an 80-20 train-validation split.

# Calculate the number of samples to include in each set; the validation set
# takes whatever is left so the two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. The seeded generator makes
# the split identical on every run, so validation scores stay comparable.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

96,000 training samples
24,000 validation samples


In [19]:
# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Training batches are drawn in random order: shuffle=True makes the
# DataLoader build the random sampler itself.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation batches are read in dataset order (shuffle=False selects the
# sequential sampler); the order has no effect on the evaluation.
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [20]:
# Load the pretrained BERT encoder. NOTE: the name `model` is later rebound to
# the BertTextClassifier that wraps this encoder.
model = BertModel.from_pretrained('bert-base-uncased')

### MLP

In [21]:
import torch
import torch.nn as nn

class BertTextClassifier(nn.Module):
    """BERT encoder followed by dropout and one linear classification layer.

    Args:
        bert_model: encoder module; it must expose `config.hidden_size` and
            return an object with a `last_hidden_state` attribute.
        num_classes: number of output logits.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores, one row per input sequence."""
        encoder_out = self.bert(input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 is used as the sequence summary.
        first_token_state = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

# Example of creating the model.
num_classes = 4
# `model` is rebound by this cell, so on a re-run it already is a classifier
# rather than the bare encoder. Unwrap its `.bert` attribute in that case so
# the encoder is never wrapped twice (the wrapper has no `.config`, so a
# double wrap would fail in __init__).
bert_backbone = getattr(model, "bert", model)
model = BertTextClassifier(bert_backbone, num_classes)

In [23]:

print(f"Using {device} device")

# Move the model's parameters to the selected device.
model.to(device)

Using cuda:0 device


BertTextClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [24]:
import torch.optim as optim
from sklearn.metrics import f1_score

criterion = nn.CrossEntropyLoss()
# All BERT weights are being fine-tuned, which needs a small step size: the
# BERT authors recommend 2e-5 to 5e-5. A rate of 1e-3 destroys the pretrained
# weights within the first few updates.
LEARNING_RATE = 2e-5
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
num_epochs = 1

# Training loop: one pass over the training set per epoch, followed by an
# evaluation on the validation set.
for epoch in range(num_epochs):
    # Show a 1-based epoch number so the banner reads "1 / 1", not "0 / 1".
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_epochs))
    print(" Run train...")
    total_train_loss = 0
    model.train()
    losses = []  # per-batch training loss for this epoch
    for batch in tqdm(train_dataloader):
        # Batch-local names: reusing `input_ids` / `labels` here would overwrite
        # the full-dataset tensors created earlier in the notebook.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        # Stored labels are shifted down by one so they start at 0, as
        # CrossEntropyLoss expects for class-index targets.
        batch_labels = (batch[2] - 1).to(device)

        optimizer.zero_grad()
        outputs = model(batch_input_ids, batch_attention_mask)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(" Average training loss: {0:.2f}".format(avg_train_loss))
    print(" Run validation...")
    model.eval()  # switch to evaluation mode (disables dropout)

    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in validation_dataloader:
            batch_input_ids = batch[0].to(device)
            batch_attention_mask = batch[1].to(device)
            batch_labels = (batch[2] - 1).to(device)  # move class labels to the device

            outputs = model(batch_input_ids, batch_attention_mask)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)

            true_labels.extend(batch_labels.tolist())
            predicted_labels.extend(predicted.tolist())

    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    print(f'F1 Score: {f1:.2f}')

  0%|          | 0/3000 [00:00<?, ?it/s]

 Run train...
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-3.9894e-01,  1.6967e-01,  4.0876e-01,  ..., -4.0943e-01,
           2.3090e-01, -1.8144e-01],
         [-3.3319e-01, -9.2961e-02,  4.2549e-01,  ..., -4.0304e-01,
          -5.2819e-01, -1.4318e+00],
         [-1.2393e+00, -1.1252e+00,  4.0581e-01,  ..., -5.9366e-02,
          -2.9004e-02,  5.6807e-01],
         ...,
         [-1.0705e-02, -2.4713e-01,  6.2462e-01,  ...,  2.9314e-01,
          -4.4637e-02,  1.9462e-01],
         [-3.2231e-01,  2.3992e-02,  5.0950e-01,  ..., -3.8265e-01,
           1.0623e-01,  3.8347e-01],
         [-3.5926e-01, -4.2509e-02,  4.7702e-01,  ..., -2.9332e-01,
          -8.3375e-02,  2.1990e-01]],

        [[-3.1342e-01,  1.7539e-01,  2.2633e-01,  ..., -2.4062e-01,
           2.8576e-01,  1.3794e-01],
         [-2.7924e-02,  2.9835e-01,  3.9161e-01,  ..., -3.1312e-01,
          -1.9295e-01, -6.0143e-01],
         [-1.3553e-01,  2.4253e-02,  5.3023e-01,  ..., -2.4209e-01,

  0%|          | 1/3000 [00:03<3:07:25,  3.75s/it]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4496,  0.4914,  0.3827,  ..., -0.8102,  0.2032,  0.3080],
         [-0.1956,  0.5607, -0.0176,  ..., -0.7040,  0.2632,  0.3675],
         [-0.1311,  0.7139,  0.2514,  ..., -0.4234,  0.1726,  0.2230],
         ...,
         [-0.4228,  0.6145,  0.3387,  ..., -0.4881,  0.3812,  0.3442],
         [-0.4831,  0.7015,  0.4284,  ..., -0.5731,  0.3099,  0.3518],
         [-0.4086,  0.6216,  0.0813,  ..., -0.6994,  0.3641,  0.3581]],

        [[-0.8152,  0.6602,  0.1574,  ..., -0.6118,  0.4040,  0.2974],
         [-0.4668,  0.6424,  0.4994,  ..., -0.7978,  0.6254,  0.2586],
         [-0.6807,  0.5749,  0.2690,  ..., -0.3177,  0.5856,  0.1904],
         ...,
         [-0.5162,  0.4485,  0.3183,  ..., -0.7389,  0.6072,  0.3887],
         [-0.7303,  0.4924,  0.4599,  ..., -0.4797,  0.5398,  0.2308],
         [-0.6482,  0.6335,  0.2340,  ..., -0.6696,  0.5809,  0.1585]],

        [[-0.3419,  0.3441,  0.2800,  ..., -0.6394,  

  0%|          | 2/3000 [00:09<3:36:00,  4.32s/it]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.5106,  0.2307,  0.4182,  ..., -0.4790,  0.0035, -0.7862],
         [-0.4735,  0.2913,  0.2804,  ..., -0.4450,  0.0165, -0.4992],
         [-0.4569,  0.0236,  0.3113,  ..., -0.7188,  0.2208, -0.5913],
         ...,
         [-0.3639,  0.2911,  0.2655,  ..., -0.5514,  0.1282, -0.6637],
         [-0.4132,  0.3662,  0.1953,  ..., -0.4218, -0.0139, -0.7560],
         [-0.1363,  0.2538,  0.1911,  ..., -0.4529,  0.0788, -0.5026]],

        [[-0.3934,  0.2196,  0.2120,  ..., -0.4150, -0.0448, -0.6392],
         [-0.5309,  0.1935,  0.0425,  ..., -0.4986,  0.0277, -0.6323],
         [-0.4116,  0.3324,  0.2659,  ..., -0.6163, -0.0515, -0.8130],
         ...,
         [-0.3075,  0.2302,  0.2688,  ..., -0.5912, -0.0263, -0.5214],
         [-0.4851,  0.2677,  0.3137,  ..., -0.2891, -0.0877, -0.7322],
         [-0.3546,  0.2073,  0.2900,  ..., -0.5685, -0.0307, -0.8269]],

        [[-0.4754,  0.2283,  0.2421,  ..., -0.6157,  

  0%|          | 3/3000 [00:15<4:00:40,  4.82s/it]

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 1.3236e-01, -1.2858e-01, -8.5733e-02,  ...,  7.4365e-02,
           3.4285e-01, -7.1154e-01],
         [-1.0660e-01, -1.2485e-01,  1.2212e-02,  ...,  3.0618e-01,
           6.2294e-01, -7.2592e-01],
         [ 9.1593e-02, -3.1201e-02, -1.9815e-01,  ...,  1.3373e-01,
           7.0032e-01, -5.9425e-01],
         ...,
         [ 1.9110e-01, -2.0945e-01, -7.5123e-02,  ...,  7.3915e-02,
           4.8619e-01, -6.7490e-01],
         [ 1.1539e-01, -2.9408e-01, -1.2366e-02,  ...,  2.8366e-02,
           6.0603e-01, -5.5437e-01],
         [ 2.1291e-02, -1.9263e-01, -7.2525e-03,  ...,  1.7899e-01,
           5.6406e-01, -6.4944e-01]],

        [[-4.7534e-02, -2.2609e-02, -2.2521e-01,  ..., -2.3097e-01,
           5.7376e-01, -2.3188e-01],
         [ 1.0828e-01,  2.4911e-02,  7.8951e-03,  ...,  2.5623e-02,
           6.4471e-01, -7.2866e-01],
         [ 1.7649e-01, -5.5356e-02, -2.3794e-02,  ..., -2.9597e-03,
           5.

KeyboardInterrupt: 

In [62]:
# Release cached GPU memory that PyTorch is holding but no longer using.
torch.cuda.empty_cache()