In [1]:
! pip install transformers


Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 30.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [2]:
import numpy as np
import string
from nltk.tokenize import WordPunctTokenizer
from string import digits, ascii_lowercase, punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
import torch
from gensim.models import KeyedVectors
import pickle
import nltk

from collections import Counter

import pandas as pd

from tqdm import tqdm


# Данные и препроцессинг

In [3]:
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_dev.tsv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_train.tsv
! wget https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/goldstandard_dev_2022.tsv

--2022-02-13 13:09:32--  https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131326 (128K) [text/plain]
Saving to: ‘messages_dev.tsv’


2022-02-13 13:09:33 (6.30 MB/s) - ‘messages_dev.tsv’ saved [131326/131326]

--2022-02-13 13:09:33--  https://raw.githubusercontent.com/jeka-e/WASSA2022_EMO/main/messages_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1100753 (1.0M) [text/plain]
Saving to: ‘messages_train.tsv’


2022-02-13 13:09:33 (21.0 MB/s) - ‘me

In [4]:
# Training split: essays and their emotion labels come from the same file.
all_data_train = pd.read_csv('messages_train.tsv', sep="\t")
x_train = all_data_train[['essay']]    # only the column relevant to this track
y_train = all_data_train[['emotion']]  # only the emotion label column

# Dev split: essays and gold labels are stored in separate files.
all_data_val = pd.read_csv('messages_dev.tsv', sep="\t")
x_val = all_data_val[['essay']]

# The gold file is read with explicit column names "0".."11";
# column "2" is taken as the emotion label and renamed accordingly.
goldstandard = pd.read_csv(
    'goldstandard_dev_2022.tsv',
    sep="\t",
    names=list(map(str, range(12))),
)
y_val = goldstandard[['2']].rename(columns={'2': 'emotion'})

In [5]:
# Emotion label vocabulary; the position in this list is the integer class id.
emotions = ['neutral', 'sadness', 'anger', 'fear', 'surprise', 'disgust', 'joy']

em2id = {name: idx for idx, name in enumerate(emotions)}  # label -> id
id2em = dict(enumerate(emotions))                         # id -> label

# Replace the one-column label frames with integer-encoded numpy arrays.
y_val, y_train = (
    np.array(frame['emotion'].map(em2id)) for frame in (y_val, y_train)
)


# Векторизация

## FastText

In [6]:
# Loading the text-format vectors is slow: about 5 minutes, even on a GPU runtime.
FT_VECTORS_PATH = '/content/drive/MyDrive/fasttext_en/wiki-news-300d-1M.vec'
ft_model = KeyedVectors.load_word2vec_format(FT_VECTORS_PATH)


In [7]:
def clean(text):
    """Lower-case ``text`` and delete every ASCII punctuation character.

    Whitespace is left untouched; no tokenisation or stop-word removal is done.
    """
    # Translation table that maps each character of string.punctuation to "deleted".
    strip_punct = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punct)

In [8]:
def get_emb_ft(sent, model):
    """Return the mean word vector of a whitespace-tokenised sentence.

    Args:
        sent: Text to embed; it is split on whitespace into tokens.
        model: Word-vector lookup that supports ``model[word]`` and raises
            ``KeyError`` for unknown words (e.g. a gensim ``KeyedVectors``).
            Objects that expose their vectors through a ``.wv`` attribute
            are accepted as well.

    Returns:
        A 1-D float array holding the average of the vectors of all
        in-vocabulary tokens, or an all-zero vector when no token is found
        (instead of dividing by zero and producing NaNs).
    """
    # Look words up in the model that was passed in, not in a notebook global.
    # Fall back to the object itself when it has no `.wv` attribute.
    vectors = getattr(model, 'wv', model)
    # Use the model's own dimensionality when it reports one; 300 otherwise.
    dim = getattr(vectors, 'vector_size', 300)

    sent_vector = np.zeros(dim)
    count = 0
    for word in sent.split():
        try:
            sent_vector += vectors[word]
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the average.
            continue
        count += 1

    if count == 0:
        return sent_vector
    return sent_vector / count

In [9]:
# Normalised (lower-cased, punctuation-free) copies of the essays for FastText lookup.
x_train_ft = x_train['essay'].map(clean)
x_val_ft = x_val['essay'].map(clean)

In [10]:
# One 300-d averaged FastText vector per training essay.
weights_train_ft = np.zeros((len(x_train_ft), 300))

# Iterate over the cleaned essays (x_train_ft), not the raw column: with raw text,
# whitespace tokens such as "happy," keep their punctuation and miss the vocabulary.
for i, sent in enumerate(tqdm(x_train_ft)):
    weights_train_ft[i] = get_emb_ft(sent, ft_model)

  
100%|██████████| 1860/1860 [00:01<00:00, 1015.26it/s]


In [11]:
# One 300-d averaged FastText vector per dev essay.
weights_val_ft = np.zeros((len(x_val_ft), 300))

# Iterate over the cleaned essays (x_val_ft), not the raw column: with raw text,
# whitespace tokens such as "happy," keep their punctuation and miss the vocabulary.
for i, sent in enumerate(tqdm(x_val_ft)):
    weights_val_ft[i] = get_emb_ft(sent, ft_model)

  
100%|██████████| 270/270 [00:00<00:00, 881.09it/s]


In [12]:
# Persist the FastText sentence matrices (dev first, then train) as pickles.
for filename, matrix in (
    ('weights_val_ft.pickle', weights_val_ft),
    ('weights_train_ft.pickle', weights_train_ft),
):
    with open(filename, 'wb') as f:
        pickle.dump(matrix, f)


## BERT

In [13]:
# Encoder and tokenizer are loaded from the same Hugging Face checkpoint.
BERT_CHECKPOINT = 'bhadresh-savani/bert-base-uncased-emotion'
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading:   0%|          | 0.00/935 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at bhadresh-savani/bert-base-uncased-emotion were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
def get_emb_bert(sent, model, tokenizer):
    """Embed a text as the mean of the model's last-layer token states.

    Args:
        sent: The text to embed.
        model: Callable encoder that accepts the tokenizer's output as keyword
            arguments and returns an object with ``last_hidden_state``.
        tokenizer: Callable that turns ``sent`` into PyTorch tensors.

    Returns:
        The last hidden states averaged over the token axis, with
        singleton dimensions squeezed out.
    """
    with torch.no_grad():
        # truncation=True clips over-long inputs to the tokenizer's maximum
        # length, so a long essay cannot exceed what the encoder accepts.
        enc = tokenizer(sent, return_tensors='pt', truncation=True)
        output = model(**enc, return_dict=True)
        # Mean over the token axis (dim=1); the alternative would be the
        # first-token state, output.last_hidden_state[:, 0, :].
        vector = torch.mean(output.last_hidden_state, dim=1)
    return np.squeeze(vector)

In [15]:
# One 768-d mean-pooled BERT vector per training essay.
weights_train_bert = np.zeros((len(x_train), 768))

for row, essay in enumerate(tqdm(x_train['essay'])):
    weights_train_bert[row] = get_emb_bert(essay, bert_model, tokenizer)

100%|██████████| 1860/1860 [13:27<00:00,  2.30it/s]


In [16]:
# One 768-d mean-pooled BERT vector per dev essay.
weights_val_bert = np.zeros((len(x_val), 768))

for row, essay in enumerate(tqdm(x_val['essay'])):
    weights_val_bert[row] = get_emb_bert(essay, bert_model, tokenizer)

100%|██████████| 270/270 [01:52<00:00,  2.41it/s]


In [17]:
# Persist the BERT sentence matrices (train first, then dev) as pickles.
for filename, matrix in (
    ('weights_train_bert.pickle', weights_train_bert),
    ('weights_val_bert.pickle', weights_val_bert),
):
    with open(filename, 'wb') as f:
        pickle.dump(matrix, f)
