In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
import torch
from pathlib import Path

In [2]:
# Load the header-less CSV, discard its first two columns and name the two
# remaining ones: the class label and the tweet text.
df = pd.read_csv('../../data/twitter_training.csv', header=None)
df = df.drop(columns=[0, 1])
df.columns = ['label', 'text']

# Remove incomplete rows *before* deriving the label mapping: a missing label
# would otherwise reach `sorted` together with the string labels, and classes
# that only occur in rows dropped here would still be assigned an id.
df = df.dropna()

# Deterministic label -> integer id mapping (alphabetical order of the labels).
labels = sorted(df.label.unique())
label2id = {label: idx for idx, label in enumerate(labels)}
df = df.assign(target=df.label.map(label2id))

tfidf = TfidfVectorizer()

# NOTE(review): `fit_transform` yields a sparse matrix; `toarray()` expands it
# to a dense (n_documents, vocabulary_size) array, which can be very large.
# It is kept dense here so later cells that index `X` as an ndarray keep
# working -- consider staying sparse if memory becomes a problem.
X = tfidf.fit_transform(df.text)
X = X.toarray()
X[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [3]:
# Name of the pretrained checkpoint shared by the tokenizer and the encoder.
config = dict(model='ai-forever/sbert_large_nlu_ru')

# Instantiate the tokenizer first, then the model, from the same checkpoint.
tokenizer, model_hf = (
    loader.from_pretrained(config['model'])
    for loader in (AutoTokenizer, AutoModel)
)

In [13]:
def create_embedding(text, batch_size=None):
    """Embed one or more texts with the module-level `tokenizer` / `model_hf`.

    Every text is tokenized (padded to the longest item of its batch and
    truncated to 512 tokens), passed through the model without gradient
    tracking, and represented by the last-layer hidden state at sequence
    position 0.

    NOTE(review): position 0 is presumably the [CLS] token for this
    tokenizer; some sentence-embedding checkpoints are intended to be
    mean-pooled instead -- confirm against the model card.

    Parameters
    ----------
    text : str or sequence of str
        A single text or a collection of texts.
    batch_size : int or None, optional
        Number of texts sent through the model per forward pass. ``None``
        (the default) processes everything in a single pass, as before.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input text.

    Raises
    ------
    ValueError
        If no text is supplied or ``batch_size`` is smaller than 1.
    """
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        raise ValueError("create_embedding() needs at least one text")
    if batch_size is None:
        batch_size = len(texts)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    # Run on whatever device the model lives on, so the function also works
    # after the model has been moved off the CPU.
    device = next(model_hf.parameters()).device

    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}
            hidden = model_hf(**inputs).last_hidden_state
            # `.numpy()` only works on CPU tensors, so bring the slice back
            # to the CPU before collecting it.
            chunks.append(hidden[:, 0, :].cpu())
    return torch.cat(chunks, dim=0).numpy()

In [18]:
# Smoke test: embed the first ten texts (selected by position).
create_embedding(df.text.iloc[:10].tolist())

array([[ 4.2679447e-01, -2.8372285e-01, -2.2269835e-01, ...,
         3.2076749e-01, -1.2642201e+00,  4.4107616e-02],
       [ 3.7005451e-01, -1.2414413e-01, -2.4223654e-01, ...,
         3.6250138e-01, -8.2888925e-01,  2.5851291e-02],
       [ 4.0417799e-01, -3.2004377e-01, -2.2525784e-01, ...,
         4.8512909e-01, -1.2319142e+00,  3.7426554e-02],
       ...,
       [ 4.8449391e-01,  3.7058136e-01, -1.4294991e-01, ...,
         2.6181722e-01, -1.2537019e+00,  4.4144008e-01],
       [ 6.4889485e-01,  2.1009718e-04, -4.7704369e-01, ...,
         3.7981066e-01, -9.0882063e-01,  2.5194368e-01],
       [ 3.9653441e-01,  1.6655812e-01, -2.6362908e-01, ...,
         3.0342647e-01, -1.1687939e+00,  4.6506247e-01]], dtype=float32)

In [16]:
# Show the first ten raw texts (selected by position) for comparison.
df.text.iloc[:10].values

array(['im getting on borderlands and i will murder you all ,',
       'I am coming to the borders and I will kill you all,',
       'im getting on borderlands and i will kill you all,',
       'im coming on borderlands and i will murder you all,',
       'im getting on borderlands 2 and i will murder you me all,',
       'im getting into borderlands and i can murder you all,',
       "So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg",
       "So I spent a couple of hours doing something for fun... If you don't know that I'm a huge @ Borderlands fan and Maya is one of my favorite characters, I decided to make a wallpaper for my PC.. Here's the original picture compared to the creation I made:) Have fun! pic.twitter.com / mLsI5wf9Jg",
       "So I spent