## Библиотеки + небольшой пример

In [1]:
!pip install transformers -q

In [27]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import pandas as pd
import os

# Toxic

## Загрузка модели + небольшой пример

In [188]:
model_name = 'SkolkovoInstitute/russian_toxicity_classifier'
toxic_class = pipeline('text-classification', model=model_name)

In [82]:
a =toxic_class('ты супер')
a[0]['label'], a[0]['score']

('POSITIVE', 0.9479933977127075)

## Чтение набора данных

In [39]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/Авалон/Диплом/data')

In [78]:
#coffee = pd.read_csv('coffee.csv', nrows=100)
#coffee_prep = pd.read_csv('coffee_prep.csv', nrows=20)
phrases = pd.read_csv('phrases.csv', nrows=1000)

## Функция для определения токсичности + пример применения

In [43]:
def get_toxic(x):
  a =toxic_class(x, gp)[0]
  if a['label'] == 'toxic':
    return a['score']
  else:
    return 1-a['score']


In [130]:
a = phrases['text'].apply(get_toxic)
phrases[a>0.5]['text']

247                   от выкладки товара тоже в восторге
372                                   ничего интересного
391    а ещё в середине вечера к нам подошла девушка-...
442                                    комфортные номера
544                                   праздновали юбилей
728                                   немного расстроило
997                                               бургер
Name: text, dtype: object

Из 1000 фраз менее 1% токсичности. Смысла выделать подмножество данных нет.

# Positive/negative/neutral

## Загрузка модели + пример использования CPU

In [197]:
sent_class = pipeline('text-classification', model='MonoHime/rubert-base-cased-sentiment-new')

In [198]:
a =sent_class('ты супер', top_k=None)
b = {name: a[ind]['score'] for name, ind in zip(('neutral', 'positive', 'negative'), range(3))}
b

{'neutral': 0.9479933977127075,
 'positive': 0.041179995983839035,
 'negative': 0.010826491750776768}

In [203]:
def get_sentiment(x):
  tmp = sent_class(x, top_k=None)
  return (tmp[i]['score'] for i in range(3))

In [128]:
coffee['text'][2]

'Не очень удобное расположение, от метро идти мин 20 быстрым шагом через промзону. В самом кофе мест очень мало, а желающих очень много(( пирожные очень вкусные, кофе…бывает вкуснее. Второй раз именно туда на пойду.\n'

In [204]:
%%time
res = pd.DataFrame(map(get_sentiment, coffee['text'].iloc[:20]), columns=['neutral', 'positive', 'negative'])
res.head()

CPU times: user 5.79 s, sys: 6.71 ms, total: 5.79 s
Wall time: 5.94 s


Unnamed: 0,neutral,positive,negative
0,0.735903,0.207019,0.057078
1,0.951965,0.037896,0.010139
2,0.994905,0.0026,0.002495
3,0.688164,0.241964,0.069872
4,0.981382,0.015031,0.003587


## Через потоки

In [156]:
from multiprocessing.pool import ThreadPool

In [205]:
%%time
with ThreadPool() as pool:
    # execute tasks in order
    a = pd.DataFrame([result for result in pool.map(get_sentiment, coffee['text'].iloc[:40])])
a.head()

CPU times: user 14.7 s, sys: 58 ms, total: 14.8 s
Wall time: 11.2 s


Unnamed: 0,0,1,2
0,0.735903,0.207019,0.057078
1,0.951965,0.037896,0.010139
2,0.994905,0.0026,0.002495
3,0.688164,0.241964,0.069872
4,0.981382,0.015031,0.003587


## Через процессы

In [154]:
from multiprocessing import Pool

In [208]:
def get_sentiment(x):
  tmp = sent_class(x, top_k=None)
  return [tmp[i]['score'] for i in range(3)]

In [209]:
%%time
with Pool(os.cpu_count()) as p:
    # execute tasks in order
    a = pd.DataFrame([result for result in p.map(get_sentiment, coffee['text'].iloc[:40])])
a.head()

CPU times: user 118 ms, sys: 236 ms, total: 355 ms
Wall time: 11.7 s


Unnamed: 0,0,1,2
0,0.735903,0.207019,0.057078
1,0.951965,0.037896,0.010139
2,0.994905,0.0026,0.002495
3,0.688164,0.241964,0.069872
4,0.981382,0.015031,0.003587


## В одном потоке

In [210]:
%%time
a = pd.DataFrame([result for result in map(get_sentiment, coffee['text'].iloc[:40])])
a.head()

CPU times: user 10.8 s, sys: 37.6 ms, total: 10.8 s
Wall time: 11.3 s


Unnamed: 0,0,1,2
0,0.735903,0.207019,0.057078
1,0.951965,0.037896,0.010139
2,0.994905,0.0026,0.002495
3,0.688164,0.241964,0.069872
4,0.981382,0.015031,0.003587


In [212]:
117000/40*11/60/60 / 16

0.55859375

## Через GPU & PyTorch

- https://www.youtube.com/watch?v=QEaBAZQCtwE

In [186]:
!pip install torch -q

In [189]:
import torch
import torch.nn.functional as F

from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [190]:
model_name = 'MonoHime/rubert-base-cased-sentiment-new'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [192]:
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [222]:
X_train = coffee['text'].iloc[:40].to_list()

In [223]:
res = classifier(X_train)
res

[{'label': 'NEUTRAL', 'score': 0.7359028458595276},
 {'label': 'POSITIVE', 'score': 0.9519652128219604},
 {'label': 'NEGATIVE', 'score': 0.9949054718017578},
 {'label': 'NEUTRAL', 'score': 0.6881638169288635},
 {'label': 'POSITIVE', 'score': 0.9813822507858276},
 {'label': 'NEGATIVE', 'score': 0.7515448927879333},
 {'label': 'NEGATIVE', 'score': 0.7515041828155518},
 {'label': 'POSITIVE', 'score': 0.9794765114784241},
 {'label': 'POSITIVE', 'score': 0.9804792404174805},
 {'label': 'POSITIVE', 'score': 0.9813034534454346},
 {'label': 'NEUTRAL', 'score': 0.6931285858154297},
 {'label': 'NEGATIVE', 'score': 0.7514904737472534},
 {'label': 'POSITIVE', 'score': 0.9805396795272827},
 {'label': 'POSITIVE', 'score': 0.9813339114189148},
 {'label': 'NEGATIVE', 'score': 0.7512839436531067},
 {'label': 'NEGATIVE', 'score': 0.7514641880989075},
 {'label': 'NEUTRAL', 'score': 0.6454037427902222},
 {'label': 'POSITIVE', 'score': 0.9804716110229492},
 {'label': 'POSITIVE', 'score': 0.9812301397323608

In [224]:
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors='pt')
print(batch)

{'input_ids': tensor([[  101, 13776,  5491,  ...,     0,     0,     0],
        [  101,   146,   883,  ...,     0,     0,     0],
        [  101,  1067,  1094,  ...,     0,     0,     0],
        ...,
        [  101,  8274,  1691,  ...,     0,     0,     0],
        [  101,  8274,  1691,  ...,     0,     0,     0],
        [  101,  3226,  1094,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [226]:
%%time
with torch.no_grad():
  outputs = model(**batch)
  #print(outputs)
  predictions = F.softmax(outputs.logits, dim=1)
  labels = torch.argmax(predictions, dim=1)
  #print(predictions)
  #print(labels)

CPU times: user 26.2 s, sys: 4.54 s, total: 30.7 s
Wall time: 31 s
