In [1]:
import vk
import pandas as pd

In [2]:
token = ''
session = vk.Session(access_token=token)  # VK API authorization
vk_api = vk.API(session)

In [115]:
def get_comments_from_vk_group(owner_id):
    
    #getting the ids of 100 posts
    group_posts_info = vk_api.wall.get(owner_id = owner_id, v = 5.95, count = 100)
    group_post_ids = []
    group_posts_items = group_posts_info['items']
    for i in group_posts_items:
      group_post_ids.append(i['id'])

    #getting the comment from these posts
    group_comments = []
    for group_post_id in group_post_ids:
      group_comments_info = vk_api.wall.getComments(owner_id = owner_id, post_id = group_post_id, v = 5.95, count = 100)
      group_comments_items = group_comments_info['items']
      for i in group_comments_items:
        group_comments.append(i['text'])
    print(f'Загружено {len(group_comments)} комментариев')
    group_comments = pd.DataFrame(group_comments)
    return group_comments

In [116]:
#Downloading the comments from Lentach group

lentach_comments = get_comments_from_vk_group(-29534144)

Загружено 3978 комментариев


In [3]:
from sklearn.model_selection import train_test_split

#Loading labeled dataset and splitting 80/20 train/test

training = pd.read_csv('labeled_2ch_pikabu.csv')
train_data = training.comment
train_labels = training.toxic

first_training_data, first_test_data, first_training_labels, first_test_labels = train_test_split(train_data, train_labels, test_size = 0.2)

In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
import matplotlib.pyplot as plt

#Loading tensorflow libs and bert model

tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [73]:
#Setting a model

def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model()

loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

binary_accuracy=tf.metrics.BinaryAccuracy()

epochs = 7
steps_per_epoch = 15000/32
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=[binary_accuracy,f1_m,precision_m, recall_m])

print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=first_training_data, y=first_training_labels, epochs=epochs)

loss, accuracy = classifier_model.evaluate(x=first_test_data, y=first_test_labels)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

history_dict = history.history
print(history_dict.keys())


Training model with https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


ValueError: Data cardinality is ambiguous:
  x sizes: 2885
  y sizes: 2884
Make sure all arrays contain the same number of samples.

In [76]:
#Test split results

print(classifier_model.evaluate(x=first_test_data, y=first_test_labels))

[0.34427452087402344, 0.8560526967048645, 0.7551085948944092, 0.8723870515823364, 0.6829580068588257]


In [98]:
#Uploading test data with predictions

test_data_df = pd.DataFrame(first_test_data)
test_data_df.reset_index(drop=True, inplace=True)
test_data_df['labels'] = pd.DataFrame(first_test_labels)
test_data_df['predictions'] = pd.DataFrame(predictions)
first_test_labels.reset_index(drop=True, inplace=True)
test_data_df

test_data_df.to_excel('test_df_second.xlsx')

In [77]:
#Real world data classification (unlabeled comments from Lentach group)

lentach_comments = pd.DataFrame(lentach_comments)
lentach_comments

lentach_comments = lentach_comments.drop(['prediction'], axis = 1)
lentach_comments

lentach_predictions = classifier_model.predict(lentach_comments)
lentach_predictions

Unnamed: 0,0,prediction
0,мой палец: *чуть влажный*\n\nсканер отпечатка ...,-1.165070
1,"Ахренкть, адмен украл мой коммент к клмменту, ...",-1.956791
2,У топы был классный видос на эту тему,1.200571
3,чтобы не попасть в тюрьму носите перчатки и маски,-0.626574
4,Первая два три четыре пять,0.505516
...,...,...
4117,"интересно, а какие страны входят теперь в сбп?...",-0.095713
4118,Ну и на ху я азиатам лёд?,1.460273
4119,Лишившись льдов в горах лишатся рек . Узбекист...,1.627323
4120,Нужно реорганизовывать контроль цифровых систе...,-0.012236


In [87]:
#Export real world data predictions

lentach_comments['prediction'] = lentach_predictions

lentach_comments.to_excel('second_try_lentach_predictions.xlsx')

In [117]:
classifier_model.save('bert_for_toxic_classification.h5')