In [1]:
import torch
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel, BertForPreTraining, BertForMaskedLM
import torch.nn as nn
from tqdm import tqdm, tqdm_notebook
import os
import random

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

RUBERT_PATH = '../../ml_models/ru_conversational_cased_L-12_H-768_A-12_pt'
modelpath = os.path.join(RUBERT_PATH,'pytorch_model.bin')

In [2]:
#!pip install transformers #huggingface

In [3]:
os.path.isfile(os.path.join(RUBERT_PATH,'pytorch_model.bin'))

True

In [4]:
# tokenizer = BertTokenizer.from_pretrained(os.path.join(RUBERT_PATH,'vocab.txt'))
tokenizer = BertTokenizer.from_pretrained(RUBERT_PATH, do_lower_case=False)
config = BertConfig.from_json_file(os.path.join(RUBERT_PATH,'bert_config.json'))
bert = BertForPreTraining.from_pretrained(modelpath, config=config)
# model.eval()

len(tokenizer.vocab)

100792

In [5]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

def get_means(sentence):
    tokenized_text = tokenizer.tokenize(sentence)
    tokenized_text = tokenized_text[:max_input_length-2]
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    
    segments_ids = [1] * len(tokenized_text)    
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    predictions = bert(tokens_tensor, token_type_ids=segments_tensors)
    _, secondDims, thirdDims = predictions[0].shape
    
    finalVector = []
    
    for i in range(secondDims):
        currentArr = predictions[0][0][i].detach().numpy()
        if len(finalVector) == 0:
            finalVector = currentArr
        else:
            finalVector = np.add(finalVector, currentArr)
    return np.mean(finalVector)


In [6]:
#texts data
texts_df = pd.read_csv('texts_train.txt', sep="\t", header=None)
texts_df.columns = ["text"]
texts_df.head()

Unnamed: 0,text
0,"Сериал очень люблю, но Академия и Земля вызыва..."
1,"думал, что будет лучше идея очень интересна - ..."
2,с творчеством Головачева я познакомился посред...
3,"то-то я и в большое неудовольствие прочитал ""А..."
4,как мне показалось местами сильно смахивает на...


In [7]:
get_means(texts_df['text'][0])

-290.20074

In [8]:
#scores data
if (os.path.isfile('collected_data.csv')):
    scores_df = pd.read_csv('collected_data.csv', dtype='float64')
else:
    scores_df = pd.read_csv('scores_train.txt', sep="\t", header=None, dtype='float64')
    scores_df.columns = ["tonality"]
    vector_means = [get_means(sentence) for sentence in texts_df["text"].tolist()]
    scores_df['vector_means'] = vector_means

scores_df.head()

Unnamed: 0,tonality,vector_means
0,6.0,-290.200745
1,7.0,-183.830475
2,10.0,-440.676788
3,5.0,-231.999832
4,6.0,-362.513824


In [9]:
scores_df.to_csv('collected_data.csv', index = False, header=True)

In [10]:
tone_levels = np.array(scores_df['tonality'])
features = np.array(scores_df['vector_means']).reshape(-1, 1)

In [11]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test  = train_test_split(features, tone_levels, test_size = 0.2, random_state = random.seed(SEED))

print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test .shape)

Training Features Shape: (16000, 1)
Training Labels Shape: (16000,)
Testing Features Shape: (4000, 1)
Testing Labels Shape: (4000,)


In [118]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=0.1,
                                  random_state=random.seed(SEED),
                                  loss='ls',
                                  max_depth=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [123]:
def get_mae(pred, test):
    # Calculate the absolute errors
    errors = abs(pred - test)
    # Print out the mean absolute error (mae)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
    return errors

In [124]:
errors = get_mae(y_pred, y_test)

Mean Absolute Error: 1.74 degrees.


In [125]:
def get_accuracy(errs, test_data):
    # Calculate mean absolute percentage error (MAPE)
    mape = 100 * (errs / test_data)
    # Calculate and display accuracy
    accuracy = 100 - np.mean(mape)
    print('Accuracy:', round(accuracy, 2), '%.')
    return accuracy

In [126]:
get_accuracy(errors, y_test)

Accuracy: 58.37 %.


58.36569578338658

In [127]:
from sklearn import metrics
y_pred = [int(item) for item in y_pred]
print(metrics.classification_report(y_test, y_pred))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        74
           2       0.00      0.00      0.00        74
           3       0.00      0.00      0.00       107
           4       0.00      0.00      0.00       169
           5       0.00      0.00      0.00       257
           6       0.14      0.16      0.15       203
           7       0.10      0.52      0.16       353
           8       0.17      0.47      0.25       692
           9       0.00      0.00      0.00       972
          10       0.00      0.00      0.00      1099

    accuracy                           0.14      4000
   macro avg       0.04      0.12      0.06      4000
weighted avg       0.05      0.14      0.07      4000



In [133]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(X_train, y_train)
clf_y_pred = clf.predict(X_test)

In [134]:
clf_errors = get_mae(clf_y_pred, y_test)

Mean Absolute Error: 1.92 degrees.


In [135]:
get_accuracy(clf_errors, y_test)

Accuracy: 48.33 %.


48.32517857142857

In [136]:
clf_y_pred = [int(item) for item in clf_y_pred]
print(metrics.classification_report(y_test, clf_y_pred))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        74
           2       0.00      0.00      0.00        74
           3       0.00      0.00      0.00       107
           4       0.00      0.00      0.00       169
           5       0.04      0.00      0.01       257
           6       0.03      0.00      0.01       203
           7       0.08      0.01      0.02       353
           8       0.17      0.09      0.11       692
           9       0.27      0.35      0.31       972
          10       0.29      0.60      0.39      1099

    accuracy                           0.27      4000
   macro avg       0.09      0.11      0.08      4000
weighted avg       0.19      0.27      0.20      4000



In [150]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
knn_y_pred = knn.predict(X_test)

In [151]:
knn_errors = get_mae(knn_y_pred, y_test)

Mean Absolute Error: 1.98 degrees.


In [152]:
get_accuracy(knn_errors, y_test)

Accuracy: 51.32 %.


51.32059523809524

In [153]:
knn_y_pred = [int(item) for item in knn_y_pred]
print(metrics.classification_report(y_test, clf_y_pred))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        74
           2       0.00      0.00      0.00        74
           3       0.00      0.00      0.00       107
           4       0.00      0.00      0.00       169
           5       0.04      0.00      0.01       257
           6       0.03      0.00      0.01       203
           7       0.08      0.01      0.02       353
           8       0.17      0.09      0.11       692
           9       0.27      0.35      0.31       972
          10       0.29      0.60      0.39      1099

    accuracy                           0.27      4000
   macro avg       0.09      0.11      0.08      4000
weighted avg       0.19      0.27      0.20      4000

