# Notebook to Generate BERT Encodings using a pre-trained BERT model

We have used https://pypi.org/project/sentence-transformers/
to generate the embeddings for our training data. Once these embeddings are done we have trained a classfier using Logistic Regression to validate the notion that we can use pre-trained embeddings for the text of the web pages to classify them as Faculty vs Non Faculty. Please note that any classifier (e.g. XGBoost, SVM etc.) can  be trained using these embedding to improve performance over Logistic Regression.


In [3]:
import pandas as pd

In [6]:

negative_data = []
with open('data/classificationData/negative.txt', 'r') as f:
    for line in f:
        data = line.split("#####")
        negative_data.append(data[0].strip())

len(negative_data)

1931

In [7]:
positive_data = []
with open('data/classificationData/positive.txt', 'r') as f:
    for line in f:
        data = line.split("#####")
        positive_data.append(data[0].strip())

len(positive_data)

6521

In [18]:
combined_data_x = []
combined_data_y = []
for data in negative_data:
    combined_data_x.append(data)
    combined_data_y.append(0)

counter = 1
for data in positive_data:
    if counter <= len(negative_data):
        combined_data_x.append(data)
        combined_data_y.append(1)
    else:
        break
    counter += 1
(len(combined_data_x), len(combined_data_y))

(3862, 3862)

In [15]:
df = pd.DataFrame(combined_data, columns =['web-page-text', 'Label'])  

In [11]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig

In [None]:
MODEL_CLASSES = {
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)
}
b

In [19]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [20]:
embeddings = model.encode(combined_data_x)

In [21]:
len(embeddings)

3862

In [22]:
dir(embeddings)

['T',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_finalize__',
 '__array_function__',
 '__array_interface__',
 '__array_prepare__',
 '__array_priority__',
 '__array_struct__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__complex__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__ilshift__',
 '__imatmul__',
 '__imod__',
 '__imul__',
 '__index__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__irshift__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__

In [23]:
embeddings[0]

array([-4.79919106e-01, -1.64380893e-01,  5.04807532e-01, -6.55960143e-01,
       -1.06359690e-01, -3.39308739e-01, -2.05850378e-01, -4.57326114e-01,
        2.28776149e-02, -2.80305475e-01,  3.03139836e-02,  5.55711329e-01,
       -3.34782034e-01,  8.96301150e-01,  6.43501818e-01, -1.44729123e-01,
       -5.18683434e-01, -3.15438747e-01,  4.71709892e-02,  1.54703051e-01,
        4.61170673e-01,  3.92214179e-01,  1.79433122e-01,  1.25565016e+00,
       -9.75942135e-01, -7.91273490e-02,  8.32066238e-02,  1.51156381e-01,
        1.72183231e-01,  1.67126596e-01, -4.75989245e-02, -1.69733167e-01,
        8.06320086e-02,  3.14925879e-01,  5.76777041e-01,  5.76354973e-02,
        8.21184039e-01,  1.11209571e+00,  8.91262472e-01,  4.37721044e-01,
       -5.12461543e-01, -1.57487616e-01,  3.25724006e-01,  1.18725586e+00,
        2.11626172e-01, -7.04340518e-01,  9.76675376e-02, -5.28942108e-01,
       -8.56056809e-01, -4.39239256e-02,  1.34506943e-02, -5.18116295e-01,
        4.50337261e-01, -

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import os
import pickle
import numpy as np

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(embeddings, combined_data_y, test_size = 0.2, random_state=42)
X_train=np.array(X_train)
# Y_train = np.array(Y_train)
Y_train
# (X_train.shape, Y_train.shape)

[0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,


In [38]:
    # fit model no training data
    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, Y_train)

    y_pred = logisticRegr.predict(X_test)
    predictions = [round(value) for value in y_pred]

    print(f"F1:score: {f1_score(Y_test, y_pred, average=None)[0]}")


F1:score: 0.9874055415617129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
import pickle
pickle.dump(embeddings, open('bert-embeddings-for-classification.pkl', 'wb'))

In [41]:
embeddings = pickle.load(open('bert-embeddings-for-classification.pkl', 'rb'))

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(embeddings, combined_data_y, test_size = 0.2, random_state=42)
X_train=np.array(X_train)
    # fit model no training data
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, Y_train)

y_pred = logisticRegr.predict(X_test)
predictions = [round(value) for value in y_pred]

print(f"F1:score: {f1_score(Y_test, y_pred, average=None)[0]}")

F1:score: 0.9874055415617129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
