# BERT MODEL

### Libraries

In [None]:
## Change the runtime to GPU
## Uncomment the below line to install ktrain library

#!pip install ktrain
# import nltk
# nltk.download('stopwords')

In [1]:
## Download and import libraries

import numpy as np
import pandas as pd
import tensorflow as tf
import ktrain
from ktrain import text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Dataset

In [2]:
## Read train and test dataset
train_df=pd.read_excel('/content/P1_training.xlsx')
test_df=pd.read_excel('/content/P1_testing.xlsx')
train_df

Unnamed: 0,sentence,label
0,living in a concentration camp-like atmosphere...,1
1,"there's even a nod to "" the blues brothers , ""...",1
2,"park , lord , and screenwriter karey kirkpatri...",1
3,"ginger is perfect , spunky and opinionated , b...",2
4,jane horrocks delivers a lovely voice characte...,2
...,...,...
1655,"lin shae , who plays mary's neighbor magda ( a...",2
1656,steve martin took an extended vacation from al...,2
1657,much of the book spares tinseltown from mocker...,2
1658,"now , as writer and star of bowfinger , he off...",1


In [3]:
## Splitting training data for validation in BERT model

train_len=len(train_df)
val_df=train_df.iloc[int(0.9*train_len):,:]
train_df=train_df.iloc[:int(0.9*train_len),:]
train_df

Unnamed: 0,sentence,label
0,living in a concentration camp-like atmosphere...,1
1,"there's even a nod to "" the blues brothers , ""...",1
2,"park , lord , and screenwriter karey kirkpatri...",1
3,"ginger is perfect , spunky and opinionated , b...",2
4,jane horrocks delivers a lovely voice characte...,2
...,...,...
1489,"moreover , brosnan has at last settled into th...",2
1490,and though carver's motives are muddy ( when c...,2
1491,"( 1996 ) starring jennifer jason leigh , ron e...",2
1492,"in her directorial debut , bastard out of caro...",2


## BERT MODEL
### Using pretrained BERT model (uncased_L-12_H-768_A-12)

In [4]:
## Preprocess the dataset based on BERT's inbuilt preprocessing
## This cell downloads the pretrained BERT model (uncased_L-12_H-768_A-12)
(X_train,Y_train),(X_val,Y_val),preprocess=text.texts_from_df(train_df=train_df,
                   text_column='sentence',
                   label_columns='label',
                   val_df=val_df,
                   maxlen=100,
                   preprocess_mode='bert')


preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [5]:
## Build the model classifier  
model=text.text_classifier(name='bert',
                           train_data=(X_train,Y_train),
                           preproc=preprocess)
learner=ktrain.get_learner(model=model,
                           train_data=(X_train,Y_train),
                           val_data=(X_val,Y_val),
                           batch_size=32)


Is Multi-Label? False
maxlen is 100
done.


In [6]:
## Fit the BERT model on training data
lr_rate=2e-5
epochs=4
learner.fit_onecycle(lr_rate, epochs) 



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7fd73ee1b7f0>

In [7]:
## Get predictions
predictor=ktrain.get_predictor(learner.model,preprocess)
preds=predictor.predict(list(test_df.sentence.values))
predictions=[]
for i in preds:
    predictions.append(int(i[-1]))


### Results

In [8]:
print (classification_report(predictions,test_df.label.values))

              precision    recall  f1-score   support

           0       0.34      0.39      0.36        72
           1       0.72      0.68      0.70       319
           2       0.72      0.74      0.73       292

    accuracy                           0.67       683
   macro avg       0.59      0.60      0.60       683
weighted avg       0.68      0.67      0.68       683



In [9]:
accuracy_score(predictions,test_df.label.values)

0.6734992679355783

In [10]:
temp_df=pd.read_excel('/content/P1_testing.xlsx')
temp_df['predicted_label']=predictions
temp_df=temp_df.rename(columns={"label":"golden_label"})
temp_df.to_csv("/content/testing_output_Proposed.csv")