In [1]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from datetime import datetime
import pickle, json
from utils.helpers import *
import os.path
from transformers import pipeline
from transformers import BertTokenizer, BertModel
import torch

# Import data

In [2]:
# Load the consolidated feedback export and keep only the columns used downstream.
all_feedback = pd.read_csv('../data/all_feedback_consolidated.csv')[['country', 'channel', 'Question', 'Text_Eng']]

# Split on the length of the English text. Texts of 10 characters or fewer are
# set aside (they are tagged as Others by default); longer texts go on to the classifier.
# NOTE(review): rows whose Text_Eng is missing have a NaN length, satisfy neither
# comparison and therefore end up in neither frame - confirm this is intended.
text_length = all_feedback['Text_Eng'].str.len()
short_text = all_feedback[text_length <= 10].reset_index(drop=True)
all_feedback = all_feedback[text_length > 10].reset_index(drop=True)

In [3]:
# The text column is called 'sequence' from here on; the other three columns keep their names.
all_feedback = all_feedback.rename(columns={'Text_Eng': 'sequence'})

In [4]:
# Row/column count of the feedback kept for classification (texts longer than 10 characters).
all_feedback.shape

(10159, 4)

# Import models

In [5]:
# Tokenizer comes from the stock 'bert-base-uncased' vocabulary; encoder weights are
# loaded from the local ../models/finetuned_bert/ checkpoint.
# NOTE(review): the load warning printed by this cell reports bert.pooler.* weights as
# newly initialised. If preprocess_bert reads the pooled output, the vectors would depend
# on random weights and differ between sessions - confirm it uses hidden states instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("../models/finetuned_bert/")

Some weights of the model checkpoint at ../models/finetuned_bert/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../models/finetuned_bert/ and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.

In [6]:
# Load the fitted PCA transformer. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE: pickle executes arbitrary code on load - only use model files from a trusted source.
with open('../models/pca_bert_tuned.sav', 'rb') as pca_file:
    pca = pickle.load(pca_file)

In [7]:
# Load the fitted classifier that is applied to the PCA-reduced vectors. The context
# manager closes the file handle as soon as unpickling finishes.
# NOTE: pickle executes arbitrary code on load - only use model files from a trusted source.
with open('../models/bert_tuned_ovr_svc.sav', 'rb') as svc_file:
    ovr_svc_bert_tuned_pca = pickle.load(svc_file)

In [9]:
# Vectorise every remaining feedback row with the BERT model, using batch_size=1.
# NOTE(review): in the logged run this took over an hour to reach batch 2300
# (21:13 -> 22:26). A larger batch size would likely be faster - first confirm that
# preprocess_bert's output does not depend on the batch size.
all_vectorized = preprocess_bert(model, tokenizer, all_feedback, batch_size=1)

Batch 0 started at 09-03-2022 21:13:33
Batch 100 started at 09-03-2022 21:16:09
Batch 200 started at 09-03-2022 21:19:02
Batch 300 started at 09-03-2022 21:22:28
Batch 400 started at 09-03-2022 21:25:54
Batch 500 started at 09-03-2022 21:29:20
Batch 600 started at 09-03-2022 21:32:46
Batch 700 started at 09-03-2022 21:36:14
Batch 800 started at 09-03-2022 21:39:48
Batch 900 started at 09-03-2022 21:43:15
Batch 1000 started at 09-03-2022 21:46:45
Batch 1100 started at 09-03-2022 21:50:14
Batch 1200 started at 09-03-2022 21:53:41
Batch 1300 started at 09-03-2022 21:57:07
Batch 1400 started at 09-03-2022 21:59:45
Batch 1500 started at 09-03-2022 22:02:20
Batch 1600 started at 09-03-2022 22:04:54
Batch 1700 started at 09-03-2022 22:07:29
Batch 1800 started at 09-03-2022 22:10:05
Batch 1900 started at 09-03-2022 22:12:49
Batch 2000 started at 09-03-2022 22:16:15
Batch 2100 started at 09-03-2022 22:19:43
Batch 2200 started at 09-03-2022 22:23:11
Batch 2300 started at 09-03-2022 22:26:38
Batc

In [11]:
# Save the vectorised feedback to CSV without the row index.
all_vectorized.to_csv('../data/all_bert_tuned_vectorized.csv', index=False)

In [12]:
# Category names, in the order used to name the columns of the classifier's predictions.
labels = [
    'communication',
    'waiting time',
    'information',
    'user interface',
    'facilities',
    'location',
    'price',
]

# Holdout set evaluation

In [13]:
# Labelled holdout set; its `labels` columns are used as ground truth when the
# classifier's predictions are scored.
holdout = pd.read_csv('../data/holdout.csv')

In [14]:
# Vectorise the holdout rows with the same model, tokenizer and batch size as the full feedback set.
holdout_vectorized = preprocess_bert(model, tokenizer, holdout, batch_size=1)

Batch 0 started at 10-03-2022 02:33:42
Shape of bert vectors: (31, 768)
Shape of final dataframe: (31, 776)


In [15]:
# Columns 0..767 hold the BERT vector; only those are passed to the fitted PCA.
holdout_embedding_columns = list(range(768))
holdout_vectorized_pca = pca.transform(holdout_vectorized[holdout_embedding_columns])

In [16]:
# Despite the `prob` in the name, this frame holds the output of `.predict`
# (one column per label), not probabilities.
holdout_predicted = ovr_svc_bert_tuned_pca.predict(holdout_vectorized_pca)
holdout_prob_df = pd.DataFrame(holdout_predicted, columns=labels)
holdout_prob_df.head()

Unnamed: 0,communication,waiting time,information,user interface,facilities,location,price
0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [19]:
# Per-label precision/recall/F1 on the holdout set; zero_division=0 reports 0
# for labels without any predicted positives.
# NOTE(review): classification_report is not imported explicitly in this notebook -
# presumably it arrives through `from utils.helpers import *`; confirm.
holdout_report = classification_report(
    holdout[labels],
    holdout_prob_df,
    target_names=labels,
    zero_division=0,
)
print(holdout_report)

                precision    recall  f1-score   support

 communication       1.00      0.30      0.46        10
  waiting time       0.53      0.62      0.57        13
   information       0.25      0.12      0.16        17
user interface       0.75      0.60      0.67         5
    facilities       0.00      0.00      0.00         6
      location       0.00      0.00      0.00         3
         price       0.67      0.50      0.57         4

     micro avg       0.55      0.31      0.40        58
     macro avg       0.46      0.30      0.35        58
  weighted avg       0.48      0.31      0.35        58
   samples avg       0.43      0.35      0.35        58



# SVC predictions with fine-tuned BERT

In [20]:
# Columns 0..767 hold the BERT vector; only those are passed to the fitted PCA.
all_embedding_columns = list(range(768))
all_vectorized_pca = pca.transform(all_vectorized[all_embedding_columns])

In [23]:
# Despite the `prob` in the name, this frame holds the output of `.predict`
# (one column per label), not probabilities.
svc_predicted = ovr_svc_bert_tuned_pca.predict(all_vectorized_pca)
svc_prob_df = pd.DataFrame(svc_predicted, columns=labels)
svc_prob_df.head()

Unnamed: 0,communication,waiting time,information,user interface,facilities,location,price
0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0


In [24]:
# Attach the per-label predictions to the feedback rows. The join matches on the
# row index, so both frames are expected to share the same 0..n-1 index.
svc_predictions = all_feedback.join(svc_prob_df)

# A text whose largest label value is 0 was not predicted to be in any category:
# tag it as Others (1), otherwise 0.
no_category_predicted = svc_predictions[labels].max(axis=1) == 0
svc_predictions['others'] = no_category_predicted.astype('int64')

In [25]:
# How many classified texts fell back to Others (1) versus received at least one label (0).
svc_predictions['others'].value_counts()

0    8477
1    1682
Name: others, dtype: int64

# Add back short text rows

In [30]:
# Short texts bypass the classifier: every category flag is 0 and Others is 1.
short_text = short_text.assign(**{label: 0 for label in labels}, others=1)

In [31]:
# Only the text column needs a new name so that short_text lines up with
# svc_predictions when the two are concatenated. Renaming by name (rather than
# assigning a full positional list of column names) keeps each category column
# under the label it was created with; the previous positional list gave
# price/location/facilities in a different order from `labels`, which was
# harmless only because every value in those columns is 0.
short_text = short_text.rename(columns={'Text_Eng': 'sequence'})

In [32]:
# Stack the classified rows and the short-text rows into one frame with a fresh 0..n-1 index.
final_dataframe = pd.concat([svc_predictions, short_text], ignore_index=True)

# Get sentiment labels

In [33]:
def get_sentiment_label_facebook(list_of_sent_dicts):
    """Turn one zero-shot classification result into a binary sentiment flag.

    Despite its name, the argument is indexed as a single mapping: only the
    first element of its 'labels' entry is read (presumably the best-scoring
    label - confirm against the pipeline's output format).

    Returns 0 when that label is 'negative' and 1 for anything else.
    """
    top_label = list_of_sent_dicts['labels'][0]
    return 0 if top_label == 'negative' else 1

In [34]:
# Zero-shot classifier used for sentiment scoring.
# Run on the first CUDA device when one is available and fall back to the CPU (-1)
# otherwise, so the notebook also works on machines without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
    framework='pt',
)

In [1]:
def _sentiment_for_text(text):
    """Classify one text as positive/negative and return the binary sentiment flag."""
    zero_shot_result = classifier(
        text,
        candidate_labels=['positive', 'negative'],
        hypothesis_template='The sentiment of this is {}',
    )
    return get_sentiment_label_facebook(zero_shot_result)


# One classifier call per row of the combined frame.
final_dataframe['sentiment'] = final_dataframe['sequence'].apply(_sentiment_for_text)

In [36]:
# Write the combined frame (category flags, others, sentiment) to CSV without the row index.
final_dataframe.to_csv('../data/final_outputs_bert_tuned.csv', index=False)