In [2]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from datetime import datetime
import pickle, json
# NOTE: the star import is kept at its original position so that any names it
# exports keep overriding / being overridden exactly as before.
from utils.helpers import *
import os.path
import torch
from transformers import pipeline
from gensim.models.keyedvectors import KeyedVectors

# Import data

In [3]:
# Read the consolidated feedback and keep only the columns used downstream.
keep_cols = ['country', 'channel', 'Question', 'Text_Eng']
all_feedback = pd.read_csv('../data/all_feedback_consolidated.csv')[keep_cols]

# Split on text length: texts of 10 characters or fewer go to `short_text`
# (tagged as Others by default later on); longer texts stay in `all_feedback`.
# NOTE(review): a missing Text_Eng has a NaN length, fails both comparisons and
# therefore ends up in neither frame — confirm that dropping those rows is intended.
text_len = all_feedback['Text_Eng'].str.len()
short_text = all_feedback[text_len <= 10].reset_index(drop=True)
all_feedback = all_feedback[text_len > 10].reset_index(drop=True)

In [4]:
# Rename the text column by name rather than overwriting every column label by
# position: a positional overwrite silently mislabels data if the selected
# columns or their order ever change. The other three columns keep their names.
all_feedback = all_feedback.rename(columns={'Text_Eng': 'sequence'})

In [5]:
# Sanity check: (rows, columns) of the feedback kept after removing short texts.
all_feedback.shape

(10159, 4)

# Import models

In [6]:
# Load the saved word-vector model from disk; it is passed to preprocess_w2v in later cells.
w2v = KeyedVectors.load('../models/word2vec')

In [7]:
# Load the trained classifier used for the category predictions below.
# The context manager closes the file handle, which the bare open() call never did.
# NOTE: pickle.load can execute arbitrary code contained in the file — only load
# model files that come from a trusted source.
with open('../models/w2v_ovr_svc.sav', 'rb') as model_file:
    w2v_ovr_svc = pickle.load(model_file)

In [8]:
# Zero-shot classification pipeline; a later cell uses it to derive sentiment labels.
# Run on the first CUDA device when one is available and fall back to CPU (-1)
# otherwise, so the notebook no longer fails on machines without a GPU.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=0 if torch.cuda.is_available() else -1,
    framework='pt',
)

In [1]:
# Vectorize the full corpus with the loaded word2vec model.
# NOTE(review): preprocess_w2v is not defined in this notebook; presumably it comes
# from `from utils.helpers import *`. A later cell drops a `sequence_clean` column
# from a frame derived from all_feedback, so this helper presumably adds that
# column to all_feedback in place — TODO confirm.
text_vectorized = preprocess_w2v(all_feedback, w2v)

In [11]:
# Category names, in the column order applied to the classifier output below.
# Kept as a list because it is also used as a DataFrame column selector.
labels = [
    'communication',
    'waiting time',
    'information',
    'user interface',
    'facilities',
    'location',
    'price',
]

# Holdout set evaluation

In [12]:
# Load the holdout set; a later cell reads its `labels` columns as the ground truth
# for the classification report.
holdout = pd.read_csv('../data/holdout.csv')

In [2]:
# Vectorize the holdout set with the same helper and word2vec model as the full corpus.
# NOTE(review): preprocess_w2v is not defined in this notebook (presumably from
# utils.helpers); which text column of `holdout` it reads is not visible here — TODO confirm.
holdout_vectorized = preprocess_w2v(holdout, w2v)

In [14]:
# Predict the category columns for the holdout set.
# NOTE(review): indexing with range(300) presumably selects the 300 embedding
# columns (0..299) of the vectorized frame — TODO confirm against preprocess_w2v.
# Despite the `prob` in its name, this frame holds whatever .predict() returns
# (0/1 indicators in the displayed output).
holdout_features = holdout_vectorized[range(300)]
holdout_pred = w2v_ovr_svc.predict(holdout_features)
holdout_prob_df = pd.DataFrame(holdout_pred, columns=labels)
holdout_prob_df.head()

Unnamed: 0,communication,waiting time,information,user interface,facilities,location,price
0,0,0,1,1,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,1
4,0,0,1,0,0,0,0


In [15]:
# Per-label precision / recall / F1 of the holdout predictions against the holdout labels.
# NOTE(review): classification_report is not imported explicitly in this notebook;
# presumably it is provided by `from utils.helpers import *` — TODO confirm.
holdout_report = classification_report(
    holdout[labels],
    holdout_prob_df,
    target_names=labels,
    zero_division=0,
)
print(holdout_report)

                precision    recall  f1-score   support

 communication       1.00      0.50      0.67        10
  waiting time       1.00      0.38      0.56        13
   information       0.71      0.71      0.71        17
user interface       0.62      1.00      0.77         5
    facilities       1.00      0.17      0.29         6
      location       0.50      0.67      0.57         3
         price       0.67      1.00      0.80         4

     micro avg       0.74      0.59      0.65        58
     macro avg       0.79      0.63      0.62        58
  weighted avg       0.83      0.59      0.63        58
   samples avg       0.70      0.67      0.65        58



# Make predictions using trained SVC model 

In [16]:
# Predict the category columns for the full corpus.
# NOTE(review): indexing with range(300) presumably selects the 300 embedding
# columns (0..299) of the vectorized frame — TODO confirm against preprocess_w2v.
# Despite the `prob` in its name, this frame holds whatever .predict() returns
# (0/1 indicators in the displayed output).
corpus_features = text_vectorized[range(300)]
corpus_pred = w2v_ovr_svc.predict(corpus_features)
svc_prob_df = pd.DataFrame(corpus_pred, columns=labels)
svc_prob_df.head()

Unnamed: 0,communication,waiting time,information,user interface,facilities,location,price
0,0,0,1,1,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,1,0,0,0,1
4,0,0,1,0,0,0,0


In [17]:
# Attach the predictions to the feedback rows (index-aligned join) and flag the
# texts that were not assigned to any category: others = 1 when every label
# column is 0, otherwise 0.
svc_predictions = all_feedback.join(svc_prob_df)
strongest_label = svc_predictions[labels].max(axis=1)
svc_predictions['others'] = (strongest_label == 0).astype('int64')

In [18]:
# How many texts ended up with a category (0) versus tagged as Others (1).
svc_predictions['others'].value_counts()

0    8938
1    1221
Name: others, dtype: int64

# Add back short text rows

In [22]:
# Remove the intermediate `sequence_clean` column before the short-text rows are
# appended (presumably so both frames share the same columns — TODO confirm where
# the column is created; it is not added anywhere in the visible cells).
# errors='ignore' keeps the cell re-runnable: without it a second run, or a run
# where the column was never created, raises KeyError.
svc_predictions = svc_predictions.drop(columns=['sequence_clean'], errors='ignore')

In [23]:
# Short texts get no category by default: every label column is 0 and others is 1.
default_tags = {**dict.fromkeys(labels, 0), 'others': 1}
short_text = short_text.assign(**default_tags)

In [24]:
# Align the text column name with the main frame ('Text_Eng' -> 'sequence').
# Renaming by name replaces a positional overwrite of all twelve column labels,
# which duplicated the contents of `labels` and would silently mislabel columns
# if that list or the column order ever changed. All other columns keep their names.
short_text = short_text.rename(columns={'Text_Eng': 'sequence'})

In [25]:
# Stack the classified rows and the short-text rows into one frame with a fresh 0..n-1 index.
final_dataframe = pd.concat([svc_predictions, short_text], ignore_index=True)

# Get sentiment labels

In [26]:
def get_sentiment_label_facebook(list_of_sent_dicts):
    """Convert one zero-shot classification result into a binary sentiment flag.

    Args:
        list_of_sent_dicts: Despite the name, a single mapping with a 'labels'
            sequence; only its first entry is inspected (presumably the
            highest-scoring label of the pipeline output — TODO confirm).

    Returns:
        0 when the first label is 'negative', 1 for any other first label.
    """
    top_label = list_of_sent_dicts['labels'][0]
    return 0 if top_label == 'negative' else 1

In [3]:
# Run the zero-shot classifier on every text (one call per row) and store the
# binary sentiment flag: 0 when 'negative' is the first returned label, else 1.
final_dataframe['sentiment'] = final_dataframe['sequence'].map(
    lambda text: get_sentiment_label_facebook(
        classifier(
            text,
            candidate_labels=['positive', 'negative'],
            hypothesis_template='The sentiment of this is {}',
        )
    )
)

In [28]:
# Persist the combined category + sentiment predictions; the index is not written.
final_dataframe.to_csv('../data/final_outputs.csv', index=False)