### Preparation

In [56]:
!pip install transformers

In [57]:
!pip install emoji

In [58]:
!pip install google-play-scraper

In [59]:
import pandas as pd
import numpy as np
# from tqdm.auto import tqdm
import tqdm
import torch
import re
from torch.autograd import Variable
import emoji
import string
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from torch.optim import Adadelta
from sklearn.model_selection import train_test_split
from sklearn import metrics
import torch.nn.functional as F
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import os

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

from wordcloud import WordCloud
from IPython.display import Image

%matplotlib inline
%config InlineBackend.figure_format='retina'

import matplotlib.pyplot as plt
import itertools
import math
from IPython import display

from transformers import AutoConfig, AutoModel

RANDOM_SEED = 42

In [60]:
import random
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and exports ``PYTHONHASHSEED`` to the
    environment.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for child processes; it does not change hashing in the
    # interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # so keep it switched off alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False

seed_everything()

In [61]:
URL_TRAIN = 'https://github.com/google-research/google-research/raw/master/goemotions/data/train.tsv'
URL_DEV = 'https://github.com/google-research/google-research/raw/master/goemotions/data/dev.tsv'
URL_TEST = 'https://github.com/google-research/google-research/raw/master/goemotions/data/test.tsv'

In [62]:
train_df = pd.read_csv(URL_TRAIN, encoding="utf-8", sep="\t", header=None, names=['text', 'labels', '-'])
dev_df = pd.read_csv(URL_DEV, encoding="utf-8", sep="\t", header=None, names=['text', 'labels', '-'])
test_df = pd.read_csv(URL_TEST, encoding="utf-8", sep="\t", header=None, names=['text', 'labels', '-'])

In [63]:
# Fold the dev split into the training data. DataFrame.append was removed in
# pandas 2.0, so use pd.concat. ignore_index=True gives the combined frame a
# unique index; both splits are read with their own 0-based index, and repeated
# labels would make any later label-based drop remove unrelated rows.
train_df = pd.concat([train_df, dev_df], ignore_index=True)

In [64]:
train_df.shape

In [65]:
del train_df['-']
del test_df['-']

In [66]:
# Character-length distribution of the raw training texts.
lengths = train_df['text'].map(len)
items = lengths.tolist()
sns.histplot(items)
plt.xlim([0, 512]);

In [67]:
len(train_df), len(test_df)

In [68]:
train_df.head(5)

In [69]:
# Split the comma-separated label ids into a list and record how many labels each row has.
train_df['labels list'] = train_df['labels'].str.split(',')
train_df['labels length'] = train_df['labels list'].map(len)

In [70]:
train_df

In [71]:
# Same label parsing as for the training frame, applied to the test split.
test_df['labels list'] = test_df['labels'].str.split(',')
test_df['labels length'] = test_df['labels list'].map(len)

In [72]:
# Fine-grained emotion names. The position of a name in this list is the
# numeric label id: idx2class looks names up with EMOTIONS_LABELS[int(id)],
# so the order must not be changed.
EMOTIONS_LABELS = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral'
]

In [73]:
# Coarse category -> list of fine-grained emotion names that belong to it.
# reduce_emotions uses this to collapse the fine-grained labels into these
# seven categories; the dict keys later become the per-category indicator columns.
ekman_mapping = {
  "anger": ["anger", "annoyance", "disapproval"],
  "disgust": ["disgust"],
  "fear": ["fear", "nervousness"],
  "joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
  "sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
  "surprise": ["surprise", "realization", "confusion", "curiosity"],
  "neutral": ["neutral"]
}

In [74]:
def idx2class(idx_list):
    """Translate label ids into emotion names.

    Args:
        idx_list: Iterable of label ids; each is passed through ``int`` before
            being used as a position in ``EMOTIONS_LABELS``.

    Returns:
        List of emotion names in the same order as ``idx_list``.
    """
    return [EMOTIONS_LABELS[int(label_id)] for label_id in idx_list]

In [75]:
train_df['emotions'] = train_df['labels list'].apply(idx2class)
test_df['emotions'] = test_df['labels list'].apply(idx2class)

In [76]:
train_df_more_labels = train_df[train_df['labels list'].apply(lambda x: len(x)>1)]

In [77]:
train_df_more_labels.iloc[11].text

### DF Operations

In [78]:
# Shuffle all training rows. Passing the seed explicitly makes this cell give
# the same order every time it is run, independent of how much global random
# state earlier cells have consumed.
sampled_train_df = train_df.sample(frac=1, random_state=RANDOM_SEED)

In [79]:
len(train_df), len(sampled_train_df)

In [80]:
def reduce_emotions(emotion_list):
    """Map fine-grained emotion names onto their coarse ``ekman_mapping`` category.

    Args:
        emotion_list: Iterable of fine-grained emotion names.

    Returns:
        List with one coarse category per recognised input emotion, in input
        order. Duplicates are kept (two 'joy'-family inputs yield
        ``['joy', 'joy']``); names matching no category contribute nothing.
    """
    # Categories are checked in this fixed order for every input emotion.
    grouped_categories = ('anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise')

    map_list = []
    for fine_emotion in emotion_list:
        map_list.extend(
            category
            for category in grouped_categories
            if fine_emotion in ekman_mapping[category]
        )
        # 'neutral' is matched by name rather than through the mapping.
        if fine_emotion == 'neutral':
            map_list.append('neutral')

    return map_list

In [81]:
sampled_train_df['emotions'] = sampled_train_df['emotions'].apply(reduce_emotions)
test_df['emotions'] = test_df['emotions'].apply(reduce_emotions)

In [82]:
sampled_train_df

In [83]:
sampled_train_df.iloc[1].text

In [84]:
ekman_emotions = list(ekman_mapping.keys())
ekman_emotions

In [85]:
# One 0/1 indicator column per coarse category, derived from each row's 'emotions' list.
for emotion in ekman_emotions:
    sampled_train_df[emotion] = sampled_train_df['emotions'].map(lambda tags: int(emotion in tags))
    test_df[emotion] = test_df['emotions'].map(lambda tags: int(emotion in tags))

In [86]:
# Keep only rows tagged with neither 'neutral' nor 'disgust'.
# Filter with a boolean mask instead of drop(<index labels>): a label-based drop
# removes every row sharing an index label, which silently deletes unrelated
# rows whenever the index is not unique. .copy() gives independent frames so
# later column assignments do not act on a slice.
sampled_train_df = sampled_train_df[
    (sampled_train_df['neutral'] != 1) & (sampled_train_df['disgust'] != 1)
].copy()
test_df = test_df[(test_df['neutral'] != 1) & (test_df['disgust'] != 1)].copy()

In [87]:
sampled_train_df.head(10)

In [88]:
test_df

In [89]:
# The 'neutral' and 'disgust' indicator columns are no longer needed.
sampled_train_df = sampled_train_df.drop(columns=['neutral', 'disgust'])
test_df = test_df.drop(columns=['neutral', 'disgust'])

In [90]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [91]:
sampled_train_df[['anger', 'fear', 'joy', 'sadness', 'surprise']].sum().sort_values().plot(kind="barh");

In [92]:
def clean_text(text):
    '''Normalise raw text before contraction handling.

    Steps, in order: rewrite emoji with ``emoji.demojize`` and delete every
    ``:...:`` span, lowercase, delete ``[...]`` spans, delete URLs, delete
    ``<...>`` tags, delete newlines, delete any word containing a digit, and
    finally collapse every run of characters other than letters, ``? . ! ,``,
    ``¿`` and ``'`` into a single space.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string (may carry leading/trailing spaces).
    '''
    text = emoji.demojize(text)
    # NOTE(review): this deletes *any* text between two colons on a line,
    # not only emoji aliases.
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()    # Making text lowercase
    # Patterns below are raw strings: '\[', '\S' and '\w' are invalid escape
    # sequences in a normal string literal and trigger warnings on recent Pythons.
    text = re.sub(r'\[.*?\]', '', text)
    # The next 2 lines remove links and html tags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines are removed without inserting a space, so words on either side are joined.
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "'")
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text

def clean_contractions(text, mapping):
    '''Expand contractions listed in ``mapping`` and strip ASCII punctuation.

    Args:
        text: Input string.
        mapping: Dict of contraction -> expansion, applied as plain substring
            replacements in the dict's iteration order.

    Returns:
        The text with apostrophe variants unified, contractions expanded, every
        ``string.punctuation`` character removed, ``¿`` padded with spaces and
        runs of spaces/double quotes collapsed to one space.
    '''
    # Unify typographic apostrophes and backticks so the mapping keys can match.
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")
    for contraction, expansion in mapping.items():
        text = text.replace(contraction, expansion)
    # Remove punctuation
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = re.sub(punctuation_class, '', text)
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # (ASCII punctuation is already gone at this point, so only "¿" can still match.)
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    return re.sub(r'[" "]+', " ", text)

def clean_special_chars(text, punct, mapping):
    '''Normalise special characters and pad punctuation with spaces.

    Args:
        text: Input string.
        punct: Iterable of characters; every occurrence is surrounded by spaces.
        mapping: Dict of character -> replacement, applied before the padding.

    Returns:
        The transformed string.
    '''
    for source_char, replacement in mapping.items():
        text = text.replace(source_char, replacement)

    for symbol in punct:
        text = text.replace(symbol, f' {symbol} ')

    # Final fixed clean-up, applied in this order.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for needle, substitute in leftovers:
        text = text.replace(needle, substitute)

    return text

def correct_spelling(x, dic):
    '''Apply every ``wrong -> right`` substring replacement from ``dic`` to ``x``.

    Replacements run sequentially in the dict's iteration order, so a later
    entry also sees text produced by an earlier one.
    '''
    for wrong, right in dic.items():
        x = x.replace(wrong, right)
    return x

def remove_space(text):
    '''Trim the text and collapse every run of whitespace into a single space.'''
    return " ".join(text.strip().split())

def text_preprocessing_pipeline(text):
    '''Run the full cleaning chain on one string.

    Order: clean_text -> clean_contractions -> clean_special_chars ->
    correct_spelling -> remove_space, using the notebook-level lookup tables
    (contraction_mapping, punct, punct_mapping, mispell_dict).
    '''
    cleaned = clean_contractions(clean_text(text), contraction_mapping)
    cleaned = clean_special_chars(cleaned, punct, punct_mapping)
    return remove_space(correct_spelling(cleaned, mispell_dict))

In [93]:
sampled_train_df['text'] = sampled_train_df['text'].apply(text_preprocessing_pipeline)
test_df['text'] = test_df['text'].apply(text_preprocessing_pipeline)

In [94]:
print(sampled_train_df.shape)
print(test_df.shape)

In [95]:
target_cols = [
 'anger',
 'fear',
 'joy',
 'sadness',
 'surprise']

In [96]:
sampled_train_df = sampled_train_df[['text', *target_cols]]

In [97]:
sampled_train_df

In [98]:
# The text-cleaning helpers (clean_text, clean_contractions, clean_special_chars,
# correct_spelling, remove_space, text_preprocessing_pipeline) are defined once,
# in the preprocessing cell earlier in this notebook. They are deliberately not
# redefined here: a second copy silently shadows the first, and the two can
# drift apart so that training and inference text get cleaned differently.
#
# Smoke check that the shared pipeline is in scope and behaves as expected.
assert text_preprocessing_pipeline("  Hello   WORLD  ") == "hello world"

In [99]:
# Character-length distribution of the cleaned training texts.
lengths = sampled_train_df['text'].map(len)
items = lengths.tolist()
sns.histplot(items)
plt.xlim([0, 512]);

In [100]:
# Training / tokenisation hyper-parameters used by the cells below.
MAX_LEN = 512  # max_length handed to the tokenizer for every encoded text
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
TEST_BATCH_SIZE = 8  # batch size of the test DataLoader
EPOCHS = 5  # number of full passes made by the training loop
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer
# NOTE(review): truncation is requested again on each encode call further down;
# presumably the truncation=True passed here has no effect at load time — confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, do_lower_case=True)

In [101]:
# Token-count distribution of the texts the model is actually trained on.
# Measured on the cleaned, filtered frame (sampled_train_df) rather than the raw
# train_df, so the histogram reflects what is fed to the tokenizer at train time.
token_counts = [
    len(tokenizer.encode(text, max_length=MAX_LEN, truncation=True))
    for text in sampled_train_df["text"]
]
sns.histplot(token_counts)
plt.xlim([0, 512]);

In [102]:
sampled_train_df

### Dataset & Data Loaders

In [103]:
class CustomDataset(Dataset):
    """Torch dataset yielding tokenised text plus multi-label float targets.

    Args:
        dataframe: Frame with a ``text`` column and one column per target label.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes the BERT tokenizer).
        max_len: Length every encoded sequence is truncated/padded to.
        target_columns: Names of the label columns. Defaults to the
            notebook-level ``target_cols`` when omitted.

    Each item is a dict with ``ids``, ``mask`` and ``token_type_ids`` (long
    tensors) and ``targets`` (float tensor, one value per target column).
    """

    def __init__(self, dataframe, tokenizer, max_len, target_columns=None):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        # Fall back to the notebook-level list so existing three-argument calls keep working.
        columns = target_cols if target_columns is None else list(target_columns)
        self.targets = dataframe[columns]
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])

        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        # Convert the label row to a plain float32 array before building the
        # tensor, instead of handing torch a pandas Series.
        target_row = self.targets.iloc[index].to_numpy(dtype='float32')

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(target_row, dtype=torch.float)
        }

In [104]:
train_dataset = CustomDataset(
  sampled_train_df,
  tokenizer,
  max_len=MAX_LEN
)

test_dataset = CustomDataset(
  test_df,
  tokenizer,
  max_len=MAX_LEN
)

In [105]:
# DataLoader settings: training batches are reshuffled, test batches keep the
# dataset order; both load in the main process (num_workers=0).
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

# Batch iterators consumed by train() and validation() respectively.
train_loader = DataLoader(train_dataset, **train_params)
test_loader = DataLoader(test_dataset, **test_params)

In [106]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Model

In [107]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by one linear layer that emits a logit per label.

    Args:
        num_labels: Number of output logits (defaults to 5, the number of
            target emotions used in this notebook).
        pretrained_name: Name of the pretrained BERT checkpoint to load.

    ``forward`` returns raw logits; no sigmoid is applied inside the model.
    """

    def __init__(self, num_labels=5, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = BertModel.from_pretrained(pretrained_name)
        # Size the head from the encoder's own config rather than hard-coding
        # 768, so the class keeps working if a different checkpoint is loaded.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        # With return_dict=False the encoder returns a tuple; the second
        # element is the one fed to the classification head.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l3(pooled)

model = BERTClass()
model.to(device)

In [108]:
import random
import time

loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [109]:
def train(epoch):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Args:
        epoch: Zero-based epoch number; only used in the progress message.

    Prints the current batch loss on every 5000th step (including step 0).
    """
    model.train()
    for step, batch in enumerate(train_loader):
        ids, mask, token_type_ids = (
            batch[key].to(device, dtype=torch.long)
            for key in ('ids', 'mask', 'token_type_ids')
        )
        targets = batch['targets'].to(device, dtype=torch.float)

        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)

        if step % 5000 == 0:
            print(f'Epoch: {epoch + 1}, Loss:  {loss.item()}')

        # Backpropagate, apply the update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

### Start Training

In [110]:
for epoch in range(EPOCHS):
    train(epoch)

In [111]:
def validation(epoch):
    """Score every batch of ``test_loader`` with ``model`` in eval mode.

    Args:
        epoch: Accepted for symmetry with ``train``; not used.

    Returns:
        ``(probabilities, targets)``: two lists of per-sample lists, where the
        probabilities are the sigmoid of the model outputs.
    """
    model.eval()
    collected_probs, collected_targets = [], []
    with torch.no_grad():
        for batch in test_loader:
            ids, mask, token_type_ids = (
                batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids')
            )
            targets = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            collected_targets += targets.cpu().detach().numpy().tolist()
            collected_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return collected_probs, collected_targets

In [112]:
# for epoch in range(EPOCHS):
outputs, targets = validation(1) # epoch
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [113]:
from google_play_scraper import Sort, reviews, app

In [114]:
app_packages = [
#   'com.todoist',
#   'com.tasks.android',
#   'com.appxy.planner',
  'trendyol.com'
]

In [115]:
# Code from https://github.com/JoMingyu/google-play-scraper
# Code from https://github.com/JoMingyu/google-play-scraper
app_infos = []

for ap in tqdm.tqdm(app_packages):
    info = app(ap,
               lang='en', # defaults to 'en'
               country='us') # defaults to 'us'
    # The 'comments' entry is not wanted in the summary. pop() with a default
    # keeps the loop running if a response does not carry that key, where
    # `del` would raise KeyError.
    info.pop('comments', None)
    app_infos.append(info)

In [116]:
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter
import json

def print_json(json_object):
    """Print ``json_object`` as indented, key-sorted, syntax-highlighted JSON.

    Values that ``json`` cannot serialise are rendered with ``str``.
    """
    rendered = json.dumps(json_object, indent=2, sort_keys=True, default=str)
    print(highlight(rendered, JsonLexer(), TerminalFormatter()))

In [117]:
print_json(app_infos[0])

In [118]:
app_infos_df = pd.DataFrame(app_infos)

In [119]:
app_infos_df

In [120]:
app_reviews = []

# For each app, fetch reviews per star rating (1-5) under both sort orders.
# The 3-star bucket asks for twice as many reviews as the others.
for ap in tqdm.tqdm(app_packages):
    for score in range(1, 6):
        for sort_order in (Sort.MOST_RELEVANT, Sort.NEWEST):
            order_label = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
            rvs, _ = reviews(
                ap,
                lang='en', # defaults to 'en'
                country='us',# defaults to 'us'
                sort=sort_order,
                count=200 if score == 3 else 100,
                filter_score_with=score # defaults to None(means all score)
            )
            # Tag every review with where it came from before collecting it.
            for r in rvs:
                r['sortOrder'] = order_label
                r['appId'] = ap
            app_reviews.extend(rvs)

In [121]:
print_json(app_reviews[0])

In [122]:
len(app_reviews)

In [123]:
app_reviews_df = pd.DataFrame(app_reviews)

In [124]:
app_reviews_df

In [125]:
app_reviews_df['text'] = app_reviews_df['content'].apply(text_preprocessing_pipeline)

In [126]:
# Keep only the cleaned text. .copy() makes this an independent frame, so the
# prediction columns added in later cells are not assigned onto a slice
# (which triggers pandas' SettingWithCopyWarning).
app_reviews_df = app_reviews_df[['text']].copy()

In [127]:
# Predict a 0/1 vector (one entry per target emotion) for every scraped review.
reviews_predictions = []

# Inference only: make sure the model is in eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
    for review in app_reviews_df['text']:
        encoded_review = tokenizer.encode_plus(
            review,
            max_length=MAX_LEN,
            # Request truncation explicitly so reviews longer than MAX_LEN tokens
            # are cut to size instead of relying on the tokenizer's fallback.
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)
        token_type_ids = encoded_review['token_type_ids'].to(device, dtype=torch.long)

        output = model(input_ids, attention_mask, token_type_ids)
        # Threshold the sigmoid probabilities at 0.5 to get one 0/1 flag per emotion.
        prediction = np.where(torch.sigmoid(output).cpu().detach().numpy() >= 0.5, 1, 0).flatten()
        reviews_predictions.append(prediction)

In [128]:
app_reviews_df['predictions'] = reviews_predictions

In [129]:
def ohe_to_labels(idx_list):
    """Convert a 0/1 indicator sequence into the matching ``target_cols`` names.

    Args:
        idx_list: Sequence of flags aligned position-by-position with ``target_cols``.

    Returns:
        Names of the target columns whose flag is truthy, in column order.
    """
    return [target_cols[position] for position, flag in enumerate(idx_list) if flag]

In [130]:
ohe_to_labels([1, 0, 0, 1])

In [131]:
app_reviews_df['emotions labels'] = app_reviews_df['predictions'].apply(ohe_to_labels)

In [132]:
app_reviews_df.head()

### Testing Our Own Text

In [133]:
lbs_counts_more_than_1 = app_reviews_df[app_reviews_df['emotions labels'].apply(lambda x: len(x) > 1)]

In [134]:
lbs_counts_more_than_1

In [140]:
def get_emotions_labels(text):
    """Predict emotion labels for a single piece of free text.

    Args:
        text: Raw input string.

    Returns:
        Tuple ``(text, labels, output, probs)``: the original input text, the
        predicted emotion names, the raw model logits, and the flattened 0/1
        array obtained by thresholding the sigmoid of the logits at 0.5
        (despite its name, ``probs`` holds the thresholded flags).
    """
    # Clean the text with the same pipeline applied to the training data, so
    # the model sees input in the form it was trained on.
    cleaned_text = text_preprocessing_pipeline(text)

    encoded_review = tokenizer.encode_plus(
        cleaned_text,
        max_length=MAX_LEN,
        # Request truncation explicitly so over-long input is cut to MAX_LEN tokens.
        truncation=True,
        add_special_tokens=True,
        return_token_type_ids=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    token_type_ids = encoded_review['token_type_ids'].to(device, dtype=torch.long)

    # Inference only: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, attention_mask, token_type_ids)

    probs = np.where(torch.sigmoid(output).cpu().detach().numpy() >= 0.5, 1, 0).flatten()
    labels = ohe_to_labels(probs)
    return (text, labels, output, probs)

get_emotions_labels('Greate produts but uses too much energy unfortunately')