<a href="https://colab.research.google.com/github/hosein-jamshidian/Sentiment_Analyses_SnappFood/blob/main/Sentiment%20Analysis_SnappFood_Keras%20implement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

## read dataset

In [None]:
!unzip /content/DL-HW3.zip
!rm /content/DL-HW3.zip
!rm /content/DL-HW3/DL-HW3-Description.pdf

In [None]:
# Load the three tab-separated splits; the first column of each file is the index.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, sep='\t', index_col=0)
    for csv_path in (
        '/content/DL-HW3/Snappfood-Dataset/train.csv',
        '/content/DL-HW3/Snappfood-Dataset/dev.csv',
        '/content/DL-HW3/Snappfood-Dataset/test.csv',
    )
)

In [None]:
# Number of training rows flagged as duplicates by DataFrame.duplicated().
train_df.duplicated().sum()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Class balance: how many training rows carry each value of the 'label' column.
train_df['label'].value_counts()

## visualization

In [None]:
! pip install hazm
import hazm

In [None]:
# Length of every training comment, measured in hazm word tokens.
train_df['sent_len'] = train_df['comment'].map(
    lambda comment: len(hazm.word_tokenize(comment))
)

In [None]:
# Comment-length distribution of the training set: overall histogram (left)
# and per-label boxen plot (right). Explicit axes plus titles/labels keep this
# figure distinguishable from the later plot of the capped lengths.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(train_df['sent_len'], bins=100)
ax_hist.set(
    title='Comment length (all tokens)',
    xlabel='tokens per comment',
    ylabel='number of comments',
)

sns.boxenplot(x="label", y="sent_len", data=train_df, ax=ax_box)
ax_box.set(
    title='Comment length by label (all tokens)',
    xlabel='label',
    ylabel='tokens per comment',
)

fig.tight_layout()
plt.show()

In [None]:
# Recompute comment lengths, capping each one at COMMENT_MAX_LEN tokens.
COMMENT_MAX_LEN = 50
train_df['sent_len'] = train_df['comment'].map(
    lambda comment: min(len(hazm.word_tokenize(comment)), COMMENT_MAX_LEN)
)

In [None]:
# Same two panels as the raw-length figure, now for lengths capped at
# COMMENT_MAX_LEN. Titles/labels state the cap so the two figures can be told apart.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

ax_hist.hist(train_df['sent_len'], bins=100)
ax_hist.set(
    title=f'Comment length (capped at {COMMENT_MAX_LEN} tokens)',
    xlabel='tokens per comment',
    ylabel='number of comments',
)

sns.boxenplot(x="label", y="sent_len", data=train_df, ax=ax_box)
ax_box.set(
    title=f'Comment length by label (capped at {COMMENT_MAX_LEN} tokens)',
    xlabel='label',
    ylabel='tokens per comment',
)

fig.tight_layout()
plt.show()

### remove comments with 3 or fewer words

In [None]:
# Keep only comments with more than 3 tokens (train and validation splits).
# A boolean mask selects rows by position; the previous drop(<index labels>)
# would also remove long comments whenever an index label is not unique.
train_df = train_df[train_df['sent_len'] > 3].reset_index(drop=True)

# The validation frame has no 'sent_len' column yet, so compute it first.
val_df['sent_len'] = val_df['comment'].apply(lambda x: len(hazm.word_tokenize(x)))
val_df = val_df[val_df['sent_len'] > 3].reset_index(drop=True)

## preprocessing and clean comments

In [None]:
!pip install finglish

In [None]:
import nltk
import hazm
from finglish import f2p
import re

In [None]:
# NOTE(review): `hazm_normalizer` is not referenced again in the visible notebook;
# the normalization cell further down creates its own `normalizer` instance.
hazm_normalizer=hazm.Normalizer()

In [None]:
# Emoji, pictographs, directional/zero-width marks and other non-text symbols
# stripped by `cleaning`. Compiled once here instead of on every call.
_NOISE_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+",
    flags=re.UNICODE,
)


def cleaning(sent):
    """Strip noise characters from one comment and collapse its whitespace.

    Steps, in order:
      1. remove every character matched by ``_NOISE_PATTERN``;
      2. if the remaining text starts with an ASCII letter, pass the whole
         text through ``f2p`` (Finglish -> Persian transliteration);
      3. remove ``#`` characters;
      4. replace each run of whitespace with a single space.

    Args:
        sent: the comment text (``str``).

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    sent = _NOISE_PATTERN.sub('', sent)

    # re.match already anchors at the start of the string.
    if re.match(r'[a-zA-Z]', sent):
        sent = f2p(sent)

    sent = sent.replace("#", "")
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    sent = re.sub(r"\s+", " ", sent)

    return sent

In [None]:
# import string
# lemmatizer=hazm.Lemmatizer()
# def tokenize(sent):
#     clean=[lemmatizer.lemmatize(word).split("#")[0] for word in hazm.word_tokenize(sent) if (word not in string.punctuation + "٬" + "،")]
#     return ' '.join(clean)

In [None]:
normalizer = hazm.Normalizer()

# Run the same hazm normalizer over the comment column of every split.
for split_df in (train_df, val_df, test_df):
    split_df['comment'] = split_df['comment'].apply(normalizer.normalize)

In [None]:
# Apply `cleaning` to the comment column of every split.
for split_df in (train_df, val_df, test_df):
    split_df['comment'] = split_df['comment'].map(cleaning)

In [None]:
# Cast the id/label/comment columns of every split to fixed dtypes.
column_types = {'label_id': 'int', 'label': 'str', 'comment': 'str'}
train_df, val_df, test_df = (
    split_df.astype(column_types) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Keep only the two columns used from here on: the text and its numeric label.
model_columns = ['comment', 'label_id']
train_df = train_df.loc[:, model_columns]
val_df = val_df.loc[:, model_columns]
test_df = test_df.loc[:, model_columns]

In [None]:
# Turn each split into plain Python lists: texts (x_*) and label ids (y_*).
x_train = train_df['comment'].tolist()
y_train = train_df['label_id'].tolist()

x_val = val_df['comment'].tolist()
y_val = val_df['label_id'].tolist()

x_test = test_df['comment'].tolist()
y_test = test_df['label_id'].tolist()

## BERT required libraries

In [None]:
!pip install transformers

In [None]:
from transformers import BertConfig, BertTokenizer,BertModel,AdamW,get_linear_schedule_with_warmup
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import string
import json

from tqdm.notebook import tqdm

## configuration

In [None]:
# Detect CUDA once and derive both the torch device and the status message from it.
# NOTE(review): `device` / `train_on_gpu` are not referenced by the visible
# TensorFlow training cells — confirm whether they are still needed.
train_on_gpu = torch.cuda.is_available()

device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

In [None]:
MAX_LEN = 50  # presumably the maximum token length of a model input — TODO confirm



EEVERY_EPOCH = 1000  # NOTE(review): not referenced in the visible cells; name looks like a typo of EVERY_EPOCH — confirm

CLIP = 0.0  # NOTE(review): not referenced in the visible cells; presumably a gradient-clipping value — confirm

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'  # checkpoint id handed to the tokenizer/config/model loaders
OUTPUT_PATH = '/content/model.bin'  # only its parent directory is used by the visible cells

# Create the parent directory of OUTPUT_PATH if it does not exist yet.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Class-name <-> class-id lookup tables; id2label is the exact inverse of label2id.
label2id = {'SAD': 1, 'HAPPY': 0}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

for table_name, table in (('label2id', label2id), ('id2label', id2label)):
    print(f'{table_name}: {table}')

## calling for bert tokenizer

In [None]:
# Load the pretrained tokenizer and a config that carries the label mappings.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH,
    label2id=label2id,
    id2label=id2label,
)
print(config.to_json_string())

## build input examples and convert them to input IDs, attention masks and token type IDs

In [None]:
class InputExample:
    """Plain container for one (optionally paired, optionally labelled) text example.

    Attributes:
        guid: identifier of the example.
        text_a: first text.
        text_b: second text, or None.
        label: label of the example, or None.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        # Store the constructor arguments unchanged, in declaration order.
        self.guid, self.text_a, self.text_b, self.label = guid, text_a, text_b, label


def make_examples(tokenizer, x, y=None, maxlen=50, output_mode="classification", is_tf_dataset=True):
    """Convert texts (and optional labels) into tokenized model inputs.

    Each item of ``x`` is wrapped in an ``InputExample`` and the whole list is
    passed to ``glue_convert_examples_to_features``.

    Args:
        tokenizer: tokenizer forwarded to ``glue_convert_examples_to_features``.
        x: sequence of inputs; each item is either a string (single text) or a
            2-item sequence ``(text_a, text_b)``.
        y: ``list`` or ``np.ndarray`` of labels aligned with ``x``. Any other
            value (including the default ``None``) means "unlabeled".
        maxlen: maximum length forwarded to the feature converter.
        output_mode: forwarded to the feature converter.
        is_tf_dataset: when True build a ``tf.data.Dataset``; otherwise return
            arrays/lists.

    Returns:
        ``(dataset, features)`` when ``is_tf_dataset`` is True, where
        ``dataset`` yields ``(inputs_dict, label)`` pairs — or only
        ``inputs_dict`` when the input is unlabeled.
        ``([xdata, ydata], features)`` otherwise, where ``xdata`` is
        ``[input_ids, attention_masks, token_type_ids]`` as NumPy arrays and
        ``ydata`` is the list of feature labels (all ``None`` when unlabeled).

    Raises:
        AssertionError: if a non-string item of ``x`` does not have length 2.
    """
    # Previously the unlabeled default crashed on int(None); track it explicitly.
    has_labels = isinstance(y, (list, np.ndarray))
    labels = y if has_labels else [None] * len(x)

    examples = []
    for i, (_x, _y) in tqdm(enumerate(zip(x, labels)), position=0, total=len(x)):
        guid = "%s" % i
        label = int(_y) if has_labels else None

        if isinstance(_x, str):
            text_a = _x
            text_b = None
        else:
            assert len(_x) == 2
            text_a = _x[0]
            text_b = _x[1]

        examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

    # Sorted distinct labels, taken from the converted examples so the entries
    # always match example.label; empty for unlabeled input.
    label_list = sorted({example.label for example in examples}) if has_labels else []

    features = glue_convert_examples_to_features(
        examples,
        tokenizer,
        maxlen,
        output_mode=output_mode,
        label_list=label_list)

    all_input_ids = []
    all_attention_masks = []
    all_token_type_ids = []
    all_labels = []

    for f in tqdm(features, position=0, total=len(examples)):
        if is_tf_dataset:
            all_input_ids.append(tf.constant(f.input_ids))
            all_attention_masks.append(tf.constant(f.attention_mask))
            all_token_type_ids.append(tf.constant(f.token_type_ids))
            if has_labels:
                all_labels.append(tf.constant(f.label))
        else:
            all_input_ids.append(f.input_ids)
            all_attention_masks.append(f.attention_mask)
            all_token_type_ids.append(f.token_type_ids)
            all_labels.append(f.label)

    if is_tf_dataset:
        inputs = {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'token_type_ids': all_token_type_ids
        }
        # Unlabeled data cannot be turned into label tensors; yield inputs only.
        slices = (inputs, all_labels) if has_labels else inputs
        dataset = tf.data.Dataset.from_tensor_slices(slices)

        return dataset, features

    xdata = [np.array(all_input_ids), np.array(all_attention_masks), np.array(all_token_type_ids)]
    ydata = all_labels

    return [xdata, ydata], features

In [None]:
# Tokenize every split with the shared MAX_LEN constant instead of a repeated literal 50.
train_dataset_base, train_examples = make_examples(tokenizer, x_train, y_train, maxlen=MAX_LEN)
valid_dataset_base, valid_examples = make_examples(tokenizer, x_val, y_val, maxlen=MAX_LEN)

# The test split is needed twice: as a tf.data.Dataset (for model.evaluate)
# and as plain arrays/lists (for model.predict + classification_report).
test_dataset_base, test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN)
[xtest, ytest], test_examples = make_examples(tokenizer, x_test, y_test, maxlen=MAX_LEN, is_tf_dataset=False)

## get train val and test sets

In [None]:
def get_training_dataset(dataset, batch_size):
    """Return `dataset` repeated, then shuffled with a 2048-element buffer, then batched.

    Args:
        dataset: object exposing ``repeat()``, ``shuffle(n)`` and ``batch(n)``.
        batch_size: number of elements per batch.
    """
    return dataset.repeat().shuffle(2048).batch(batch_size)

In [None]:
def get_validation_dataset(dataset, batch_size):
    """Return `dataset` batched into groups of `batch_size`, with no repeat or shuffle."""
    return dataset.batch(batch_size)

In [None]:
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16


train_dataset = get_training_dataset(train_dataset_base, TRAIN_BATCH_SIZE)
# Validation uses the plain batched pipeline: the training pipeline repeats and
# shuffles, so each epoch would score a different, partly duplicated sample of
# the validation set and the per-epoch metrics would not be comparable.
valid_dataset = get_validation_dataset(valid_dataset_base, VALID_BATCH_SIZE)

# Whole batches per epoch (a trailing partial batch is not counted).
train_steps = len(train_examples) // TRAIN_BATCH_SIZE
valid_steps = len(valid_examples) // VALID_BATCH_SIZE

train_steps, valid_steps

## create model

In [None]:
def build_model(model_name, config, learning_rate=.000001):
    """Load a pretrained TF BERT sequence classifier and compile it.

    Args:
        model_name: checkpoint name or path passed to ``from_pretrained``.
        config: model configuration passed to ``from_pretrained``.
        learning_rate: learning rate for the AdamW optimizer.

    Returns:
        The model, compiled with AdamW, sparse categorical cross-entropy on
        logits, and a sparse categorical accuracy metric named 'accuracy'.
    """
    classifier = TFBertForSequenceClassification.from_pretrained(model_name, config=config)

    classifier.compile(
        optimizer=tf.keras.optimizers.AdamW(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )

    return classifier

In [None]:
# Learning rate used for this run (overrides build_model's default).
LEARNING_RATE = 1e-5
model = build_model(
    model_name=MODEL_NAME_OR_PATH,
    config=config,
    learning_rate=LEARNING_RATE,
)

## learning

In [None]:
EPOCHS = 5

# Keyword options for Keras fit, collected in one place for readability.
fit_options = dict(
    validation_data=valid_dataset,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    epochs=EPOCHS,
    verbose=1,
)

history = model.fit(train_dataset, **fit_options)

In [None]:
# save_pretrained receives the directory that contains OUTPUT_PATH, not OUTPUT_PATH itself.
model_dir = os.path.dirname(OUTPUT_PATH)
model.save_pretrained(model_dir)

## evaluation

In [None]:
from sklearn.metrics import classification_report

# Loss/accuracy on the test split via the tf.data pipeline.
ev = model.evaluate(test_dataset_base.batch(TEST_BATCH_SIZE))
print(f'\nEvaluation: {ev}')


# Per-class metrics: element 0 of the prediction output holds the logits.
predictions = model.predict(xtest)
ypred = predictions[0].argmax(axis=-1).tolist()

# classification_report pairs target_names with the label ids in sorted order,
# so the names must follow id2label (0 -> HAPPY, 1 -> SAD). The previous
# hard-coded ['SAD', 'HAPPY'] printed every row under the wrong class name.
class_names = [id2label[label_id] for label_id in sorted(id2label)]
print(classification_report(ytest, ypred, target_names=class_names))

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for history_key, line_style, series_label in (
    ('loss', 'bo-', 'Train'),
    ('val_loss', 'ro-', 'Valid'),
):
    ax.plot(history.history[history_key], line_style, label=series_label)
ax.grid()
ax.legend()
ax.set_title('Loss')

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for history_key, line_style, series_label in (
    ('accuracy', 'bo-', 'Train'),
    ('val_accuracy', 'ro-', 'Valid'),
):
    ax.plot(history.history[history_key], line_style, label=series_label)
ax.grid()
ax.legend()
ax.set_title('ACCURACY')

In [None]:
# Re-evaluate after correcting the mislabeled records of the test set.
# The positions of those records come from the PyTorch implementation of this
# project and are not part of this notebook. The original cell had the data
# source left blank (`x_test_fix= .values.tolist()`), which is a SyntaxError.
# TODO: fill in the positions of the mislabeled test records.
MISLABELED_TEST_POSITIONS = []

test_df_fix = test_df.copy()
if MISLABELED_TEST_POSITIONS:
    # assumes 0-based row positions in test_df, and that "fixing" a binary
    # label means flipping it (0 <-> 1) — TODO confirm against the PyTorch notebook
    label_col = test_df_fix.columns.get_loc('label_id')
    flipped = 1 - test_df_fix['label_id'].to_numpy()[MISLABELED_TEST_POSITIONS]
    test_df_fix.iloc[MISLABELED_TEST_POSITIONS, label_col] = flipped

x_test_fix = test_df_fix['comment'].values.tolist()
y_test_fix = test_df_fix['label_id'].values.tolist()
[xtest_new, ytest_new], test_examples_new = make_examples(tokenizer, x_test_fix, y_test_fix, maxlen=50, is_tf_dataset=False)

new_predictions = model.predict(xtest_new)
ypred_new = new_predictions[0].argmax(axis=-1).tolist()

# target_names must follow the sorted label ids (0 -> HAPPY, 1 -> SAD).
class_names = [id2label[label_id] for label_id in sorted(id2label)]
print(classification_report(ytest_new, ypred_new, target_names=class_names))