# Sentiment Extractor
------
**AIM**: *To extract the phrases that contribute to the sentiment of a tweet*

In [1]:
# Started on 13th October 2021 at 3:45pm
# by kodooraKILLER

## 0. Import
------
Tasks performed here:
1. Importing necessary libraries
2. Importing dataset

In [2]:
#!pip install pyspellchecker
#!pip install 'torch==1.9.1' --force-reinstall
!pip install transformers
# import all necessary libraries

#For tensor-processing
import tensorflow

# For dataframes
import pandas as pd 

# For numerical arrays
import numpy as np 

# For stemming/Lemmatisation/POS tagging
import spacy

# For getting stopwords
from spacy.lang.en.stop_words import STOP_WORDS

# For K-Fold cross validation
from sklearn.model_selection import KFold

# For visualizations
import matplotlib.pyplot as plt

# For regular expressions
import re

# For handling string
import string

# For all torch-supported actions
import torch

# For spell-check
#from spellchecker import SpellChecker

# For performing mathematical operations
import math

# For dictionary related activites
from collections import defaultdict

# For counting actions (EDA)
from collections import  Counter

# For count vectorisation (EDA)
from sklearn.feature_extraction.text import CountVectorizer

# For one-hot encoding
from tensorflow.keras.utils import to_categorical

# For DL model
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from tensorflow.keras.models import Model, Sequential

# For generating random integers
from random import randint

# For TF-IDF vectorisation
from sklearn.feature_extraction.text import TfidfVectorizer

# For padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

#for progress bars
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

# For plotting
import seaborn as sns

# For word-cloud 
from wordcloud import WordCloud

# For transformers pipeline
from transformers import pipeline

print("Necessary libraries imported")

# Constant variables 
# Notebook-wide configuration: pandas options, the spaCy pipeline and the
# pretrained BERT question-answering tokenizer/model.

# Ignore chain assignment warnings
# NOTE(review): this silences SettingWithCopyWarning for the whole notebook,
# including the EDA cells that assign to filtered slices of `df`.
pd.options.mode.chained_assignment = None

# spaCy language lemmatiser model
sp=spacy.load('en_core_web_sm')
#spell = SpellChecker()

# BERT-LARGE-UNCASED TRANSFORMER and TOKENIZER
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering


# NOTE(review): MODEL_DIR is not referenced anywhere in the visible notebook;
# the checkpoint below is loaded by name, not from this directory.
MODEL_DIR = "../input/huggingface-bert/"

bert_checkpoint="bert-large-uncased-whole-word-masking-finetuned-squad"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
# NOTE(review): `bert_model` is not used by any visible cell (training uses
# DistilBERT) — presumably an expensive load that could be dropped; confirm.
bert_model = TFAutoModelForQuestionAnswering.from_pretrained(bert_checkpoint)

print("Constant variables ready")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


import os

# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
#TRAIN DATA STRUCTURE:
df=pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
df.head()

In [None]:
#TEST DATA STRUCTURE:
test_df=pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
test_df.tail()

## 1. Exploratory Data Analysis
-------
EDA aims to get a proper idea of the datasets given before getting into the model building process. The following are performed here
- Nullness analysis
- Substring presence in main string

### 1.1 Nullness analysis

In [None]:
# Report the number of training rows, then the count of missing values per column.
print('Total inputs: ',len(df))
df.isnull().sum()

In [None]:
# Show the rows whose `text` is missing.
df[df['text'].isnull()]

#### Inference
There is only **one row with a `NaN` text**, and its `selected_text` is `NaN` as well, so it can safely be treated as an empty string or simply removed

### 1.2 Substring presence

In [None]:
# Count the rows whose selected_text occurs verbatim inside the tweet text
# (both values are coerced to str, so NaN is compared as the string 'nan').
count = sum(
    1
    for _, row in df.iterrows()
    if str(row['selected_text']) in str(row['text'])
)
print('Percentage of data extracted directly from main text: ',count*100/len(df))

#### Inference
It is very clear that the **answer text needs to be extracted from the given text**, signifying the fact that it is not possible to remove or clean the input text

### 1.3 Categories of Sentiment
We see the different categories of sentiment, and the associated value count of each of them

In [None]:
# Bar chart of the number of tweets per sentiment class, most frequent first.
plt.rcParams['figure.figsize'] = [8, 8]
sentiment_order = df['sentiment'].value_counts().index
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so passing the Series positionally together
# with `data=df` fails there. `x='sentiment'` works on old and new versions.
sns_count = sns.countplot(x='sentiment', data=df, order=sentiment_order)

#### Inference
The dataset is **not** balanced: it is slightly skewed towards neutral tweets

### 1.4 Total words trend and average character-per-word trend
The aim of this segment is to plot the total words used for each category in a given corpus, and also to observe the average word length

In [None]:
def total_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    tokens = text.split()
    return len(tokens)
def avg_word_length(text):
    """Return the mean number of characters per whitespace-separated token.

    Returns 0.0 for empty or whitespace-only text instead of raising
    ZeroDivisionError, so the function can be applied to every row safely.
    """
    words = text.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Derive two per-tweet features from the text column; every value is coerced
# to str first so that missing text does not break the helpers.
text_as_str = df['text'].map(lambda value: str(value))
df['total_words'] = text_as_str.map(total_words)
df['avg_word_length'] = text_as_str.map(avg_word_length)
df.head()

In [None]:
# Strip plot of the word count of each tweet, grouped by sentiment category.
seaborn = sns.stripplot(x="sentiment", y="total_words", data=df)
seaborn.set(
    xlabel='Category',
    ylabel='no. of words',
    title='Spread of no. of words used in all contexts',
)

In [None]:
# Strip plot of the average word length of each tweet, grouped by sentiment.
seaborn=sns.stripplot(data=df,
    x="sentiment", y="avg_word_length")
# The title previously ended with a stray double quote that was rendered on the figure.
seaborn.set(xlabel = 'Category', ylabel = 'avg word length', title = 'Average characters-per-word for all contexts')

#### Inference
- Total words used in a given context varies from 1 till 30+, denoting the spread of number of words used in the context
- Average character length is tightly packed around the 1-15 region, with a few outliers caused due to noise in the tweets-dataset

*In conclusion, neither the average word length nor the number of words shows any observable trend across sentiment categories, so both can be safely ignored.*

### 1.5 Unigram analysis

In [None]:
# Bar charts of the most frequent whitespace-separated tokens, one panel per
# sentiment category (stop words included).
TOP_HOW_MANY=20
print('Top ', TOP_HOW_MANY, 'common words in each sentiment category')
fig, axes = plt.subplots(1, 3, figsize=(24,8))
for index, sentiment in enumerate(df.sentiment.unique()):
  # Build a new string Series instead of assigning into a filtered slice of
  # `df`, which only worked silently because chained-assignment warnings are off.
  texts = df.loc[df['sentiment']==sentiment, 'text'].apply(str)
  counter = Counter(" ".join(texts).split())
  top_words = counter.most_common(TOP_HOW_MANY)
  x = [word for word, _ in top_words]
  y = [freq for _, freq in top_words]
  # Horizontal bars: counts on the x axis, words on the y axis.
  sns.barplot(ax=axes[index%3],x=y,y=x)
  axes[index%3].set_title(sentiment)

plt.subplots_adjust(left=0.1,
                    bottom=0.1, 
                    right=0.9, 
                    top=0.9, 
                    wspace=0.3, 
                    hspace=0.3)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Bar charts of the non-stop-words found among the TOP_HOW_MANY most frequent
# tokens of each sentiment category. Filtering happens after taking the top
# tokens, so each panel shows at most TOP_HOW_MANY bars, usually fewer.
TOP_HOW_MANY=50
print('Top common non-stop-words in each sentiment category')
fig, axes = plt.subplots(1, 3, figsize=(24,8))
for index, sentiment in enumerate(df.sentiment.unique()):
  # Build a new string Series instead of assigning into a filtered slice of
  # `df`, which only worked silently because chained-assignment warnings are off.
  texts = df.loc[df['sentiment']==sentiment, 'text'].apply(str)
  counter = Counter(" ".join(texts).split())
  x=[]
  y=[]
  for word, freq in counter.most_common(TOP_HOW_MANY):
      # Compare case-insensitively: the tokens keep their original casing, so
      # capitalised stop words such as "I" or "The" were previously let through.
      if word.lower() not in STOP_WORDS:
          x.append(word)
          y.append(freq)
  # Horizontal bars: counts on the x axis, words on the y axis.
  sns.barplot(ax=axes[index%3],x=y,y=x)
  axes[index%3].set_title(sentiment)

plt.subplots_adjust(left=0.1,
                    bottom=0.1, 
                    right=0.9, 
                    top=0.9, 
                    wspace=0.3, 
                    hspace=0.3)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

#### Inference
- There are a lot of stopwords in all three sentiment-categories
- Words that correlate with a sentiment category are more popular and commonly used within that category

### 1.6 N-Gram analysis

In [None]:
def get_top_bigrams(corpus, n=None):
    """Return the `n` most frequent bigrams of `corpus` as (bigram, count) pairs.

    `corpus` is an iterable of documents; it is traversed twice (fit, then
    transform). Pairs are sorted by descending count; `n=None` returns all.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    # Column-wise sum over all documents: one total count per bigram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(ngram, totals[0, col]) for ngram, col in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Bar charts of the ten most frequent bigrams, one panel per sentiment category.
print("Bigram analysis")

fig, axes = plt.subplots(1,3, figsize=(24,8))
fig.suptitle('Bigram analysis')
for index, sentiment in enumerate(df.sentiment.unique()):
  # Build a new string Series instead of assigning into a filtered slice of
  # `df`, which only worked silently because chained-assignment warnings are off.
  texts = df.loc[df['sentiment']==sentiment, 'text'].apply(str)
  # Use the helper's own `n` parameter rather than slicing its full result.
  top_bigrams = get_top_bigrams(texts, n=10)
  x,y=map(list,zip(*top_bigrams))
  # Horizontal bars: counts on the x axis, bigrams on the y axis.
  sns.barplot(ax=axes[index%3],x=y,y=x)
  axes[index%3].set_title(sentiment)
plt.subplots_adjust(left=0.1,
                    bottom=0.1, 
                    right=0.9, 
                    top=0.9, 
                    wspace=0.3, 
                    hspace=0.3)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

In [None]:
def get_top_trigrams(corpus, n=None):
    """Return the `n` most frequent trigrams of `corpus` as (trigram, count) pairs.

    `corpus` is an iterable of documents; it is traversed twice (fit, then
    transform). Pairs are sorted by descending count; `n=None` returns all.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    vectorizer.fit(corpus)
    # Column-wise sum over all documents: one total count per trigram.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(ngram, totals[0, col]) for ngram, col in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Bar charts of the ten most frequent trigrams, one panel per sentiment category.
print("Trigram analysis")

fig, axes = plt.subplots(1,3, figsize=(24,8))
fig.suptitle('Trigram analysis')
for index, sentiment in enumerate(df.sentiment.unique()):
  # Build a new string Series instead of assigning into a filtered slice of
  # `df`, which only worked silently because chained-assignment warnings are off.
  texts = df.loc[df['sentiment']==sentiment, 'text'].apply(str)
  # Use the helper's own `n` parameter rather than slicing its full result.
  top_trigrams = get_top_trigrams(texts, n=10)
  x,y=map(list,zip(*top_trigrams))
  # Horizontal bars: counts on the x axis, trigrams on the y axis.
  sns.barplot(ax=axes[index%3],x=y,y=x)
  axes[index%3].set_title(sentiment)
plt.subplots_adjust(left=0.1,
                    bottom=0.1, 
                    right=0.9, 
                    top=0.9, 
                    wspace=0.3, 
                    hspace=0.3)
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

#### Inference
- Both bigrams and trigrams are affected by the presence of stopwords and other undesirable text-chunks like HTTP-based URLs
- The most popular N-grams within a sentiment category are the phrases that tend to reflect the emotional sentiment of that particular category

### 1.7 Word-Cloud

In [None]:
# Word clouds of the tweet text, one panel per sentiment category.
# NOTE(review): `df_pos` is not used by this cell; it is kept only in case
# another cell relies on it — confirm and remove if nothing does.
df_pos = df.loc[df.sentiment == 'positive', 'text']
print("WordClouds for each category")

fig = plt.figure(figsize=(24,8))
# Compute the category list once instead of on every loop iteration.
sentiments = df.sentiment.unique()
for i, sentiment in enumerate(sentiments):
    ax = fig.add_subplot(1,3,i+1)
    # Build a new string Series rather than writing into a filtered slice of `df`.
    texts = df.loc[df.sentiment == sentiment, 'text'].apply(str)
    # Collapse all whitespace runs so the cloud sees single-space separated words.
    k = ' '.join(" ".join(texts).split())
    wordcloud = WordCloud(width = 1000, height = 500, background_color = 'white').generate(k)
    ax.set_title(sentiment)
    ax.imshow(wordcloud)
    ax.axis('off')

#### Inference
As seen in Unigram and N-gram analysis, the word-cloud also shows correlation between common words and sentiment of a particular category

## 2. Data Cleaning
-------
From EDA, we understand that
- There is only one row with null values, which needs to be either replaced with an empty string or dropped [`Inference 1.1`]
- Although the textual context is very noisy, with stopwords, URLs, etc. (concluded from `Inference 1.5, 1.6 and 1.7`), the problem statement expects the exact answer substring, so no additional cleaning/treatment can be performed [`Inference 1.2`]

Hence, our only motive in data cleaning step is to **treat NaN**

In [4]:
# Drop every row containing a missing value and renumber the index from 0,
# then confirm that no nulls remain.
# NOTE(review): this rebinds `df`, so earlier cells see the cleaned frame on re-run.
df = df.dropna(how='any',axis=0).reset_index(drop=True)
df.isnull().sum()

## 3. Feature Engineering
--------
The following processes are done here:
- strip the spaces of all fields
- tokenize using the DistilBERT fast tokenizer, with the tweet text as the first sequence and the sentiment as the second (Format: `CLS-CONTEXT-SEP-SENTIMENT-SEP-PAD...`)
- Find starting and ending character-indexes of `selected_text`
- Convert it to input tensor X (Format: `[ input_ids[], attention_mask[] ]`) and output tensor Y (Format: `[ start_position[], end_position[] ]`)


In [None]:
# Tokenize (sentiment, text) pairs with the BERT tokenizer, padding every
# example to the tokenizer's maximum length.
# NOTE(review): the pair order here is sentiment-then-text, whereas
# feature_process_df passes text-then-sentiment; `tokenized` is overwritten by
# the next cell, so this looks like an exploratory cell — confirm it is needed.
tokenized= bert_tokenizer(
        list(df["sentiment"]),
        list(df["text"]),
        padding="max_length")

In [None]:
# Exploratory check on two toy sentence pairs: print the result of
# char_to_word for character offsets 0..9 of batch item 1.
# NOTE(review): presumably maps a character offset of the first sequence to
# its word index — confirm against the BatchEncoding documentation.
# This rebinds `tokenized`, replacing the value from the previous cell.
tokenized= bert_tokenizer(
        ["hi","en ho"],
        ["world helo","fyi bc"])
for i in range(10):
    print(tokenized.char_to_word(1,i))
    

In [16]:
# Fast DistilBERT tokenizer used to build the training encodings below.
# NOTE(review): this import lives outside the notebook's main import cell.
from transformers import DistilBertTokenizerFast
dbert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def feature_process_df(df,tokenizer,answers_given=True):
    """Tokenize (text, sentiment) pairs and, optionally, locate the answer span.

    Parameters
    ----------
    df : DataFrame with `text` and `sentiment` columns, plus `selected_text`
        when `answers_given` is True. The string columns are stripped IN PLACE
        (as before), so the caller's frame is modified.
    tokenizer : callable tokenizer accepting two lists of strings and
        `padding=`; its result must support item assignment and
        `char_to_token(batch_index, char_index)`, and the tokenizer must expose
        `model_max_length`.
    answers_given : when True, the span of `selected_text` inside `text` is
        added to the encodings.

    Returns
    -------
    The tokenizer output. When `answers_given` is True it additionally holds:
    `answer_start` / `answer_end` (character offsets, end exclusive; -1 / -1
    when the answer is empty or not found in the text), `start_positions` /
    `end_positions` (token indices, or `tokenizer.model_max_length` when the
    span cannot be mapped to a token) and `start_logits` / `end_logits`
    (aliases of the two position lists, used as label keys downstream).
    """
    df.sentiment=df.sentiment.apply(lambda x:str(x).strip())
    df.text=df.text.apply(lambda x:str(x).strip())
    # Only touch `selected_text` when it is needed or present: the test set has
    # no such column, so reading it unconditionally made answers_given=False crash.
    if answers_given or "selected_text" in df.columns:
        df.selected_text=df.selected_text.apply(lambda x:str(x).strip())
    tokenized= tokenizer(
        list(df["text"]),
        list(df["sentiment"]),
        padding="max_length")
    if not answers_given:
        return tokenized

    # Sentinel used when a character offset cannot be mapped to a token.
    unmapped = tokenizer.model_max_length
    answer_start, answer_end = [], []
    start_positions, end_positions = [], []
    # enumerate() gives the row's position in the batch; the DataFrame index
    # labels are not guaranteed to be 0..n-1 and must not be used for that.
    for batch_idx, (text, answer) in enumerate(zip(df["text"], df["selected_text"])):
        start_idx = text.find(answer) if answer else -1
        if start_idx == -1:
            # Empty or unmatched answer: never hand a negative character
            # offset to char_to_token, fall back to the sentinel instead.
            end_idx = -1
            start_token = end_token = None
        else:
            end_idx = start_idx + len(answer)
            start_token = tokenized.char_to_token(batch_idx, start_idx)
            # end_idx is exclusive, so the last answer character is end_idx - 1.
            end_token = tokenized.char_to_token(batch_idx, end_idx - 1)
        answer_start.append(start_idx)
        answer_end.append(end_idx)
        start_positions.append(unmapped if start_token is None else start_token)
        end_positions.append(unmapped if end_token is None else end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    tokenized["answer_start"] = answer_start
    tokenized["answer_end"] = answer_end
    # Same list objects under the key names used for the training labels.
    tokenized["start_logits"] = tokenized["start_positions"]
    tokenized["end_logits"] = tokenized["end_positions"]

    return tokenized
#tfdataset = tf.data.Dataset.from_tensor_slices((features, labels)).batch(8)
# Encode the cleaned training frame with the DistilBERT tokenizer; the result
# feeds the tf.data pipeline built in the next cell.
tokenized_encodings=feature_process_df(df,dbert_tokenizer)
print('DF tokenizer ready')

In [17]:
import tensorflow as tf

# Build a (features, labels) dataset from the tokenized encodings: the model
# inputs are the token ids and attention mask, the labels are the answer-span
# token positions stored under the start_logits / end_logits keys.
feature_keys = ['input_ids', 'attention_mask']
label_keys = ['start_logits', 'end_logits']
model_inputs = {key: tokenized_encodings[key] for key in feature_keys}
model_labels = {key: tokenized_encodings[key] for key in label_keys}
train_dataset = tf.data.Dataset.from_tensor_slices((model_inputs, model_labels))

In [12]:
# Turn the label dict into a (start, end) tuple.
# The dataset is built with the label keys 'start_logits' / 'end_logits', so
# the previous lookup of 'start_positions' / 'end_positions' raised KeyError on
# a fresh Restart & Run All. The isinstance guard keeps the cell idempotent:
# once the labels are already a tuple, re-running it is a no-op.
# NOTE(review): confirm that tuple labels (rather than the keyed dict) are what
# the installed transformers/Keras versions expect in `fit`.
if isinstance(train_dataset.element_spec[1], dict):
    train_dataset = train_dataset.map(lambda x, y: (x, (y['start_logits'], y['end_logits'])))

## 4. BERT Training
-------

In [18]:
# DistilBERT question-answering model, initialised from the base
# 'distilbert-base-uncased' checkpoint and fine-tuned in the next cell.
# NOTE(review): this import lives outside the notebook's main import cell.
from transformers import TFDistilBertForQuestionAnswering
dbert_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [19]:
# Fine-tune DistilBERT: sparse categorical cross-entropy is applied to the
# start and end logits against the integer token positions.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# NOTE(review): these two attributes are set directly on the inner DistilBERT
# layer; presumably intended to make the model return tuples — confirm that the
# installed transformers version actually honours them.
dbert_model.distilbert.return_eagerly = True
dbert_model.distilbert.return_dict = False
dbert_model.compile(optimizer=optimizer, loss=loss) # can also use any keras loss fn
# The dataset is already batched, so `batch_size` must not be passed to fit():
# Keras documents that it should not be specified for tf.data inputs.
dbert_model.fit(train_dataset.shuffle(1000).batch(16), epochs=5)

In [21]:
# Persist the fine-tuned weights to dbert_qa.h5 in the current working directory.
dbert_model.save_weights("dbert_qa.h5")