## Load dependencies

In [5]:
import pandas as pd
import json
import numpy as np
import nltk
import itertools
nltk.download('averaged_perceptron_tagger')
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load Data

In [6]:
stop_words = set(stopwords.words('english'))
with open('tweets.json') as jfile:
    d = json.load(jfile)

In [7]:
# create pandas dataframe
tweet_id=[]
tweet_author=[]
tweet_text=[]
for key,values in d.items():
    tweet_id.append(key)
    for keys,val in values.items():
        if keys=='tweet_author':
            tweet_author.append(val)
        if keys=='tweet_text':
            tweet_text.append(val)
tweets=np.hstack((np.array(tweet_id).reshape(-1,1),np.array(tweet_author).reshape(-1,1),np.array(tweet_text).reshape(-1,1)))
data_df=pd.DataFrame(tweets,columns=['tweet_ID','tweet_author','tweet_text'])

In [8]:
data_df.head()

Unnamed: 0,tweet_ID,tweet_author,tweet_text
0,1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [9]:
data_df['tweet_author'].value_counts()

Patient Power            1603
Paperbirds_Hematology    1510
VJHemOnc                 1079
Oncology Tube             714
Medivizor                 663
                         ... 
Streetwise Reports          1
Onco.com                    1
Investor's Champion         1
21                          1
𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹               1
Name: tweet_author, Length: 9292, dtype: int64

In [10]:
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [11]:
pre_processed_tweets=[]
for i in data_df['tweet_text']:
  text = re.sub(r"\S*https?:\S*", "", i) 
  text = re.sub('[.,-]','',text)
  text = re.sub(r"[\([{})|:*!?\]]", "", text)
  text = re.sub(r"[\/'';" "\]]", " ", text)
  text = decontracted(text)
  text = text.replace("’s", "")
  text = text.replace("®", "")
  text = text.replace("New article:", "")
  text = text.replace("new", "")
  text = text.replace("Link:", "")
  text=re.sub("[\"\']", "", text)
  text=re.sub("[\\n\']", "", text)
  # remove foreign launguage tweets
  res = [idx for idx in text if not re.findall("[^\u0000-\u05C0\u2100-\u214F]+", idx)]
  text=''.join(res)
  text=re.sub("[\\n\']", "", text)
  text = decontracted(text)
  text=text.strip()
  text=re.sub(r'[0-9]+', '', text)
  pre_processed_tweets.append(text)
  

In [12]:
data_df['tweet_text']=pre_processed_tweets

In [13]:
values=[]
keys=list(set(data_df['tweet_author'].values))
for i in range(len(keys)):
    values.append([])
dictt=dict(zip(keys,values))

## TASK -1

In [14]:
# create pos tags to each tweet and combine simillar 
# parts of speech words that are contigous as a single entity
for i in range(len(data_df)):
    key=data_df['tweet_author'][i]
    a=nltk.tag.pos_tag(data_df['tweet_text'][i].split())
    for index,i in enumerate(a):
        if i[0][0]=='#' or i[0][0]=='@':
          dictt[key].append(i[0][1:])
        elif index==0:
            dictt[key].append(i[0])
            prev=i[1]
        elif i[1]==prev:
            dictt[key][-1]=dictt[key][-1]+' '+i[0]
            prev=i[1]
        else:
            dictt[key].append(i[0])
            prev=i[1]
    

In [15]:
for key,value in dictt.items():
  filtered_sentence = [w.lower() for w in value if not w.lower() in stop_words and len(w)>2]
  dictt[key]=filtered_sentence
  

In [16]:
# To remove repeated words from all tweets by a given author
keys=list(dictt.keys())
for key in keys:
    dictt[key]=list(set(dictt[key]))

In [17]:
word_corpus=list(itertools.chain.from_iterable(list(dictt.values())))
word_corpus_df=pd.DataFrame(word_corpus,columns=['words'])
frequency_of_words=word_corpus_df['words'].value_counts()

In [18]:
objective1 = frequency_of_words.reset_index()
objective1.columns = ['entity', 'frequency']

In [232]:
objective1.to_csv('objective1.csv',index=False)

In [19]:
objective1[:18]

Unnamed: 0,entity,frequency
0,cll,3364
1,leukemia,3081
2,chronic lymphocytic,2615
3,patients,2045
4,acalabrutinib,1416
5,treatment,1069
6,chronic lymphocytic leukemia,1033
7,ibrutinib,789
8,cancer,758
9,calquence,709


## TASK 2

In [2]:
! pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 9.2 MB/s 
[K     |████████████████████████████████| 6.6 MB 58.1 MB/s 
[K     |████████████████████████████████| 120 kB 80.1 MB/s 
[?25h

In [3]:
%load_ext tensorboard
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, LSTM,Flatten
import datetime
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, TFDistilBertModel
from transformers import  DistilBertConfig

In [37]:
distil_bert = 'distilbert-base-uncased' # Name of the pretrained models

config = DistilBertConfig.from_pretrained(distil_bert, output_hidden_states=True)
#DistilBERT 
tokenizer = DistilBertTokenizer.from_pretrained(distil_bert)

model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_transform', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [4]:
! unzip /content/tweet_dataset.csv.zip

Archive:  /content/tweet_dataset.csv.zip
  inflating: tweet_dataset.csv       


## Twitter sentiment analysis dataset from kaggle

In [21]:
df_general_tweets=pd.read_csv('/content/tweet_dataset.csv.zip')
df_train=df_general_tweets[['text','new_sentiment']]
df_train=df_train.dropna()

In [24]:
processed_tweets=[]
for i in df_train['text']:
  text = re.sub(r"\S*https?:\S*", "", i) 
  text = re.sub('[.,-]','',text)
  text = re.sub(r"[\([{})|:*!?\]]", "", text)
  text = re.sub(r"[\/'';" "\]]", " ", text)
  text = decontracted(text)
  text = text.replace("’s", "")
  text = text.replace("®", "")
  text = text.replace("New article:", "")
  text = text.replace("new", "")
  text = text.replace("Link:", "")
  text=re.sub("[\"\']", "", text)
  text=re.sub("[\\n\']", "", text)
  # remove foreign launguage tweets
  res = [idx for idx in text if not re.findall("[^\u0000-\u05C0\u2100-\u214F]+", idx)]
  text=''.join(res)
  text=re.sub("[\\n\']", "", text)
  text = decontracted(text)
  text=text.strip()
  text=re.sub(r'[0-9]+', '', text)
  processed_tweets.append(text.lower())
  

In [25]:
df_train['text']=processed_tweets

In [26]:
# get max word length from gievn tweets
max_len_general_tweets= 0

for tweet in df_train['text']:
    input_ids = tokenizer.encode(tweet)
    max_len_general_tweets= max(max_len_general_tweets, len(input_ids))       


print('Max sequence length: ', max_len_general_tweets)

Max sequence length:  53


In [38]:
# padding and tokenizing the data
bert_output_embedding=[]
for tweet in df_train['text']:
  encoded_tweet = tokenizer.encode(
                        tweet,                      
                        add_special_tokens = True, 
                        max_length = 53,           
                        padding='max_length',
                        return_attention_mask = True,   
                        return_tensors = 'tf',     
                   )
  output = model(encoded_tweet)
  hidden_states = output[1]
  embedding_output = hidden_states[0]
  attention_hidden_states = hidden_states[1:4]
  embedding=embedding_output[0,0,:]
  temp=[]
  for i in range(3):
    temp.append(attention_hidden_states[i][0,0,:])
  tensor=tf.concat([temp[2],temp[1],temp[0],embedding],axis=0)
  bert_output_embedding.append(tensor)


In [39]:
cls_token_value=np.array(bert_output_embedding)
cls_token_value.shape

(31329, 3072)

In [31]:
y=[]
for i in df_train['new_sentiment'].values:
  if i=='negative':
    y.append(0)
  if i=='positive':
    y.append(1)
  if i=='neutral':
    y.append(2)
y=np.array(y)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(cls_token_value,y ,stratify=y, test_size=0.20,random_state=33)

In [41]:
# Clear any logs from previous runs
! rm -rf ./logs/

In [42]:

#tensorboard callback
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True,write_grads=True)
input_layer=Input(shape=(cls_token_value.shape[1],))
dense1=Dense(384, activation='relu')(input_layer)
dense2=Dense(192, activation='relu')(dense1)
drop1=Dropout(0.2)(dense2)
dense3=Dense(64, activation='sigmoid')(drop1)
output_layer=Dense(3, activation='softmax')(dense3)
model_nn= Model(inputs=[input_layer], outputs=output_layer)
optimizer = tf.keras.optimizers.Adam()
model_nn.compile(optimizer=optimizer, loss='SparseCategoricalCrossentropy',metrics=['accuracy'])
# summarize layers
print(model_nn.summary())




Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3072)]            0         
                                                                 
 dense_4 (Dense)             (None, 384)               1180032   
                                                                 
 dense_5 (Dense)             (None, 192)               73920     
                                                                 
 dropout_39 (Dropout)        (None, 192)               0         
                                                                 
 dense_6 (Dense)             (None, 64)                12352     
                                                                 
 dense_7 (Dense)             (None, 3)                 195       
                                                                 
Total params: 1,266,499
Trainable params: 1,266,499
Non-tra

In [43]:
model_nn.fit(X_train,y_train,epochs=100,validation_data=(X_test,y_test),callbacks=tensorboard_callback)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f05b013a650>

## Sentiment Analysis on tweet.json data

In [44]:
max_len_tweets= 0

for tweet in data_df['tweet_text']:
    input_ids = tokenizer.encode(tweet)
    max_len_general_tweets= max(max_len_tweets, len(input_ids))       


print('Max sequence length: ', max_len_general_tweets)

Max sequence length:  28


In [47]:
bert_output_embedding=[]
for tweet in data_df['tweet_text']:
  encoded_tweet = tokenizer.encode(
                        tweet,                      
                        add_special_tokens = True, 
                        max_length = 28,           
                        padding='max_length',
                        return_attention_mask = True,   
                        return_tensors = 'tf',     
                   )
  output = model(encoded_tweet)
  hidden_states = output[1]
  embedding_output = hidden_states[0]
  attention_hidden_states = hidden_states[1:4]
  embedding=embedding_output[0,0,:]
  temp=[]
  for i in range(3):
    temp.append(attention_hidden_states[i][0,0,:])
  tensor=tf.concat([temp[2],temp[1],temp[0],embedding],axis=0)
  bert_output_embedding.append(tensor)


In [48]:
cls_token_value=np.array(bert_output_embedding)
cls_token_value.shape

(43347, 3072)

In [49]:
y_pred=model_nn.predict(cls_token_value)
objective2=np.argmax(y_pred,axis=1)

In [68]:
sentiment=[]
for i in objective2:
  if i==0:
    sentiment.append('negative')
  elif i==1:
    sentiment.append('positive')
  elif i==2:
    sentiment.append('neutral')

In [70]:
data_df['sentiment']=sentiment

In [71]:
objective1.to_csv('objective1.csv',index=False)

Unnamed: 0,tweet_ID,tweet_author,tweet_text,sentiment
0,1374140386071961602,Hematopoiesis News,Scientists conducted a Phase II study of acala...,neutral
1,1374032432173842437,"Michael Wang, MD",This phase AcalabrutinibVenetoclax AV trial t...,neutral
2,1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL,neutral
3,1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...,neutral
4,1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,positive


In [163]:
data_temp=data_df[['tweet_author','tweet_text']]

In [156]:
import spacy
sp = spacy.load('en_core_web_sm')

In [220]:
tweet_author=[]
entity=[]
for row in data_temp.values:
  author=row[0]
  textt=row[1]
  sen = sp(textt)
  entities=sen.ents
  tweet_author.append(author)
  if len(entities)==1:
    entity.append(entities[0].text)
  elif len(entities)>1:
    string=entities[0].text
    for i in range(1,len(entities)):
      string=string+'_'+entities[i].text
    entity.append(string)
  else:
     entity.append(entities)
     

In [228]:
objective2=pd.DataFrame({'entity':np.array(entity),'tweet_author':np.array(tweet_author),'sentiment':data_df['sentiment'].values})

  """Entry point for launching an IPython kernel.


In [230]:
objective2.to_csv('objective2.csv',index=False)

## Observations
1) considering bert final hidden state embedding only 
have high training accuracy but validation accuracy is 0.9 and 0.74<br>
2) considering bert final hidden state and last 3 hiddne sate outputs and concatenate them to get final embedding vector have  training accuracy and validation accuracy is close to 0.84 and 0.81<br>