### Loading the dataset

In [1]:
import json 

In [2]:
d = open ('tweets.json', "r")

In [3]:
data = json.loads(d.read())

### As we can see each tweet consist the unique tweet ID in form of (Keys) so it does'nt make any sense to take them into action 

In [4]:
tweets_list =  []

In [5]:
for key,value in data.items() :
    tweets_list.append(value)

### we're only considering the values for the further processing  

In [6]:
tweets_list[:10]

[{'tweet_author': 'Hematopoiesis News',
  'tweet_text': '‚öïÔ∏è Scientists conducted a Phase II study of acalabrutinib in patients with relapsed/refractory #CLL who were ibrutinib-intolerant, and found an overall response rate of 73%. \nhttps://t.co/eJ6m4QpC5P https://t.co/kuZz6ZO47r'},
 {'tweet_author': 'Michael Wang, MD',
  'tweet_text': 'This phase 2 Acalabrutinib-Venetoclax (AV) trial that is still in recruitment phase will study how well venetoclax and acalabrutinib works in MCL patients who either relapsed or non-respondent to the initial therapy.\n\nhttps://t.co/gg0G9At23N'},
 {'tweet_author': '1stOncology',
  'tweet_text': '#NICE backs #AstraZenecas #Calquence for #CLL https://t.co/Vb5lPDoGrA'},
 {'tweet_author': 'Toby Eyre',
  'tweet_text': '#acalabrutinib is a valuable option in pts intolerant to #ibrutinib. Further valuable data to help decision making in #CLL \n\nEarly View | Haematologica https://t.co/Z2kCLZaX0D'},
 {'tweet_author': 'Lymphoma Hub',
  'tweet_text': 'NICE ha

### Importing required libraries

In [110]:
import numpy as np
import pandas as pd
import itertools
import collections

import warnings
warnings.filterwarnings("ignore")

### converting those tweet values to Dataframe 

In [111]:
df = pd.DataFrame(tweets_list)

In [121]:
df

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,scientists conducted a phase ii study of acala...
1,"Michael Wang, MD",this phase acalabrutinib venetoclax av trial t...
2,1stOncology,nice backs astrazenecas calquence for cll http...
3,Toby Eyre,acalabrutinib is a valuable option in pts into...
4,Lymphoma Hub,nice has recommended the use of acalabrutinib ...
...,...,...
43342,Joy is a Lifestyle,hanging out with friends ff cll happiness http...
43343,ùìíùìªùì≤ùîÉùîÉùîÇ ùìüùìÆùìªùìªùîÇüåπ,hanging out with friends ff cll happiness http...
43344,IQWiG,zusatznutzen von idelalisib ist weder f r cll ...
43345,Medibooks,hematolog a ptk expression and immunochemother...


### importing NLTK toolkit

In [11]:
import nltk
from nltk.corpus import stopwords
import re
import tweepy as tw
import networkx
from nltk.tokenize import word_tokenize
from spacy.lang.en.stop_words import STOP_WORDS
from itertools import filterfalse
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
from string import punctuation

### As we can clearly see the tweet text consists the HTML tags , punctuations and non alphabetic characters Hyperlinks as well as Emojis 
### Functions to clean all of the above

In [112]:
def cleanHtml(sentence):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', str(sentence))
    return cleantext

In [113]:
def cleanPunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

In [114]:
def keepAlpha(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', ' ', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

In [115]:
def remove_url(txt):
    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

In [116]:
df['tweet_text'] = df['tweet_text'].apply(cleanHtml)
df['tweet_text'] = df['tweet_text'].apply(cleanPunc)
df['tweet_text'] = df['tweet_text'].apply(keepAlpha)
df['tweet_text'] = df['tweet_text'].apply(remove_url)
df['tweet_text'] = df['tweet_text'].str.lower()

### tweets after taking out all the Tags, Hyperlinks, punctuations and emojis

In [117]:
df['tweet_text'][0:7]

0    scientists conducted a phase ii study of acala...
1    this phase acalabrutinib venetoclax av trial t...
2    nice backs astrazenecas calquence for cll http...
3    acalabrutinib is a valuable option in pts into...
4    nice has recommended the use of acalabrutinib ...
5    nice backs astrazeneca s calquence for cll htt...
6    this is england for now these decisions usuall...
Name: tweet_text, dtype: object

### removing commonly used words and normalising tweets for better evuluation

In [118]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
punctuation = punctuation + '\n' + '‚Äî' + '‚Äú' + ',' + '‚Äù' + '‚Äò' + '-' + '‚Äô'

In [19]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [20]:
def normalize_tokens(list_of_tokens):
    return map(lambda x: x.lower(),list_of_tokens)

def contractions_expansion(list_of_tokens):
    return map(contracted_word_expansion,list_of_tokens)

In [21]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: word_tokenize(x))
df['tweet_text'] = df['tweet_text'].apply(lambda x: normalize_tokens(x))

### universal regular expression for removal of special and unneccessary blocks

In [22]:
regex = r'^@[a-zA-z0-9]|^#[a-zA-Z0-9]|\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*|\W+|\d+|<("[^"]*"|\'[^\']*\'|[^\'">])*>|_+|[^\u0000-\u007f]+'

In [23]:
def waste_word_or_not(token):
    return re.search(regex,token)

def filter_waste_words(list_of_tokens):
    return filterfalse(waste_word_or_not,list_of_tokens)

def split(list_of_tokens):
    return map(lambda x: re.split(regex,x)[0],list_of_tokens)

In [24]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: filter_waste_words(x))
df['tweet_text'] = df['tweet_text'].apply(lambda x: list(x))

In [25]:
df['tweet_text'][:7]

0    [scientists, conducted, phase, ii, study, acal...
1    [phase, acalabrutinib, venetoclax, av, trial, ...
2    [nice, backs, astrazenecas, calquence, cll, ht...
3    [acalabrutinib, valuable, option, pts, intoler...
4    [nice, recommended, use, acalabrutinib, patien...
5    [nice, backs, astrazeneca, calquence, cll, htt...
6    [england, decisions, usually, come, wales, inf...
Name: tweet_text, dtype: object

### Taking out the repetition of same words from the tweets

In [26]:
en_stop_words = list(set(stopwords.words('english')).union(set(STOP_WORDS)))

In [27]:
def is_stopword(token):
    return not(token in en_stop_words or re.search(r'\b\w\b|[^\u0000-\u007f]+|_+|\W+',token))

def stopwords_removal(list_of_tokens):
    return filter(is_stopword,list_of_tokens)

In [28]:
df['tweet_text'] = df['tweet_text'].apply(lambda x: stopwords_removal(x))
df['tweet_text'] = df['tweet_text'].apply(lambda x: list(x))

### As we can see the tweets are cleaned and in canonical (standard) form

In [29]:
df['tweet_text'][1000:1007]

1000    [myriad, trials, testing, immunomodulating, bl...
1001    [news, chmp, adopts, positive, opinion, acalab...
1002    [according, new, small, study, acalabrutinib, ...
1003    [medscape, hematology, oncology, headlines, gt...
1004    [calquence, recommended, eu, approval, chmp, c...
1005    [study, evaluate, effects, proton, pump, inhib...
1006    [clinicaltrial, study, evaluate, effects, prot...
Name: tweet_text, dtype: object

### converting the tweets list to string as Spacy models only accepts str form 

In [30]:
def listtostring(s):

    text = " "
    return (text.join(s))

In [31]:
df['tweet_text'] = df['tweet_text'].apply(listtostring)

In [32]:
df['tweet_text'][1000:1007]

1000    myriad trials testing immunomodulating blood c...
1001    news chmp adopts positive opinion acalabrutini...
1002    according new small study acalabrutinib counte...
1003    medscape hematology oncology headlines gt gt h...
1004    calquence recommended eu approval chmp chronic...
1005    study evaluate effects proton pump inhibitor a...
1006    clinicaltrial study evaluate effects proton pu...
Name: tweet_text, dtype: object

In [33]:
tweet_text = df['tweet_text'][0:7500]
tweet_token = ''
for i in df['tweet_text'][0:7500]:
    tweet_token += str(i)

### Spacy (NER) Name Entity Recognition is probably the first step towards information extraction that seeks to locate and classify named entities in text into pre-defined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc.

In [34]:
import spacy
  
nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
nlp

<spacy.lang.en.English at 0x2c2140dadc0>

### passing the tweet tokens to Spacy model 

In [35]:
doc = nlp(tweet_token)

### here we get the token text , starting char , ending char ,  token vector norm and token labels

In [122]:
for token in doc.ents:
    print(token.text,'---',token.start_char,'---',token.end_char,'---',token.vector_norm,'---',token.label_)

yesterday --- 1150 --- 1159 --- 8.693491 --- DATE
march pm --- 1382 --- 1390 --- 4.9024253 --- TIME
today --- 1692 --- 1697 --- 9.043982 --- DATE
thursday --- 1698 --- 1706 --- 8.113397 --- DATE
daily --- 2138 --- 2143 --- 7.513793 --- DATE
today --- 2204 --- 2209 --- 9.387628 --- DATE
thursday --- 2210 --- 2218 --- 8.742422 --- DATE
recent years --- 8039 --- 8051 --- 5.7938538 --- DATE
tuesday --- 11005 --- 11012 --- 7.676716 --- DATE
second --- 13341 --- 13347 --- 6.919266 --- ORDINAL
second --- 15533 --- 15539 --- 6.934958 --- ORDINAL
secondary --- 20236 --- 20245 --- 8.061173 --- ORDINAL
quarter --- 22677 --- 22684 --- 6.790628 --- DATE
sarahcannondocs --- 26229 --- 26244 --- 6.438276 --- DATE
months --- 27867 --- 27873 --- 7.6612477 --- DATE
months --- 28057 --- 28063 --- 6.818067 --- DATE
months --- 29880 --- 29886 --- 7.493107 --- DATE
seven --- 30005 --- 30010 --- 8.608659 --- CARDINAL
today --- 30151 --- 30156 --- 9.211719 --- DATE
night --- 32188 --- 32193 --- 7.8130746 --- T

###  counting word frequency of each entities 

In [94]:
tup = str(doc.ents) 

In [97]:
wordlist = tup.split()

In [99]:
wordfreq = []
for w in wordlist:
    wordfreq.append(wordlist.count(w))

In [137]:
print("Words--Frequency\n" + str(list(zip(wordlist, wordfreq))))

Words--Frequency
[('(yesterday,', 1), ('march', 1), ('pm,', 11), ('today,', 63), ('thursday,', 8), ('daily,', 5), ('today,', 63), ('thursday,', 8), ('recent', 3), ('years,', 36), ('tuesday,', 10), ('second,', 47), ('second,', 47), ('secondary,', 5), ('quarter,', 2), ('sarahcannondocs,', 2), ('months,', 21), ('months,', 21), ('months,', 21), ('seven,', 2), ('today,', 63), ('night,', 12), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('monday,', 9), ('year,', 37), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('today,', 63), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('years,', 36), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('year,', 37), ('night,', 12), ('year,', 37), ('

In [135]:
submission = pd.DataFrame()
submission['entity'] = wordlist
submission['frequency'] = wordfreq
submission.to_csv('objective1.csv', index=False)

In [132]:
submission

Unnamed: 0,entity,frequency
0,"(yesterday,",1
1,march,1
2,"pm,",11
3,"today,",63
4,"thursday,",8
...,...,...
602,years,3
603,"ago,",6
604,"today,",63
605,"american,",19


### TextBlob performs different operations on textual data such as noun phrase extraction, sentiment analysis, classification, translation, etc

In [140]:
from textblob import TextBlob

In [141]:
def sentiment(x):
    sentiment = TextBlob(x)
    return sentiment.sentiment.polarity

### The sentiment function of textblob returns two properties, polarity, and subjectivity

In [146]:
df['sentiment'] = df['tweet_author'].apply(sentiment)

In [172]:
total_polarity = df['sentiment']

In [204]:
len(total_polarity)

43347

In [199]:
overall_polarity = []
for i in total_polarity:
    if i > 0:
        overall_polarity.append("positive")
    elif i < 0:
        overall_polarity.append('negative')
    else:
        overall_polarity.append("Neutral")

In [205]:
len(overall_polarity)

43347

In [206]:
submission1 = pd.DataFrame()
submission1['author_name'] = df['tweet_author']
submission1['overall_polarity'] = overall_polarity
submission1.to_csv('objective2.csv', index=False)

###  sentiment/polarity of each author

In [208]:
submission1

Unnamed: 0,author_name,overall_polarity
0,Hematopoiesis News,Neutral
1,"Michael Wang, MD",Neutral
2,1stOncology,Neutral
3,Toby Eyre,Neutral
4,Lymphoma Hub,Neutral
...,...,...
43342,Joy is a Lifestyle,positive
43343,ùìíùìªùì≤ùîÉùîÉùîÇ ùìüùìÆùìªùìªùîÇüåπ,Neutral
43344,IQWiG,Neutral
43345,Medibooks,Neutral
