## Importing Libraries and data

In [35]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
import spacy
import string
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
with open('tweets.json') as jfile:
    d = json.load(jfile)

In [4]:
df = pd.DataFrame(d)
df = df.T

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43347 entries, 1374140386071961602 to 550579446537678849
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet_author  43347 non-null  object
 1   tweet_text    43347 non-null  object
dtypes: object(2)
memory usage: 1015.9+ KB


In [6]:
pd.set_option('display.max_columns', None)

In [7]:
df['tweet_text'].values[20]

'NICE backs AstraZeneca’s Calquence for CLL https://t.co/FTEaEMFSW7 #pharma #lifesciences'

In [8]:
df.head()

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


## Data_Cleaning

In [9]:
#checking for null values
df.isna().sum()

tweet_author    0
tweet_text      0
dtype: int64

In [10]:
df.head()

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [11]:
#checking for duplicate rows
df.duplicated().sum()

1529

In [12]:
#dropping the duplicates
df.drop_duplicates(inplace = True)

In [13]:
# Here this Index is not needed so reset_index
df.reset_index(inplace = True,drop = True)

In [14]:
df.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...


In [15]:
# text_cleaning
text = df['tweet_text']

text = text.str.lower()   #lowering_letters
text

0        ⚕️ scientists conducted a phase ii study of ac...
1        this phase 2 acalabrutinib-venetoclax (av) tri...
2        #nice backs #astrazenecas #calquence for #cll ...
3        #acalabrutinib is a valuable option in pts int...
4        nice has recommended the use of acalabrutinib ...
                               ...                        
41813    what i'd do to go to gerrard's last game at an...
41814    hanging out with friends! :) #ff #cll #happine...
41815    zusatznutzen von #idelalisib ist weder für #cl...
41816    #hematología ptk2 expression and immunochemoth...
41817    #hematología mutations in tlr/myd88 pathway id...
Name: tweet_text, Length: 41818, dtype: object

In [16]:
def cleaning_URLs(data):
    return re.sub(r'http\S+', '', data)
text = text.apply(lambda x: cleaning_URLs(x))
text.tail()

41813    what i'd do to go to gerrard's last game at an...
41814    hanging out with friends! :) #ff #cll #happiness 
41815    zusatznutzen von #idelalisib ist weder für #cl...
41816    #hematología ptk2 expression and immunochemoth...
41817    #hematología mutations in tlr/myd88 pathway id...
Name: tweet_text, dtype: object

In [17]:
def clean_meantion_hashtag(t):
    temp = re.sub("@[A-Za-z0-9_]+","", t)
    temp1 = re.sub("#[A-Za-z0-9_]+","", temp)
    return temp1
text = text.apply(lambda x : clean_meantion_hashtag(x))
text.tail()

41813    what i'd do to go to gerrard's last game at an...
41814                     hanging out with friends! :)    
41815    zusatznutzen von  ist weder für  noch für refr...
41816    ía ptk2 expression and immunochemotherapy outc...
41817    ía mutations in tlr/myd88 pathway identify a s...
Name: tweet_text, dtype: object

In [18]:
#remove lines-->'\n'
def remove_newline(data):
    return re.sub('\s+',' ',data)
text = text.apply(lambda x : remove_newline(x))

In [19]:
#remove_emojis,numbers.
text = text.apply(lambda x: re.sub('[^a-z]+',' ',x))
text.head()

0     scientists conducted a phase ii study of acal...
1    this phase acalabrutinib venetoclax av trial t...
2                                           backs for 
3     is a valuable option in pts intolerant to fur...
4    nice has recommended the use of acalabrutinib ...
Name: tweet_text, dtype: object

In [20]:
alphabets = list('abcdefghijklmnopqrstuvwxyz')

In [24]:
lemma = WordNetLemmatizer()
def cleaning_stopwordsand_lemmatize(text):
    return " ".join([lemma.lemmatize(word) for word in text.split() if word not in set(stopwords.words('english')+alphabets)])
text = text.apply(lambda text: cleaning_stopwordsand_lemmatize(text))
text.head()

0    scientist conducted phase ii study acalabrutin...
1    phase acalabrutinib venetoclax av trial still ...
2                                                 back
3    valuable option pt intolerant valuable data he...
4    nice recommended use acalabrutinib patient tre...
Name: tweet_text, dtype: object

## OBJECTIVE-1

In [26]:
cv =  CountVectorizer(ngram_range=(2,3))
obj1 = cv.fit(text)
x = obj1.vocabulary_

In [27]:
objective1 = pd.DataFrame(x.items(),columns = ['entity','frequency'])
objective1.sort_values(by = 'frequency',ascending = False,inplace = True,ignore_index = True)
objective1.head()

Unnamed: 0,entity,frequency
0,zytuxtm newly diagnosed,349060
1,zytuxtm newly,349059
2,zytux iranian rituximab,349058
3,zytux iranian,349057
4,zytostatikum verbessert therapieergebnisse,349056


In [28]:
objective1.to_csv('objective1.csv')

## OBJECTIVE-2

In [29]:
df['tweet_text'] = text.values

In [31]:

df['tweet_text'] = df['tweet_text'].apply(lambda x : [x])

In [32]:
df.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,[scientist conducted phase ii study acalabruti...
1,"Michael Wang, MD",[phase acalabrutinib venetoclax av trial still...
2,1stOncology,[back]
3,Toby Eyre,[valuable option pt intolerant valuable data h...
4,Lymphoma Hub,[nice recommended use acalabrutinib patient tr...


In [33]:
['x']+['s']

['x', 's']

In [34]:
df1 = df.groupby('tweet_author')['tweet_text'].sum()
df1 = pd.DataFrame(df1.reset_index())
df1.head(10)

Unnamed: 0,tweet_author,tweet_text
0,Camilla White,"[pleotropic activity, ]"
1,Emilie Thompson,"[pleotropic activity, ]"
2,Hannah Wright,"[pleotropic activity, ]"
3,Yvianna ,[saudade]
4,#DestroyTheAadhaar TwiLightOFTheGODS,[uk hospital trial five new drug search corona...
5,#Endsars protest,[medical news today outlook chronic lymphocyti...
6,#Enritchansrajpandey,[good morning exclusive morning market news am...
7,#KING OF FLORIDA 👑,[venetoclax powerful new kind cancer drug effe...
8,#LeoWeichafe,[esperando que comience la lucha ac en]
9,#Memory,[antitumor potency anti cd chimeric antigen re...


In [39]:
cv = CountVectorizer(ngram_range= (2,3))
def cvtransform(data):
    try:
        x = cv.fit(data)
        return x.get_feature_names_out()

    except:
        return data

df['tweet_text'] = df['tweet_text'].apply(lambda x: cvtransform(x))

In [45]:
df = df.explode('tweet_text').reset_index(drop = True)
df.head()

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,acalabrutinib patient
1,Hematopoiesis News,acalabrutinib patient relapsed
2,Hematopoiesis News,conducted phase
3,Hematopoiesis News,conducted phase ii
4,Hematopoiesis News,found overall


In [53]:
sia = SentimentIntensityAnalyzer()    
def polarity(data):
    if  sia.polarity_scores(data)['compound'] > 0:
        return 'Positive'
    elif sia.polarity_scores(data)['compound'] == 0:
        return 'Neutral'
    else:
        return 'Negative'

df['compound'] = df['tweet_text'].apply(lambda x : polarity(x))

In [60]:
df.columns = ['author_name','entity','overall_polarity']

In [63]:
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df
swap_columns(df,'author_name','entity')

Unnamed: 0,entity,author_name,overall_polarity
0,acalabrutinib patient,Hematopoiesis News,Neutral
1,acalabrutinib patient relapsed,Hematopoiesis News,Neutral
2,conducted phase,Hematopoiesis News,Neutral
3,conducted phase ii,Hematopoiesis News,Neutral
4,found overall,Hematopoiesis News,Neutral
...,...,...,...
766544,subset young chronic,Medibooks,Neutral
766545,tlr myd,Medibooks,Neutral
766546,tlr myd pathway,Medibooks,Neutral
766547,young chronic,Medibooks,Neutral


In [65]:
df.to_csv('objective-2.csv')