In [1]:
import json
import pandas as pd
import numpy as np
import re
import spacy
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from datetime import datetime

from langdetect import detect
import translators as ts
import translators.server as tss

[nltk_data] Downloading package wordnet to C:\Users\the old
[nltk_data]     one\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\the old
[nltk_data]     one\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\the old one\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Using state Tamil Nadu server backend.


In [2]:

def time_se():
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    return (current_time)

# loading data

In [3]:
tweet_df = pd.read_json('tweets.json',orient = 'index')
# datatime is not need for the analysis so dropped
tweet_df.reset_index(drop = True,inplace = True)

In [4]:
tweet_df

Unnamed: 0,tweet_author,tweet_text
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...
...,...,...
43342,Joy is a Lifestyle,Hanging out with Friends! :) #FF #CLL #Happine...
43343,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹,Hanging out with Friends! :) #FF #CLL #Happine...
43344,IQWiG,Zusatznutzen von #Idelalisib ist weder für #CL...
43345,Medibooks,#Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...


# Text cleaning

## steps
1. removing urls
2. removing special characters
3. translating to english
4. removing punctuation
5. stop word removal
6. converting to root words - lemmatization


# Text preprocessing

## 1. removing urls

In [5]:
# removing urls
tweet_df["cleaned_tweet"] = tweet_df['tweet_text'].apply(lambda x: re.sub(r'https?:\/\/\S+', ' ', x))
tweet_df.head()

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,⚕️ Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib-Venetoclax (AV) tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,#NICE backs #AstraZenecas #Calquence for #CLL
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,#acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE has recommended the use of acalabrutinib ...


## 2. removing special characters


In [6]:
# removing special characters
tweet_df['cleaned_tweet'] = tweet_df['cleaned_tweet'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
tweet_df.head()

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib Venetoclax AV tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,NICE backs AstraZenecas Calquence for CLL
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE has recommended the use of acalabrutinib ...


## 3. translating to english

### translating other languages to english

steps
1. identifying the non english tweets
2. translating to english

### 1. identifying the non english tweets

#### 1. identifying the languages of tweets

In [7]:
def detect_lang(text):
    try:
        return detect(text)
    except: # to handle language not in the corpus
        return text

In [8]:
# detected_text = tweet_df['cleaned_tweet'].apply(detect_lang)

In [9]:
# detected_text = detected_text.to_frame()
# detected_text = detected_text.rename(columns={'cleaned_tweet': 'language_detected'})
# detected_text.to_csv('language_detected_v1.csv')

#### getting tweets which need to be translated

In [10]:
# output of above commented code
detected_text = pd.read_csv('language_detected_v1.csv',index_col = 0)

In [11]:
index_to_translate = detected_text[detected_text['language_detected'] != 'en']
index_to_translate

Unnamed: 0,language_detected
34,ca
43,es
52,
57,de
74,fr
...,...
43333,af
43334,es
43336,es
43344,de


In [12]:
tweets_to_translate = tweet_df.copy(deep = True)

In [13]:
tweets_to_translate = tweets_to_translate.iloc[index_to_translate.index,2]

In [14]:
tweets_to_translate

34       IQVIA RDS Argentina S R L  representa en Argen...
43       Acalabrutinib alcanza el objetivo primario de ...
52                                                        
57       News von     Lesen Sie jetzt den vollständigen...
74         AMMHémato \nL acalabrutinib un nouveau trait...
                               ...                        
43333                My homie  amp  my Bestfriend      CLL
43334    Yo  sinceramente  he empezado este año muchisi...
43336     Hematología STIMULATION OF SURFACE IGM OF CHR...
43344    Zusatznutzen von  Idelalisib ist weder für  CL...
43345     Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...
Name: cleaned_tweet, Length: 4064, dtype: object

#### 2. translating to english

In [15]:
def translate(text):
    try:
        return tss.google(text, to_language='en')
    except:
        return text

In [16]:
# start = time_se()
# print(f"start = {start}")

In [17]:
# # translating 
# tweets_to_translate = tweets_to_translate.apply(translate)

In [18]:
# end = time_se()
# print(f'end = {end}')

In [19]:
# tweets_to_translate.to_csv('translated_tweets_v2.csv')

#### replacing translated text in tweet_df

In [20]:
# # output of above commented code
# translated_tweets = pd.read_csv('translated_tweets_v2.csv')
# translated_tweets.set_index('Unnamed: 0',inplace = True)

In [21]:
# tweet_df.update(translated_tweets)

In [22]:
# storing cleaned and translated tweets 
tweet_df.to_csv('cleaned_and_translated_tweets_v3.csv')

## removing missing data

In [23]:
tweet_data = pd.read_csv("cleaned_and_translated_tweets_v3.csv",index_col = 0)

In [24]:
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43347 entries, 0 to 43346
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweet_author   43347 non-null  object
 1   tweet_text     43347 non-null  object
 2   cleaned_tweet  43347 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [25]:
tweet_data[tweet_data['cleaned_tweet'].isnull()].head()

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet


In [26]:
# clearing missing values
tweet_data.dropna(subset = ['cleaned_tweet'],inplace = True)

In [27]:
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43347 entries, 0 to 43346
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweet_author   43347 non-null  object
 1   tweet_text     43347 non-null  object
 2   cleaned_tweet  43347 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


## 4. removing punctuation

In [28]:
tokenize_text = lambda text: " ".join(nltk.tokenize.RegexpTokenizer('\w+').tokenize(text))

In [29]:
tweet_data['cleaned_tweet'] = tweet_data['cleaned_tweet'].apply(tokenize_text)

In [30]:
tweet_df

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conducted a Phase II study of ac...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib Venetoclax AV tri...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,NICE backs AstraZenecas Calquence for CLL
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib is a valuable option in pts int...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE has recommended the use of acalabrutinib ...
...,...,...,...
43342,Joy is a Lifestyle,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging out with Friends FF CLL Happine...
43343,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging out with Friends FF CLL Happine...
43344,IQWiG,Zusatznutzen von #Idelalisib ist weder für #CL...,Zusatznutzen von Idelalisib ist weder für CL...
43345,Medibooks,#Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...,Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...


## 5. stopward removal

In [31]:
stop_words = set(stopwords.words('english'))

In [32]:
tweet_data['cleaned_tweet'] = tweet_data['cleaned_tweet'].map(lambda tweet : " ".join([i for i in tweet.split() if i not in stop_words]))

In [33]:
#Checking the difference in the tweets and clean_tweets
print('----------------------------------original--------------------------------------------')
print(tweet_data['tweet_text'].iloc[-1])
print('----------------------------------cleaned--------------------------------------------')
print(tweet_data['cleaned_tweet'].iloc[-1])

----------------------------------original--------------------------------------------
#Hematología MUTATIONS IN TLR/MYD88 PATHWAY IDENTIFY A SUBSET OF YOUNG CHRONIC LYMPHOCYTIC LEUKEMIA PATIENTS WITH… http://t.co/YzvK2n9UsZ
----------------------------------cleaned--------------------------------------------
Hematología MUTATIONS IN TLR MYD88 PATHWAY IDENTIFY A SUBSET OF YOUNG CHRONIC LYMPHOCYTIC LEUKEMIA PATIENTS WITH


In [34]:
tweet_data

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conducted Phase II study acalabruti...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib Venetoclax AV trial...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,NICE backs AstraZenecas Calquence CLL
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib valuable option pts intolerant i...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommended use acalabrutinib patients tr...
...,...,...,...
43342,Joy is a Lifestyle,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness
43343,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness
43344,IQWiG,Zusatznutzen von #Idelalisib ist weder für #CL...,Zusatznutzen von Idelalisib ist weder für CLL ...
43345,Medibooks,#Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...,Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTHE...


## 6. lemmatization

In [35]:
lemmatizer = WordNetLemmatizer()

# convert nltk pos tag to wordnet pos tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    wordnet_tagged = [(word, nltk_tag_to_wordnet_tag(tag)) for word, tag in nltk_tagged]
    lemmatized_sentence = [lemmatizer.lemmatize(word, tag) if tag else word for word, tag in wordnet_tagged]
    return ' '.join(lemmatized_sentence)

In [36]:
tweet_data['cleaned_tweet'] = tweet_data['cleaned_tweet'].apply(lemmatize_sentence)

In [37]:
tweet_data

Unnamed: 0,tweet_author,tweet_text,cleaned_tweet
0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...
1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib Venetoclax AV trial...
2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,NICE back AstraZenecas Calquence CLL
3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib valuable option pt intolerant ib...
4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...
...,...,...,...
43342,Joy is a Lifestyle,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness
43343,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness
43344,IQWiG,Zusatznutzen von #Idelalisib ist weder für #CL...,Zusatznutzen von Idelalisib ist weder für CLL ...
43345,Medibooks,#Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...,Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTHE...


In [38]:
tweet_data.to_csv('final_cleaned_data.csv')

# Entity extraction

In [39]:
tweet_data = pd.read_csv('final_cleaned_data.csv')

In [40]:
NER = spacy.load("en_core_sci_sm\en_core_sci_sm")
def find_entities(text):
    doc = NER(str(text))
    return [ent.text for ent in doc.ents]
td_copy = tweet_data.copy(deep=True)
td_copy["entities"] = td_copy['cleaned_tweet'].apply(find_entities)

In [41]:
# 5:09
# 5:13
find_entities(tweet_data['cleaned_tweet'][43346])

['Hematología MUTATIONS', 'TLR', 'MYD88', 'PATHWAY', 'IDENTIFY', 'PATIENTS']

In [42]:
td_copy

Unnamed: 0.1,Unnamed: 0,tweet_author,tweet_text,cleaned_tweet,entities
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti..."
1,1,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...,This phase 2 Acalabrutinib Venetoclax AV trial...,"[phase 2, Acalabrutinib, AV trial, phase study..."
2,2,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...,NICE back AstraZenecas Calquence CLL,"[NICE, CLL]"
3,3,Toby Eyre,#acalabrutinib is a valuable option in pts int...,acalabrutinib valuable option pt intolerant ib...,"[acalabrutinib, intolerant, ibrutinib, data, C..."
4,4,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...,NICE recommend use acalabrutinib patient treat...,"[NICE, acalabrutinib, patient, treatment, Find..."
...,...,...,...,...,...
43342,43342,Joy is a Lifestyle,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness,"[Hanging Friends, FF, CLL]"
43343,43343,𝓒𝓻𝓲𝔃𝔃𝔂 𝓟𝓮𝓻𝓻𝔂🌹,Hanging out with Friends! :) #FF #CLL #Happine...,Hanging Friends FF CLL Happiness,"[Hanging Friends, FF, CLL]"
43344,43344,IQWiG,Zusatznutzen von #Idelalisib ist weder für #CL...,Zusatznutzen von Idelalisib ist weder für CLL ...,"[Zusatznutzen von Idelalisib, CLL, noch, folli..."
43345,43345,Medibooks,#Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTH...,Hematología PTK2 EXPRESSION AND IMMUNOCHEMOTHE...,"[PTK2, EXPRESSION, IMMUNOCHEMOTHERAPY, CHRONIC..."


In [43]:
total_entities=[]
for i in td_copy["entities"]:
    total_entities.extend(i)

In [44]:
entity_freq = pd.Series(total_entities)

In [45]:
entity_freq = entity_freq.value_counts().to_frame().reset_index().rename(columns = {'index':'entity',0:'frequency'})

# objective 1

In [46]:
entity_freq.to_csv('objective1.csv',index=False)

In [47]:
entity_freq = pd.read_csv('objective1.csv')

# sentiment analysis

In [48]:
td_copy2 = td_copy.copy(deep = True)

In [49]:
td_copy2["entity"] = td_copy2.loc[:,"entities"]

In [50]:
td_copy2 = td_copy2.explode("entity")
td_copy2

Unnamed: 0.1,Unnamed: 0,tweet_author,tweet_text,cleaned_tweet,entities,entity
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti...",Scientists conduct Phase II study
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti...",acalabrutinib
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti...",patient
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti...",refractory
0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"[Scientists conduct Phase II study, acalabruti...",CLL
...,...,...,...,...,...,...
43346,43346,Medibooks,#Hematología MUTATIONS IN TLR/MYD88 PATHWAY ID...,Hematología MUTATIONS IN TLR MYD88 PATHWAY IDE...,"[Hematología MUTATIONS, TLR, MYD88, PATHWAY, I...",TLR
43346,43346,Medibooks,#Hematología MUTATIONS IN TLR/MYD88 PATHWAY ID...,Hematología MUTATIONS IN TLR MYD88 PATHWAY IDE...,"[Hematología MUTATIONS, TLR, MYD88, PATHWAY, I...",MYD88
43346,43346,Medibooks,#Hematología MUTATIONS IN TLR/MYD88 PATHWAY ID...,Hematología MUTATIONS IN TLR MYD88 PATHWAY IDE...,"[Hematología MUTATIONS, TLR, MYD88, PATHWAY, I...",PATHWAY
43346,43346,Medibooks,#Hematología MUTATIONS IN TLR/MYD88 PATHWAY ID...,Hematología MUTATIONS IN TLR MYD88 PATHWAY IDE...,"[Hematología MUTATIONS, TLR, MYD88, PATHWAY, I...",IDENTIFY


In [51]:
td_copy2.to_csv('tweets_and_entities.csv')

In [52]:
td_copy2 = pd.read_csv('tweets_and_entities.csv')
td_copy2.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tweet_author,tweet_text,cleaned_tweet,entities,entity
0,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",Scientists conduct Phase II study
1,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",acalabrutinib
2,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",patient
3,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",refractory
4,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",CLL


In [53]:
td_copy2.shape

(250907, 7)

In [54]:
# Initialize the sentiment analyzer
SIA = SentimentIntensityAnalyzer()

In [55]:
def sentiment_of_entity(df):
    entity = str(df['entity']).split()
    tweet = str(df['cleaned_tweet'])
    
    # getting tokens
    tokens = word_tokenize(tweet)
    
    # getting entity_index
    entity_index = []
    for i in range(len(tokens)):
        if entity[0] == tokens[i]:
            entity_index.append(i)
            
    # getting words around it
    cropped_text = []
    for i in entity_index:
        cropped_text = tokens[i-5:i+5]
        if len(cropped_text) == 0:
            cropped_text = tokens[i-i:i+5]
    cropped_text = ' '.join(cropped_text)
    # getting sentiment of entity
    sentiment_of_entity = SIA.polarity_scores(cropped_text)
    score = sentiment_of_entity['compound']
    if score == 0:
        return 'Neutral'
    elif score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return (tweet,entity)

In [56]:
td_copy2['sentiment'] = td_copy2.apply(sentiment_of_entity,axis = 1)

In [57]:
x = td_copy2.iloc[1]
print(f'''
author : {x['tweet_author']}

tweet : {x['tweet_text']}

entity : {x['entity']}

sentiment : {x['sentiment']}
''')


author : Hematopoiesis News

tweet : ⚕️ Scientists conducted a Phase II study of acalabrutinib in patients with relapsed/refractory #CLL who were ibrutinib-intolerant, and found an overall response rate of 73%. 
https://t.co/eJ6m4QpC5P https://t.co/kuZz6ZO47r

entity : acalabrutinib

sentiment : Neutral



In [58]:
td_copy2.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tweet_author,tweet_text,cleaned_tweet,entities,entity,sentiment
0,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",Scientists conduct Phase II study,Neutral
1,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",acalabrutinib,Neutral
2,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",patient,Neutral
3,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",refractory,Neutral
4,0,0,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...,Scientists conduct Phase II study acalabrutini...,"['Scientists conduct Phase II study', 'acalabr...",CLL,Neutral


In [59]:
sentiment_for_tweet = td_copy2.loc[:,["entity","tweet_author","sentiment"]]
sentiment_for_tweet.rename(columns={"entity":"entity",
                                    "tweet_author":"author_name",
                                    "sentiment":"overall_polarity"},inplace = True)


In [60]:
sentiment_for_tweet['overall_polarity'].value_counts()

Neutral     149924
Positive     68616
Negative     32367
Name: overall_polarity, dtype: int64

In [61]:
sentiment_for_tweet.to_csv('objective2.csv',index=False)