# Text Visualization using **Scattertext**

### Lib Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scattertext as st
import nltk
import string
import spacy
from nltk.corpus import stopwords
import unicodedata
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/joaocosentino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

For this application we'll be using the "en_core_web_sm" model from spacy's library, used for natural language processing.

In [42]:
modelo = spacy.load("en_core_web_sm")

### Dataset - Corona Virus NLP Tweets, available at: 
https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification

In [28]:
train_data = pd.read_csv("Corona_NLP_train.csv",encoding="latin1")
test_data = pd.read_csv('Corona_NLP_test.csv')

In [4]:
train_data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


Some of the data is not important for this analysis, like UserName, ScreenName.

We'll leave Location and Dates for further analysis regarding location and time lines.

In [29]:
train_data.drop(columns=["UserName","ScreenName"],inplace=True)
train_data.head()

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


### Data preprocessing

#### Removing stopwords, lowercase only, punctuation removal.

In [30]:
stop_words = stopwords.words('english')
punctuation = list(string.punctuation)
stp_punc = stop_words+punctuation

train_data["OriginalTweet"] = train_data["OriginalTweet"].apply(lambda x: " ".join(x for x in x.split() if x not in stp_punc)).str.lower()
train_data["OriginalTweet"][0]

'@menyrbie @phil_gahan @chrisitv https://t.co/ifz9fan2pa https://t.co/xx6ghgfzcc https://t.co/i2nlzdxno8'

#### Removing URL information

Useful Regex to remove noise from text corpus

In [31]:
ulr_re = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
mentions = r'@[A-Za-z0-9_]+'
hashtags = r'#[A-Za-z0-9_]+'
empty = r'^\s+|\s+$'

In [32]:
# Creating regex to substitute URLs for blank spaces
# re_sub = re.sub(r'https?:\/\/.*?[\s+]'," ",train_data["OriginalTweet"])
train_data['OriginalTweet'] = train_data["OriginalTweet"].apply(lambda x: re.sub(ulr_re,"",x))
train_data['OriginalTweet']

0                       @menyrbie @phil_gahan @chrisitv   
1        advice talk neighbours family exchange phone n...
2        coronavirus australia: woolworths give elderly...
3        my food stock one empty... please, panic, ther...
4        me, ready go supermarket #covid19 outbreak. no...
                               ...                        
41152    airline pilots offering stock supermarket shel...
41153    response complaint provided citing covid-19 re...
41154    you know itâs getting tough @kameronwilds rat...
41155    is wrong smell hand sanitizer starting turn on...
41156    @tartiicat well new/used rift s going $700.00 ...
Name: OriginalTweet, Length: 41157, dtype: object

#### Removing @mentions, #hashtags and replacing empty comments with NaN

Things like mentions and hashtags are not usefull for categorizing positive and negative comments

In [33]:
train_data['OriginalTweet'] = train_data["OriginalTweet"].apply(lambda x: re.sub(mentions,"",x))
train_data['OriginalTweet'] = train_data["OriginalTweet"].apply(lambda x: re.sub(hashtags,"",x))
train_data['OriginalTweet'] = train_data["OriginalTweet"].apply(lambda x: re.sub(empty,"NaN",x))
train_data.head()

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,London,16-03-2020,,Neutral
1,UK,16-03-2020,advice talk neighbours family exchange phone n...,Positive
2,Vagabonds,16-03-2020,coronavirus australia: woolworths give elderly...,Positive
3,,16-03-2020,"my food stock one empty... please, panic, ther...",Positive
4,,16-03-2020,"me, ready go supermarket outbreak. not i'm pa...",Extremely Negative


#### Removal of Non-ASCII Characters

The removal of non ascii characters depends on the task at hand, for us it shouldn't make relevant difference or loss of information to remove them. 

In [34]:
train_data['OriginalTweet'] = train_data['OriginalTweet'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))
train_data

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,London,16-03-2020,,Neutral
1,UK,16-03-2020,advice talk neighbours family exchange phone n...,Positive
2,Vagabonds,16-03-2020,coronavirus australia: woolworths give elderly...,Positive
3,,16-03-2020,"my food stock one empty... please, panic, ther...",Positive
4,,16-03-2020,"me, ready go supermarket outbreak. not i'm pa...",Extremely Negative
...,...,...,...,...
41152,"Wellington City, New Zealand",14-04-2020,airline pilots offering stock supermarket shel...,Neutral
41153,,14-04-2020,response complaint provided citing covid-19 re...,Extremely Negative
41154,,14-04-2020,you know itas getting tough rationing toilet ...,Positive
41155,,14-04-2020,is wrong smell hand sanitizer starting turn on...,Neutral


#### Removal of Emojis

In [35]:
emoji = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

train_data["OriginalTweet"] = train_data["OriginalTweet"].apply(lambda x: emoji.sub(r'',x))
train_data.head()

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
0,London,16-03-2020,,Neutral
1,UK,16-03-2020,advice talk neighbours family exchange phone n...,Positive
2,Vagabonds,16-03-2020,coronavirus australia: woolworths give elderly...,Positive
3,,16-03-2020,"my food stock one empty... please, panic, ther...",Positive
4,,16-03-2020,"me, ready go supermarket outbreak. not i'm pa...",Extremely Negative


#### Filtering out all NaN values

In [47]:
train_data = train_data[train_data["OriginalTweet"] != "NaN"]
train_data.nunique()

Location         12216
TweetAt             30
OriginalTweet    41048
Sentiment            5
dtype: int64

In [37]:
labeled_tweets = train_data[(train_data["Sentiment"] == "Positive") | (train_data["Sentiment"] == "Negative")]
labeled_tweets

Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
1,UK,16-03-2020,advice talk neighbours family exchange phone n...,Positive
2,Vagabonds,16-03-2020,coronavirus australia: woolworths give elderly...,Positive
3,,16-03-2020,"my food stock one empty... please, panic, ther...",Positive
5,"ÃT: 36.319708,-82.363649",16-03-2020,as news regionas first confirmed covid-19 case...,Positive
6,"35.926541,-78.753267",16-03-2020,cashier grocery store sharing insights to pro...,Positive
...,...,...,...,...
41147,"Brooklyn, NY",14-04-2020,yaall really shitting much home??NaN,Negative
41149,"Toronto, Ontario",14-04-2020,still shocked number supermarket employees wo...,Negative
41150,OHIO,14-04-2020,i never wead situation &amp; world going super...,Positive
41154,,14-04-2020,you know itas getting tough rationing toilet ...,Positive


#### Tokenization

In [38]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()

labeled_tweets['OriginalTweet'] = labeled_tweets["OriginalTweet"].apply(lambda x: tokenizer.tokenize(x))
labeled_tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_tweets['OriginalTweet'] = labeled_tweets["OriginalTweet"].apply(lambda x: tokenizer.tokenize(x))


Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
1,UK,16-03-2020,"[advice, talk, neighbours, family, exchange, p...",Positive
2,Vagabonds,16-03-2020,"[coronavirus, australia:, woolworths, give, el...",Positive
3,,16-03-2020,"[my, food, stock, one, empty..., please,, pani...",Positive
5,"ÃT: 36.319708,-82.363649",16-03-2020,"[as, news, regionas, first, confirmed, covid-1...",Positive
6,"35.926541,-78.753267",16-03-2020,"[cashier, grocery, store, sharing, insights, t...",Positive
...,...,...,...,...
41147,"Brooklyn, NY",14-04-2020,"[yaall, really, shitting, much, home??NaN]",Negative
41149,"Toronto, Ontario",14-04-2020,"[still, shocked, number, supermarket, employee...",Negative
41150,OHIO,14-04-2020,"[i, never, wead, situation, &amp;, world, goin...",Positive
41154,,14-04-2020,"[you, know, itas, getting, tough, rationing, t...",Positive


In [40]:
labeled_tweets['OriginalTweet'] = labeled_tweets.loc[:, 'OriginalTweet'].apply(lambda x: ', '.join(x))
labeled_tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_tweets['OriginalTweet'] = labeled_tweets.loc[:, 'OriginalTweet'].apply(lambda x: ', '.join(x))


Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment
1,UK,16-03-2020,"advice, talk, neighbours, family, exchange, ph...",Positive
2,Vagabonds,16-03-2020,"coronavirus, australia:, woolworths, give, eld...",Positive
3,,16-03-2020,"my, food, stock, one, empty..., please,, panic...",Positive
5,"ÃT: 36.319708,-82.363649",16-03-2020,"as, news, regionas, first, confirmed, covid-19...",Positive
6,"35.926541,-78.753267",16-03-2020,"cashier, grocery, store, sharing, insights, to...",Positive


In [41]:
labeled_tweets.loc[:, 'OriginalTweet']

1        advice, talk, neighbours, family, exchange, ph...
2        coronavirus, australia:, woolworths, give, eld...
3        my, food, stock, one, empty..., please,, panic...
5        as, news, regionas, first, confirmed, covid-19...
6        cashier, grocery, store, sharing, insights, to...
                               ...                        
41147             yaall, really, shitting, much, home??NaN
41149    still, shocked, number, supermarket, employees...
41150    i, never, wead, situation, &amp;, world, going...
41154    you, know, itas, getting, tough, rationing, to...
41156    NaNwell, new/used, rift, s, going, $700.00, am...
Name: OriginalTweet, Length: 21336, dtype: object

In [44]:
labeled_tweets['parsed_tweets'] = labeled_tweets.loc[:,"OriginalTweet"].apply(lambda x: modelo(x))
labeled_tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_tweets['parsed_tweets'] = labeled_tweets.loc[:,"OriginalTweet"].apply(lambda x: modelo(x))


Unnamed: 0,Location,TweetAt,OriginalTweet,Sentiment,parsed_tweets
1,UK,16-03-2020,"advice, talk, neighbours, family, exchange, ph...",Positive,"(advice, ,, talk, ,, neighbours, ,, family, ,,..."
2,Vagabonds,16-03-2020,"coronavirus, australia:, woolworths, give, eld...",Positive,"(coronavirus, ,, australia, :, ,, woolworths, ..."
3,,16-03-2020,"my, food, stock, one, empty..., please,, panic...",Positive,"(my, ,, food, ,, stock, ,, one, ,, empty, ...,..."
5,"ÃT: 36.319708,-82.363649",16-03-2020,"as, news, regionas, first, confirmed, covid-19...",Positive,"(as, ,, news, ,, regionas, ,, first, ,, confir..."
6,"35.926541,-78.753267",16-03-2020,"cashier, grocery, store, sharing, insights, to...",Positive,"(cashier, ,, grocery, ,, store, ,, sharing, ,,..."


### Using Scattertext to Visualize

In [73]:
from IPython.display import IFrame, display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer

corpus = st.CorpusFromPandas(labeled_tweets.iloc[:1000,:],category_col="Sentiment",
                             text_col='parsed_tweets',
                             nlp=modelo).build()

In [75]:
html = st.produce_scattertext_explorer(corpus, category = 'Negative', category_name='Negative',
                                      not_category_name='Positive', minimum_term_frequency=10,
                                      width_in_pixels=1000, transform = st.Scalers.log_scale_standardize)
file_name = 'scattertextdemo_plot.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1000, height = 800)