# Exploring Scattertext library capabilities

Here's the documentation for what I'm attempting to adapt: https://github.com/JasonKessler/scattertext

In [1]:
# !pip install scattertext

In [2]:
# !pip install spacy

In [4]:
# !python -m spacy download en_core_web_sm

In [1]:
import scattertext as st
import pandas as pd
import numpy as np
# from pprint import pprint
import spacy
import en_core_web_sm

In [2]:
# Names of the available tweet CSV files, one per query shorthand.
tweet_csv_files = [
    f'tweets_{topic}_2018.csv'
    for topic in ('lovehate', 'happysad', 'music', 'money',
                  'nowords', 'politics', 'coding')
]

In [3]:
# Which query's tweets to analyze; the shorthand is also reused in the
# names of the files this notebook writes and reads later on.
query_shorthand = 'nowords'
filename = f'tweets_{query_shorthand}_2018.csv'

# Read the CSV into a DataFrame, parse `timestamp` (format YYYYMMDD) into
# datetimes, and keep only the rows whose sentiment is positive or negative.
df = (
    pd.read_csv(filename)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], format='%Y%m%d'))
    .loc[lambda frame: frame['sentiment'].isin(['positive', 'negative'])]
)

In [4]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

Unnamed: 0,timestamp,text,sentiment,polarity,subjectivity,tally
0,2018-01-01,I am officially obsessed with black mirror,negative,-0.333333,0.716667,1
1,2018-01-01,"Ack, that's the worst!",negative,-1.0,1.0,1
4,2018-01-01,"Lol, they be wildin tf out.",positive,0.8,0.7,1
5,2018-01-01,Cold Morning,negative,-0.6,1.0,1
6,2018-01-01,Lol. Low IQ is what Obama had. How come Obama ...,positive,0.1,0.433333,1


In [5]:
####  NOTE: Building the corpus from scratch takes about 1-2 hours  ####
#
# A bare `break` outside a loop is a SyntaxError, so it cannot be used as an
# on/off switch for a cell. Instead, the expensive build is guarded by an
# on-disk cache: if a pickled corpus for this query already exists it is
# loaded, otherwise the corpus is built once and pickled for the next session.
# Delete the cache file to force a rebuild (e.g. after changing `df`).
import os
import pickle

corpus_cache_path = f"scattertext_{query_shorthand}_corpus.obj"

if os.path.exists(corpus_cache_path):
    # Only unpickle files you created yourself: pickle.load can execute
    # arbitrary code embedded in the file.
    with open(corpus_cache_path, "rb") as cache_file:
        corpus = pickle.load(cache_file)
else:
    # Turn the data frame into a Scattertext Corpus to begin analyzing it.
    nlp = en_core_web_sm.load()
    corpus = st.CorpusFromPandas(data_frame=df,
                                 category_col='sentiment',
                                 text_col='text',
                                 nlp=nlp).build()
    # Persist the result so later sessions can skip the long build.
    with open(corpus_cache_path, "wb") as cache_file:
        pickle.dump(corpus, cache_file)

In [7]:
# Top ten terms that set this corpus apart from a general English corpus.
corpus.get_scaled_f_scores_vs_background().index[:10].tolist()

['barometer',
 'lmao',
 'gon',
 'twitter',
 'trump',
 'tweet',
 'humidity',
 'lol',
 'liked',
 'facebook']

In [8]:
# Ten terms most strongly associated with the 'positive' category:
# attach the scaled F-score for 'positive' to the term-frequency table,
# then rank the terms by that score, highest first.
term_freq_df = corpus.get_term_freq_df().assign(
    Positivity_Score=corpus.get_scaled_f_scores('positive')
)
term_freq_df.sort_values(by='Positivity_Score', ascending=False).index[:10].tolist()

['happy birthday',
 'awesome',
 'wonderful',
 'beautiful',
 'the best',
 'to win',
 'proud',
 'proud of',
 'good morning',
 'best']

In [9]:
# Ten terms most strongly associated with the 'negative' category:
# attach the scaled F-score for 'negative' to a fresh term-frequency table,
# then rank the terms by that score, highest first.
term_freq_df = corpus.get_term_freq_df().assign(
    Negativity_Score=corpus.get_scaled_f_scores('negative')
)
term_freq_df.sort_values(by='Negativity_Score', ascending=False).index[:10].tolist()

['falling slowly',
 'rising slowly',
 'horrible',
 'disgusting',
 'in falling',
 'slowly',
 'the worst',
 'i hate',
 'awful',
 'evil']

In [31]:
# Create an HTML page for the interactive visualization
html = st.produce_scattertext_explorer(
    corpus,
    category='positive',
    category_name='Positive',
    not_category_name='Negative',
    # metadata=df['speaker'],
    minimum_term_frequency=25,               ### good value to allow page to load in <2 minutes
    minimum_not_category_term_frequency=25,  ### good value to allow page to load in <2 minutes
    # max_terms=5000,      ### needs to be much higher or else it's a weird graph
    max_snippets=50,
    show_characteristic=True,
    width_in_pixels=1000,
)

# Write through a context manager so the file handle is flushed and closed
# as soon as the page is written (a bare open(...).write(...) leaves the
# handle open until it happens to be garbage-collected).
with open(f"Tweet_{query_shorthand}_Visualization.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))

# Show the size of the written page (in bytes) as the cell output.
bytes_written

32914592

### Saving the object for use in the next session

In [32]:
# Saving the `corpus` object for later use, so i don't have to build it again.
#
# A bare `break` outside a loop is a SyntaxError, so the cell is switched on
# and off with an explicit flag instead.
import pickle

SAVE_CORPUS = False   ### set to True to run this cell

if SAVE_CORPUS:
    # The context manager closes the file even if pickling fails part-way.
    with open(f"scattertext_{query_shorthand}_corpus.obj", "wb") as corpus_file:
        pickle.dump(corpus, corpus_file)

### Loading the object back in once the next session has begun

In [6]:
# when I open this notebook again, I can run this code to reload the object.
#
# A bare `break` outside a loop is a SyntaxError, so the cell is switched on
# and off with an explicit flag instead.
import pickle

LOAD_CORPUS = False   ### set to True to run this cell

if LOAD_CORPUS:
    # Only unpickle files you created yourself: pickle.load can execute
    # arbitrary code embedded in the file.
    with open(f'scattertext_{query_shorthand}_corpus.obj', 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)