# ScatterText

I dati che voglio rappresentare riguardano i discorsi tenuti dai vari oratori alle convention dei partiti americani del 2012, con lo scopo di evidenziare le parole che caratterizzano maggiormente i discorsi di Repubblicani e Democratici.

In [4]:
%matplotlib inline

import scattertext as st
from pprint import pprint
import spacy
import pandas as pd
import numpy as np
from IPython.display import IFrame
from scattertext import SampleCorpora, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas
from scattertext.WhitespaceNLP import whitespace_nlp

Importo i dati in un DataFrame, costruisco il corpus e faccio il parsing dei testi; poi conto i discorsi di Democratici e Repubblicani.

In [5]:
# Fetch the ConventionData2012 sample corpus as a DataFrame and preview it.
convention_sample = st.SampleCorpora.ConventionData2012
convention_df = convention_sample.get_data()
convention_df.head()

Unnamed: 0,party,text,speaker
0,democrat,Thank you. Thank you. Thank you. Thank you so ...,BARACK OBAMA
1,democrat,"Thank you so much. Tonight, I am so thrilled a...",MICHELLE OBAMA
2,democrat,Thank you. It is a singular honor to be here t...,RICHARD DURBIN
3,democrat,"Hey, Delaware. \nAnd my favorite Democrat, Jil...",JOSEPH BIDEN
4,democrat,"Hello. \nThank you, Angie. I'm so proud of how...",JILL BIDEN


In [6]:
# Parser applied to every speech (scattertext's whitespace_nlp).
nlp = st.WhitespaceNLP.whitespace_nlp

# Build the corpus from the raw text, using the party as document category.
corpus_factory = st.CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp,
)
corpus = corpus_factory.build()

# Keep the parsed form of each speech in its own column, then preview.
convention_df['parsed'] = convention_df['text'].apply(nlp)
convention_df.head()

Unnamed: 0,party,text,speaker,parsed
0,democrat,Thank you. Thank you. Thank you. Thank you so ...,BARACK OBAMA,"(thank, , you, ., , thank, , you, ., , tha..."
1,democrat,"Thank you so much. Tonight, I am so thrilled a...",MICHELLE OBAMA,"(thank, , you, , so, , much, ., , tonight,..."
2,democrat,Thank you. It is a singular honor to be here t...,RICHARD DURBIN,"(thank, , you, ., , it, , is, , a, , sing..."
3,democrat,"Hey, Delaware. \nAnd my favorite Democrat, Jil...",JOSEPH BIDEN,"(hey, ,, , delaware, ., , \n, and, , my, ,..."
4,democrat,"Hello. \nThank you, Angie. I'm so proud of how...",JILL BIDEN,"(hello, ., , \n, thank, , you, ,, , angie, ..."


In [7]:
# Report how many speeches (non-null 'text' values) each party has.
print("Document Count")
docs_per_party = convention_df.groupby('party')['text'].count()
print(docs_per_party)

Document Count
party
democrat      123
republican     66
Name: text, dtype: int64


Termini più caratteristici dei Democratici (i primi 10 per scaled F-score)

In [8]:
# Attach the scaled F-score of each term for the 'democrat' category to the
# term-frequency table, then show the ten highest-scoring terms.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')
top_democratic = term_freq_df.sort_values(
    by='Democratic Score', ascending=False
).head(10)
pprint(top_democratic.index.tolist())

['middle class',
 'forward',
 'class',
 'middle',
 'the middle',
 'medicare',
 'pay',
 'education',
 'health',
 'president obama']


Termini più caratteristici dei Repubblicani (i primi 10 per scaled F-score)

In [9]:
# Attach the scaled F-score of each term for the 'republican' category to a
# fresh term-frequency table, then show the ten highest-scoring terms.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')
top_republican = term_freq_df.sort_values(
    by='Republican Score', ascending=False
).head(10)
pprint(top_republican.index.tolist())

['government',
 'administration',
 'can do',
 'business',
 'unemployment',
 'success',
 'story',
 'freedom',
 'do better',
 'paul']


In [10]:
# Render the explorer HTML for the convention corpus, contrasting the
# 'democrat' category against the remaining documents; each document is
# labelled with its speaker.
html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
)

# Write through a context manager so the file is flushed and closed before
# the IFrame below loads it (the bare open(...).write(...) never closed it).
with open("Convention-Visualization.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Last expression of the cell: displays the saved page inline.
IFrame(src="Convention-Visualization.html", width=900, height=800)

Voglio visualizzare i principali argomenti invece delle singole parole.
FeatsFromOnlyEmpath estrae come feature solo le categorie tematiche di Empath, caratterizzando ogni documento in base agli argomenti di cui parla.

In [11]:
from scattertext import FeatsFromOnlyEmpath

# Feature extractor used in place of the default term features.
feat_builder = st.FeatsFromOnlyEmpath()

# NOTE(review): the raw 'text' column is passed as parsed_col here, not the
# 'parsed' column — presumably the Empath feature builder only needs the
# document's string form; confirm against the scattertext documentation.
empath_corpus = st.CorpusFromParsedDocuments(
    convention_df,
    category_col='party',
    feats_from_spacy_doc=feat_builder,
    parsed_col='text',
).build()

# Render the explorer on the non-text (topic) features of the corpus.
html = st.produce_scattertext_explorer(
    empath_corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    metadata=convention_df['speaker'],
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=feat_builder.get_top_model_term_lists(),
)

# Write through a context manager so the file is flushed and closed before
# the IFrame below loads it (the bare open(...).write(...) never closed it).
with open("Convention-Visualization-Empath.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Last expression of the cell: displays the saved page inline.
IFrame(src="Convention-Visualization-Empath.html", width=900, height=800)

## Analisi basata sulle emoji

Utilizzo un insieme di tweet per analizzare come l'uso delle emoji presenti nei post varia in base al genere dell'autore.

In [12]:
from scattertext.termranking import OncePerDocFrequencyRanker
import nltk, urllib.request, io, agefromname, zipfile
import warnings
warnings.filterwarnings("ignore")

In [13]:
# Download the zipped tweet dataset into memory. The response is used as a
# context manager so the HTTP connection is closed once the bytes are read.
with urllib.request.urlopen(
    'http://followthehashtag.com/content/uploads/USA-Geolocated-tweets-free-dataset-Followthehashtag.zip'
) as response:
    archive_bytes = response.read()

# Read the spreadsheet straight out of the in-memory archive; both the
# archive and the member file are closed when their blocks exit.
with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zf:
    with zf.open('dashboard_x_usa_x_filter_nativeretweets.xlsx') as workbook:
        df = pd.read_excel(workbook)

# Rebinds `nlp` to a tweet tokenizer built on NLTK's TweetTokenizer.
# NOTE(review): the factory name is spelled exactly as in the original call
# ("tokenizier") — confirm it matches the installed scattertext version.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())
df['parse'] = df['Tweet content'].apply(nlp)

# Preview the first record.
df.iloc[0]

Tweet Id                                                     721318437075685382
Date                                                                 2016-04-16
Hour                                                                      12:44
User Name                                                        Bill Schulhoff
Nickname                                                          BillSchulhoff
Bio                           Husband,Dad,GrandDad,Ordained Minister, Umpire...
Tweet content                 Wind 3.2 mph NNE. Barometer 30.20 in, Rising s...
Favs                                                                        NaN
RTs                                                                         NaN
Latitude                                                                40.7603
Longitude                                                              -72.9547
Country                                                                      US
Place (as appears on Bio)               

Il codice seguente usa il package AgeFromName per trovare la probabilità che ogni utente sia maschio o femmina, sulla base del nome

In [None]:
# Table of per-name male probabilities from AgeFromName; preview one row.
name_model = agefromname.AgeFromName()
male_prob = name_model.get_all_name_male_prob()
male_prob.iloc[0]

Ora consideriamo solo gli utenti che sappiamo classificare con una probabilità almeno del 90% come maschi o femmine

In [15]:
def _first_name(user_name):
    """Return the lower-cased first whitespace-separated token of *user_name*.

    Values that are not strings (e.g. missing values) and strings without
    any token are returned unchanged.
    """
    if not isinstance(user_name, str):
        return user_name
    tokens = user_name.split()
    return tokens[0].lower() if tokens else user_name


def _gender_from_male_prob(prob):
    """Map a male probability to 'm' (> 0.9), 'f' (< 0.1) or '?' otherwise."""
    if prob > 0.9:
        return 'm'
    if prob < 0.1:
        return 'f'
    return '?'


# Join each tweet with the probability row of its author's first name; the
# merge keeps only rows whose first name appears in the male_prob index.
df['first_name'] = df['User Name'].apply(_first_name)
df_aug = pd.merge(df, male_prob, left_on='first_name', right_index=True)

# Keep only the users that could be labelled as male or female.
df_aug['gender'] = df_aug['prob'].apply(_gender_from_male_prob)
df_mf = df_aug[df_aug['gender'].isin(['m', 'f'])]

Poi costruiamo un corpus di sole emoji

In [16]:
# Build a corpus from the already-parsed tweets, categorised by gender and
# using the emoji-only feature extractor.
emoji_feats = st.FeatsFromSpacyDocOnlyEmoji()
emoji_corpus_factory = st.CorpusFromParsedDocuments(
    df_mf,
    category_col='gender',
    parsed_col='parse',
    feats_from_spacy_doc=emoji_feats,
)
corpus = emoji_corpus_factory.build()

In [17]:
# Per-document label shown in the explorer: "User Name (@Nickname) Date".
tweet_labels = (
    df_mf['User Name']
    + ' (@' + df_mf['Nickname'] + ') '
    + df_mf['Date'].astype(str)
)

# Render the explorer HTML, contrasting category 'f' against the rest.
html = st.produce_scattertext_explorer(
    corpus,
    category='f',
    category_name='Female',
    not_category_name='Male',
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=tweet_labels,
    width_in_pixels=1000,
)

# Write through a context manager so the file is flushed and closed before
# the IFrame below loads it (the bare open(...).write(...) never closed it).
with open("EmojiGender.html", 'wb') as html_file:
    html_file.write(html.encode('utf-8'))

# Last expression of the cell: displays the saved page inline.
IFrame(src="EmojiGender.html", width=900, height=800)

Fonte: https://github.com/JasonKessler/scattertext