# Import data

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install swifter

In [None]:
pip install "pytextrank < 3"

In [2]:
import gdown
import zipfile

import pandas as pd 
import scattertext as st
import swifter
import spacy 
import pytextrank
import numpy as np 
import codecs
from bs4 import BeautifulSoup


## Download from the cloud (Optional)

If you didn't run preprocessing.ipynb, run this cell to download the processed data (read below as `Data/processed_train_data.pkl`) from Google Drive.

In [7]:
url = 'https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3'
output = 'Twitter.zip'

# Skip the ~120 MB download when a valid archive is already on disk, so that
# re-running the notebook does not fetch it again. is_zipfile() returns False
# for a missing or unreadable file, which triggers a fresh download.
if not zipfile.is_zipfile(output):
    gdown.download(url, output, quiet=False)


# Extract the zip file. The data is saved under the Data directory
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall('.')

Downloading...
From: https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3
To: /home/user/Data-science/visualization/scattertext/Twitter.zip
120MB [00:09, 12.1MB/s] 


## Read data

In [19]:
# Load the preprocessed tweets.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
processed_path = 'Data/processed_train_data.pkl'
df = pd.read_pickle(processed_path)

# Visualize data

In [4]:
def join_tokens(text):
    """Return the tweet as one space-separated string.

    A sequence of tokens is joined with single spaces. A value that is already
    a string is returned unchanged, so re-running this cell (or running it on
    data whose tweets were joined earlier) does not insert a space between
    every character.
    """
    if isinstance(text, str):
        return text
    return ' '.join(text)


# Bracket assignment makes it explicit that a column (not an attribute) is being set
df['tweets'] = df.tweets.swifter.apply(join_tokens)

Pandas Apply:   0%|          | 0/3600 [00:00<?, ?it/s]

In [None]:
# Build the 'parse' column that st.CorpusFromParsedDocuments consumes later on
parsed_tweets = df.tweets.swifter.apply(st.whitespace_nlp_with_sentences)
df['parse'] = parsed_tweets

The step above can take a long time. If you prefer not to wait, interrupt the run and execute the cell below instead.

In [6]:
# Load the copy of the data that already contains the 'parse' column.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
parsed_path = 'Data/processed_train_data_with_parse.pkl'
df = pd.read_pickle(parsed_path)

In [7]:
# Preview the first 10 rows of the loaded frame
df.head(10)

Unnamed: 0,tweets,gender,country,parse
0,fucking terrify nemesis record bad enemy know ...,male,canada,"(fucking, terrify, nemesis, record, bad, enemy..."
1,poetry shelf summer season poet pick poem pick...,female,new zealand,"(poetry, shelf, summer, season, poet, pick, po..."
2,say truth attest cloud sky joke how okay s coo...,female,canada,"(say, truth, attest, cloud, sky, joke, how, ok..."
3,seem right pretty accurate new breakfast wrap ...,female,canada,"(seem, right, pretty, accurate, new, breakfast..."
4,s word touch early leave night morning work ba...,male,ireland,"(s, word, touch, early, leave, night, morning,..."
5,anticipation kill wait hope climb fall brillia...,female,great britain,"(anticipation, kill, wait, hope, climb, fall, ..."
6,far sit greeting eye lilo stitch emosh day wee...,female,great britain,"(far, sit, greeting, eye, lilo, stitch, emosh,..."
7,fantastic list resource beautiful social mediu...,female,great britain,"(fantastic, list, resource, beautiful, social,..."
8,thank station gordon min record spin wait air ...,male,ireland,"(thank, station, gordon, min, record, spin, wa..."
9,work hope let film tomorrow biscuit home morni...,female,great britain,"(work, hope, let, film, tomorrow, biscuit, hom..."


In [8]:
# Build a corpus split by gender from the pre-parsed tweets, then keep unigrams only
corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='gender',
    parsed_col='parse',
)
corpus = corpus_factory.build().get_unigram_corpus()

In [9]:
# Reduce the number of terms displayed in the chart to 2000
term_compactor = st.AssociationCompactor(2000)
corpus = corpus.compact(term_compactor)

- `pmi_threshold_coefficient`: threshold on pointwise mutual information (PMI) used to filter out bigrams. PMI is 0 when two words are independent.

In [10]:
# Collect the explorer settings first, then render the interactive chart as HTML
explorer_options = dict(
    category='female',
    category_name='Female',
    not_category_name='Male',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    # Each document's country is passed as its metadata
    metadata=corpus.get_df()['country'],
    transform=st.Scalers.dense_rank,
)
html = st.produce_scattertext_explorer(corpus, **explorer_options)

In [11]:
# Write the chart to disk. The context manager closes the file deterministically,
# and the explicit UTF-8 encoding keeps non-ASCII tweet text from failing on
# platforms whose default encoding is not UTF-8.
with open('./tweets_gender.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)


11880362

# Visualizing Phrase associations


Install [PyTextRank](https://github.com/DerwenAI/pytextrank) before running the code below. This notebook uses a release older than 3.0, matching the install cell at the top:
```bash
pip install "pytextrank < 3"
```

In [12]:
# Load spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')

# Run every tweet through spaCy and store the resulting docs in the 'parse' column
spacy_docs = df.tweets.apply(nlp)
df = df.assign(parse=spacy_docs)

In [13]:
# Corpus split by gender whose features come from PyTextRank phrases
phrase_corpus_factory = st.CorpusFromParsedDocuments(
    df,
    category_col='gender',
    parsed_col='parse',
    feats_from_spacy_doc=st.PyTextRankPhrases(),
)

# Compact on the non-text (metadata) features, using a 2000-term association compactor
phrase_compactor = st.AssociationCompactor(2000, use_non_text_features=True)
corpus = phrase_corpus_factory.build().compact(phrase_compactor)

In [14]:
# Per-category metadata scores for each term, one column per category.
# The '' argument is presumably the suffix appended to the column names — TODO confirm
term_category_scores = corpus.get_metadata_freq_df('')
# Preview the first 10 terms
term_category_scores.head(10)

Unnamed: 0_level_0,male,female
term,Unnamed: 1_level_1,Unnamed: 2_level_1
fact,3.932811,3.090892
money,4.988702,5.294535
french,3.018954,2.895008
british,3.380062,2.216202
-,3.474278,2.653272
reminder,0.256091,0.267291
paper,0.304292,0.31442
poem,0.049069,0.265253
year week,0.140695,0.304045
summer day,0.361183,0.621384


In [64]:
# Rank of each term within each category (1 = highest score): the argsort of an
# argsort converts the descending sort order into rank positions
descending_order = np.argsort(-term_category_scores, axis=0)
term_ranks = np.argsort(descending_order, axis=0) + 1


def _rank_summary(term):
    """Build the HTML snippet listing a term's TextRank rank in every category."""
    lines = []
    for cat in corpus.get_categories():
        lines.append(
            '<b>%s</b> TextRank score rank: %s/%s'
            % (cat, term_ranks.loc[term, cat], corpus.get_num_metadata())
        )
    return '<br/>' + '<br/>'.join(lines)


# Text displayed when a term is clicked
metadata_descriptions = {
    term: _rank_summary(term) for term in corpus.get_metadata()
}

In [83]:
def _signed_prominence(row):
    """Return the female score when it is strictly larger, else the negated male score."""
    if row.female > row.male:
        return row.female
    return -row.male


# One signed score per term: positive for female-leaning terms, negative otherwise
category_specific_prominence = term_category_scores.apply(_signed_prominence, axis=1)

In [84]:
# Inspect the signed per-term prominence scores computed above
category_specific_prominence

term
change                         -1.664847
jewish                         -1.611201
canadian                        5.083046
democratic                     -1.200413
british                        -3.243414
                                  ...   
late carman                     8.516214
welcome app thank connect      -5.051565
why follow look website info   -3.450510
more person                     0.708264
many thank follow guy          -0.758160
Length: 2000, dtype: float64

In [79]:
# Each document's country is passed as its metadata
country_metadata = corpus.get_df()['country']

# Every metadata term is mapped to a one-element list containing itself
single_term_topics = {term: [term] for term in corpus.get_metadata()}

# Render the phrase-level chart, positioned by the signed prominence scores
html = st.produce_scattertext_explorer(
    corpus,
    category='female',
    not_category_name='male',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.dense_rank,
    metadata=country_metadata,
    scores=category_specific_prominence,
    sort_by_dist=False,
    use_non_text_features=True,
    topic_model_term_lists=single_term_topics,
    topic_model_preview_size=0,
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True,
)

In [80]:
# Write the chart to disk. The context manager closes the file deterministically,
# and the explicit UTF-8 encoding keeps non-ASCII tweet text from failing on
# platforms whose default encoding is not UTF-8.
with open('./tweets_gender_textrank.html', 'w', encoding='utf-8') as html_file:
    html_file.write(html)

13521834