# Import data

In [54]:
# Silence every Python warning for the rest of the session so library
# deprecation noise does not clutter the cell outputs.
# NOTE(review): this also hides warnings that may signal real problems;
# consider narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import gdown
import zipfile

import pandas as pd 
import scattertext as st
import swifter
import spacy 
import pytextrank
import numpy as np 

## Download from the cloud (Optional)

If you haven't run `preprocessing.ipynb`, run the cell below to download the processed training data (`processed_train_data.pkl`) from Google Drive.

In [None]:
# Fetch the zipped dataset from Google Drive into the working directory.
url = 'https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3'
output = 'Twitter.zip'
gdown.download(url=url, output=output, quiet=False)

# Unpack the archive next to the notebook; the data files end up under Data/.
with zipfile.ZipFile(output) as archive:
    archive.extractall(path='.')

## Read data

In [37]:
# Load the preprocessed training data.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a source you trust (your own preprocessing run or the
# project's archive).
df = pd.read_pickle('Data/processed_train_data.pkl')

# Visualize data

In [39]:
# Collapse each row's tweet tokens into one whitespace-separated string
# (assumes each entry is an iterable of string tokens — TODO confirm against
# preprocessing.ipynb).
# The isinstance guard keeps this cell idempotent: if it is re-run, or if `df`
# already holds joined strings, the text is left untouched instead of having a
# space inserted between every character.
df['tweets'] = df['tweets'].swifter.apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)


Pandas Apply: 100%|██████████| 3600/3600 [00:00<00:00, 102960.69it/s]


In [None]:
# Add a `parse` column holding the whitespace-tokenised form of every tweet;
# st.CorpusFromParsedDocuments consumes this column further down.
df['parse'] = df['tweets'].swifter.apply(st.whitespace_nlp_with_sentences)

The step above can take a long time. If you prefer not to wait, interrupt it and run the cell below instead, which loads a precomputed copy of the data that already includes the `parse` column.

In [45]:
# Shortcut for the slow parsing step: load the data with the `parse` column
# already computed. This replaces whatever `df` currently holds.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('Data/processed_train_data_with_parse.pkl')

In [46]:
# Preview the first ten rows to check the tweets, labels and parse columns.
df.head(10)

Unnamed: 0,tweets,gender,country,parse
0,fucking terrify nemesis record bad enemy know ...,male,canada,"(fucking, terrify, nemesis, record, bad, enemy..."
1,poetry shelf summer season poet pick poem pick...,female,new zealand,"(poetry, shelf, summer, season, poet, pick, po..."
2,say truth attest cloud sky joke how okay s coo...,female,canada,"(say, truth, attest, cloud, sky, joke, how, ok..."
3,seem right pretty accurate new breakfast wrap ...,female,canada,"(seem, right, pretty, accurate, new, breakfast..."
4,s word touch early leave night morning work ba...,male,ireland,"(s, word, touch, early, leave, night, morning,..."
5,anticipation kill wait hope climb fall brillia...,female,great britain,"(anticipation, kill, wait, hope, climb, fall, ..."
6,far sit greeting eye lilo stitch emosh day wee...,female,great britain,"(far, sit, greeting, eye, lilo, stitch, emosh,..."
7,fantastic list resource beautiful social mediu...,female,great britain,"(fantastic, list, resource, beautiful, social,..."
8,thank station gordon min record spin wait air ...,male,ireland,"(thank, station, gordon, min, record, spin, wa..."
9,work hope let film tomorrow biscuit home morni...,female,great britain,"(work, hope, let, film, tomorrow, biscuit, hom..."


In [47]:
# Build a scattertext corpus from the pre-parsed tweets, labelled by gender,
# and keep only single-word (unigram) terms.
corpus = (
    st.CorpusFromParsedDocuments(df, category_col='gender', parsed_col='parse')
    .build()
    .get_unigram_corpus()
)

In [48]:
# Reduce the number of terms displayed in the chart to 2000. The corpus is
# unigram-only at this point, so these are single words rather than phrases.
# Note: this rebinds `corpus` to the compacted version.
corpus = corpus.compact(st.AssociationCompactor(2000))

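- `pmi_threshold_coefficient`: threshold on pointwise mutual information (PMI) that scattertext uses to filter out terms. PMI measures how much more often two words occur together than chance would predict; it is 0 when the two words are independent. Setting the coefficient to 0 disables this filtering.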

In [49]:
# Render the interactive female-vs-male term chart as an HTML string.
# Each document is labelled with its country, and term frequencies are
# scaled with dense ranking.
html = st.produce_scattertext_explorer(
    corpus,
    category='female',
    category_name='Female',
    not_category_name='Male',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['country'],
    transform=st.Scalers.dense_rank,
)

In [50]:
# Save the chart as a standalone HTML page. The context manager guarantees the
# file handle is flushed and closed, and the explicit UTF-8 encoding avoids
# platform-dependent failures when the tweets contain non-ASCII characters.
with open('./tweets_gender.html', 'w', encoding='utf-8') as html_file:
    n_chars_written = html_file.write(html)

# Show the number of characters written as the cell output.
n_chars_written


11870129

# Visualizing Phrase associations


Install [PyTextRank](https://github.com/DerwenAI/pytextrank) before running the code below:
```bash
pip install pytextrank
```

In [52]:
# Load the English spaCy pipeline.
# NOTE(review): 'en' is a shortcut name; newer spaCy releases appear to
# require the full package name (e.g. 'en_core_web_sm') — confirm against the
# installed spaCy version.
nlp = spacy.load('en')

# Replace the whitespace-based `parse` column with full spaCy documents,
# one per tweet; the PyTextRank feature extractor below works on these.
df = df.assign(parse=lambda data: data.tweets.apply(nlp))

In [None]:
# Rebuild the corpus from the spaCy parses, this time extracting PyTextRank
# phrases as non-text (metadata) features, then compact it to 2000 of them.
corpus = (
    st.CorpusFromParsedDocuments(
        df,
        category_col='gender',
        parsed_col='parse',
        feats_from_spacy_doc=st.PyTextRankPhrases(),
    )
    .build()
    .compact(st.AssociationCompactor(2000, use_non_text_features=True))
)

In [57]:
# Per-category scores of every phrase stored as corpus metadata: one row per
# phrase, one column per gender. The '' argument is presumably the suffix
# appended to the column labels, leaving them as bare category names —
# TODO confirm against the scattertext API.
term_category_scores = corpus.get_metadata_freq_df('')
term_category_scores.head(10)

Unnamed: 0_level_0,male,female
term,Unnamed: 1_level_1,Unnamed: 2_level_1
change,1.664847,1.61762
jewish,1.611201,0.893873
canadian,4.828005,5.083046
democratic,1.200413,1.028026
british,3.243414,2.122479
fact,3.845346,2.889458
annoy,0.082487,0.234663
french,2.676859,2.466787
summer day,0.336981,0.545745
monthly,1.087747,1.601123


In [64]:
# Get the rank of each term in each category (1 = highest score).
# The argsort-of-argsort is computed on the raw values and wrapped back into a
# DataFrame explicitly: whether np.argsort returns a DataFrame or a plain
# ndarray for DataFrame input depends on the numpy/pandas versions, and the
# .loc lookups below need the term index and category columns.
score_ranks = np.argsort(np.argsort(-term_category_scores.values, axis=0), axis=0) + 1
term_ranks = pd.DataFrame(
    score_ranks,
    index=term_category_scores.index,
    columns=term_category_scores.columns,
)

# These do not change per term, so look them up once.
num_phrases = corpus.get_num_metadata()
categories = corpus.get_categories()

# Text displayed when a term is clicked: one "<category> rank/total" line
# per category.
metadata_descriptions = {
    term: '<br/>' + '<br/>'.join(
        '<b>%s</b> TextRank score rank: %s/%s' % (cat, term_ranks.loc[term, cat], num_phrases)
        for cat in categories)
    for term in corpus.get_metadata()
}

In [78]:
def _signed_prominence(row):
    """Return the female score when it is the larger one, else minus the male score."""
    if row['female'] > row['male']:
        return row['female']
    return -row['male']


# One signed score per term: positive for female-leaning terms, negative for
# male-leaning (or tied) ones.
category_specific_prominence = term_category_scores.apply(_signed_prominence, axis=1)

In [79]:
# Inputs derived from the corpus, computed up front for readability.
doc_countries = corpus.get_df()['country']
phrase_term_lists = {term: [term] for term in corpus.get_metadata()}

# Render the phrase-level chart as an HTML string, using the signed
# prominence values as term scores and the rank text as click descriptions.
html = st.produce_scattertext_explorer(
    corpus,
    category='female',
    not_category_name='male',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    transform=st.dense_rank,
    metadata=doc_countries,
    scores=category_specific_prominence,
    sort_by_dist=False,
    use_non_text_features=True,
    topic_model_term_lists=phrase_term_lists,
    topic_model_preview_size=0,
    metadata_descriptions=metadata_descriptions,
    use_full_doc=True,
)

In [80]:
# Save the phrase chart as a standalone HTML page. The context manager
# guarantees the file handle is flushed and closed, and the explicit UTF-8
# encoding avoids platform-dependent failures when the tweets contain
# non-ASCII characters.
with open('./tweets_gender_textrank.html', 'w', encoding='utf-8') as html_file:
    n_chars_written = html_file.write(html)

# Show the number of characters written as the cell output.
n_chars_written

13521834