# Count hashtags in corpus
This notebook counts how often each hashtag from the referendum hashtag list occurs in our tweet corpus.

In [9]:
import pandas as pd
import json
import re
import codecs
from collections import Counter

## Load data

In [5]:
# Read the two hashtag tables from disk (NOTE(review): judging by the file
# names these are the hashtag list and its expanded spelling variants - confirm).
hashtag_df = pd.read_csv('../../data/expanded_hashtags.csv', index_col=False)
expanded_hashtag_df = pd.read_csv('../../data/expanded_fixed_hashtags.csv', index_col=False)
# Join the tables where `hashtag` equals `original` (pandas' default inner
# join), then discard the columns that the rest of the notebook does not read.
hashtag_df = hashtag_df.merge(
    expanded_hashtag_df,
    left_on='hashtag',
    right_on='original',
).drop(['s', 'original', 'comments'], axis=1)
print(hashtag_df.head())

                 hashtag sentiment               expanded
0  DemocraciaMarcaEspaña       pro  democraciamarcaespaña
1  DemocraciaMarcaEspaña       pro  democraciamarcaespana
2  DemocraciaMarcaEspaña       pro  DemocraciaMarcaEspana
3  DemocraciaMarcaEspaña       pro  DemocraciaMarcaEspaña
4          CataluñaLibre       pro          CatalunaLibre


In [8]:
# Matches the text that follows a '#': one or more characters up to (but not
# including) the next space, '.' or '#'. The lookbehind keeps the '#' itself
# out of the match. Other punctuation and whitespace (e.g. ',' or a newline)
# do NOT end a hashtag under this pattern.
# The pattern is a raw string so that `\#` and `\.` reach the regex engine
# as-is instead of being parsed as (invalid) string-literal escapes.
hashtag_matcher = re.compile(r'(?<=\#)[^ \.\#]+')
# quick sanity check: adjacent hashtags are split at the second '#'
test = 'blah #democracia#yes blah'
print(hashtag_matcher.findall(test))

['democracia', 'yes']


In [13]:
# collect hashtags
# Each line of the archive is parsed as one JSON-encoded tweet; every hashtag
# found in the tweet's 'text' field is tallied in hashtag_counter.
ct_data_file = '../../data/tweets/archive_Jan-01-17_Oct-31-17_ref_hashtags.json'
hashtag_counter = Counter()
# the with-block closes the file handle even if reading stops part-way
with codecs.open(ct_data_file, 'r', encoding='utf-8') as ct_data:
    for l in ct_data:
        try:
            j = json.loads(l.strip())
            j_txt = j['text']
            j_hashtags = hashtag_matcher.findall(j_txt)
            hashtag_counter.update(j_hashtags)
        # Skip (and report) lines that are not valid JSON (ValueError), have no
        # 'text' key (KeyError), or are not a dict / have non-string text
        # (TypeError). The tuple form parses on both Python 2.6+ and Python 3,
        # unlike the Python-2-only `except Exception, e`.
        except (ValueError, KeyError, TypeError):
            print('problem with tweet %s'%(l.strip()))

problem with tweet {"created_at":"Sun Oct 01 08:07:26 +0000 2017","id":914401355921534976,"id_str":"914401355921534976","text":"Da lo que tienes para que merezcas recibir lo que te falta #CatalanReferendum https:\/\/t.co\/6QGsRumNi4","display_text_range":[0,77],"source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2897985604,"id_str":"2897985604","name":"Giulio Toscani","screen_name":"giuliotoscani","location":"Barcelona, Catalu\u00f1a","url":null,"description":"Education is the path from cocky ignorance to miserable uncertainty.","translator_type":"none","


In [14]:
# Show the 10 most frequent raw hashtag strings as (hashtag, count) pairs;
# differently-cased spellings are still separate entries at this point.
print(hashtag_counter.most_common(10))

[(u'CatalanReferendum', 24614), (u'1Oct', 7237), (u'1O', 5265), (u'1of1', 3031), (u'1OCT', 2756), (u'referendum', 2496), (u'1oct', 2447), (u'Catalonia', 2264), (u'referendumCAT', 1992), (u'votarem', 1815)]


Hashtag counts make sense! Let's use these for our "full" counts and then compute the actual counts from the smaller filtered dataset.

In [17]:
hashtag_count_series = pd.Series(hashtag_counter)
# filter: keep only the observed hashtags that are a known expanded variant
overlap_hashtags = set(hashtag_count_series.index) & set(hashtag_df.loc[:, 'expanded'])
# .loc needs a list here: pandas 2.x raises TypeError for a set indexer
hashtag_count_series = hashtag_count_series.loc[list(overlap_hashtags)].sort_values(ascending=False)
# collapse: add up the counts of all expanded variants of each original hashtag
hashtag_counts_collapsed = {}
for original_hashtag, hashtag_group in hashtag_df.groupby('hashtag'):
    o_overlap_expanded = set(hashtag_group.loc[:, 'expanded']) & set(hashtag_count_series.index)
    # original hashtags with no observed variant are left out entirely
    if o_overlap_expanded:
        hashtag_counts_collapsed[original_hashtag] = hashtag_count_series.loc[list(o_overlap_expanded)].sum()
hashtag_counts_collapsed = pd.Series(hashtag_counts_collapsed).sort_values(ascending=False)
print(hashtag_counts_collapsed)

CatalanReferendum           25603
1Oct                         9684
1O                           6174
Votarem                      3592
referéndum                   2496
referendumCAT                2074
TotsSomCatalunya             1810
FreePiolin                   1797
1octL6                        942
ReferendumCatalan             910
1Oct2017                      731
ProuPuigdemont                591
RepúblicaCatalana             566
referendum1deoctubre          456
CatalanRef2017                400
Ref1oct                       294
marxem                        274
CatalanReferendum2017         273
JoVoto                        212
OrgullososDeSerEspañoles      205
1Oct2017votarem               167
CataluñaEsEspaña              167
ref1oct                       125
EspanaNoSeRompe                69
CataluñaLibre                  68
1ONoTincPor                    55
DontDUIt                       41
LetCatalansVote                27
SpanishDictatorship            26
CataloniaIsNot

Now repeat the same counting procedure on the filtered data, this time wrapped in a reusable function.

In [18]:
def get_hashtag_counts(f, hashtag_df, matcher=None):
    """Count hashtags in a tweet file and collapse spelling variants.

    Parameters
    ----------
    f : str
        Path to a UTF-8 text file that is read line by line; each line is
        parsed as a JSON object and its 'text' field is scanned for hashtags.
    hashtag_df : pandas.DataFrame
        Must have a 'hashtag' column (the original hashtag) and an 'expanded'
        column (a spelling variant that is counted towards that original).
    matcher : compiled regex, optional
        Object whose ``findall(text)`` returns the hashtags in a tweet text.
        Defaults to the notebook-level ``hashtag_matcher``.

    Returns
    -------
    pandas.Series
        Summed variant counts indexed by original hashtag, sorted in
        descending order. Original hashtags none of whose variants occur in
        the file are omitted.

    Lines that cannot be parsed, lack a 'text' key, or whose text is not a
    string are reported with ``print`` and skipped.
    """
    if matcher is None:
        # fall back to the matcher defined at notebook level
        matcher = hashtag_matcher
    hashtag_counter = Counter()
    # the with-block closes the file handle even if reading stops part-way
    with codecs.open(f, 'r', encoding='utf-8') as tweet_file:
        for l in tweet_file:
            try:
                j = json.loads(l.strip())
                j_txt = j['text']
                j_hashtags = matcher.findall(j_txt)
                hashtag_counter.update(j_hashtags)
            # Tuple form parses on Python 2.6+ and 3 (unlike `except X, e`);
            # ValueError: invalid JSON, KeyError: no 'text',
            # TypeError: not a dict / non-string text.
            except (ValueError, KeyError, TypeError):
                print('problem with tweet %s'%(l.strip()))
    # collapse: add up the counts of all observed variants of each original
    # hashtag. Sets de-duplicate repeated rows of hashtag_df, and the Counter
    # is queried directly so no set is ever passed to .loc (a TypeError on
    # pandas 2.x).
    hashtag_counts_collapsed = {}
    for original_hashtag, hashtag_group in hashtag_df.groupby('hashtag'):
        observed_variants = [
            variant for variant in set(hashtag_group.loc[:, 'expanded'])
            if variant in hashtag_counter
        ]
        if observed_variants:
            hashtag_counts_collapsed[original_hashtag] = sum(
                hashtag_counter[variant] for variant in observed_variants
            )
    # explicit dtype keeps an empty result an integer Series
    return pd.Series(hashtag_counts_collapsed, dtype='int64').sort_values(ascending=False)

In [20]:
# Run the same count-and-collapse procedure on the filtered tweet archive,
# reusing the hashtag/variant table loaded earlier in the notebook.
ct_data_filtered_file = '../../data/tweets/archive_Jan-01-17_Oct-31-17_ref_hashtags_filtered.json'
hashtag_counts = get_hashtag_counts(ct_data_filtered_file, hashtag_df)

In [21]:
# Collapsed counts for the filtered data, most frequent hashtag first.
print(hashtag_counts)

CatalanReferendum           3244
1Oct                        1351
Votarem                      954
1O                           748
referéndum                   640
referendumCAT                457
ReferendumCatalan            298
FreePiolin                   293
TotsSomCatalunya             261
RepúblicaCatalana            212
ProuPuigdemont               187
1octL6                       184
1Oct2017                     171
referendum1deoctubre         146
marxem                       102
CatalanReferendum2017         72
CataluñaEsEspaña              69
Ref1oct                       68
OrgullososDeSerEspañoles      55
JoVoto                        54
CatalanRef2017                46
EspanaNoSeRompe               29
1Oct2017votarem               28
CataluñaLibre                 27
ref1oct                       22
1ONoTincPor                   18
DontDUIt                      12
CataloniaIsNotSpain           10
IndependenciaCataluña          9
SpanishDictatorship            9
spainispai

In [31]:
# sort by pro/anti/neutral
# For each sentiment label, print the label followed by one line listing its
# counted hashtags in alphabetical order as " \#hashtag (count)" entries.
for sent, group in hashtag_df.groupby('sentiment'):
    print(sent)
    # only hashtags of this sentiment that actually received a count
    s_hashtags = set(group.loc[:, 'hashtag']) & set(hashtag_counts.index)
    # look counts up one label at a time: pandas 2.x raises TypeError when a
    # set is passed to .loc
    s_hashtag_counts = [(h, hashtag_counts.loc[h]) for h in sorted(s_hashtags)]
    # '\\#' emits the same literal backslash + '#' as before, without relying
    # on the invalid '\#' string escape
    hashtag_str = ','.join(' \\#%s (%s)'%(h, c) for h, c in s_hashtag_counts)
    print(hashtag_str)

anti
 \#CataloniaIsNotSpain (10), \#CataluñaEsEspaña (69), \#DontDUIt (12), \#EspanaNoSeRompe (29), \#EspañaUnida (4), \#OrgullososDeSerEspañoles (55), \#PorLaUnidadDeEspaña (2), \#ProuPuigdemont (187)
neut
 \#1O (748), \#1Oct (1351), \#1Oct2017 (171), \#1Oct2017votarem (28), \#CatalanRef2017 (46), \#CatalanReferendum (3244), \#CatalanReferendum2017 (72), \#JoVoto (54), \#Ref1oct (68), \#ReferendumCatalan (298), \#Votarem (954), \#ref1oct (22), \#referendum1deoctubre (146), \#referendumCAT (457), \#referéndum (640)
pro
 \#1ONoTincPor (18), \#1octL6 (184), \#CATvotaSí (3), \#CataluñaLibre (27), \#FreePiolin (293), \#Freedom4Catalonia (2), \#IndependenciaCataluña (9), \#LetCatalansVote (3), \#RepúblicaCatalana (212), \#SpanishDictatorship (9), \#SpanishRepression (3), \#TotsSomCatalunya (261), \#marxem (102), \#spainispain (8)
