In [1]:
# Import cleaned and combined CSV data as pandas dataframe
# Change paths and file for your situation
import warnings # remove if you want
warnings.filterwarnings('ignore')
from os import listdir
from os.path import join
import csv
import pandas as pd

# Directory that holds the encoded CSV exports (adjust for your setup)
encoded_data_path = 'csv'

# File name of the cleaned, combined corpus inside that directory
__encoded_all_file__ = 'full_corpus_clean_urls.csv'

# Read the whole corpus into one DataFrame; fields are comma-separated
df_all = pd.read_csv(join(encoded_data_path, __encoded_all_file__), sep=',')

# check output, if you want
# df_all[:1]

In [2]:
# Show the column labels of the loaded corpus DataFrame
df_all.columns

Index(['cleaned_urls', 'conversation_id', 'created_at', 'date', 'hashtags',
       'id', 'likes_count', 'link', 'location', 'mentions', 'name', 'photos',
       'place', 'quote_url', 'replies_count', 'retweet', 'retweets_count',
       'time', 'timezone', 'tweet', 'urls', 'user_id', 'username', 'video'],
      dtype='object')

In [3]:
# Example method to create date and group metadata for search queries
import nttc

# Define period ranges
# Each entry is (period label, [start date, end date]) with ISO-formatted date strings.
# NOTE(review): period '1' ends on 2018-03-30 while period '2' starts on 2018-04-01,
# so 2018-03-31 falls in neither range; every other boundary is contiguous.
# Confirm whether that one-day gap is intentional.
ranges = [
    ('1', ['2018-01-01', '2018-03-30']),
    ('2', ['2018-04-01', '2018-06-12']),
    ('3', ['2018-06-13', '2018-07-28']),
    ('4', ['2018-07-29', '2018-10-17']),
    ('5', ['2018-10-18', '2018-11-24']),
    ('6', ['2018-11-25', '2018-12-10']),
    ('7', ['2018-12-11', '2018-12-19']),
    ('8', ['2018-12-20', '2018-12-25']),
    ('9', ['2018-12-26', '2019-02-13']),
    ('10', ['2019-02-14', '2019-02-28'])
]

# Expand the ranges into a lookup keyed by period label. Judging from the
# period '1' preview, each value is a list of date strings, presumably one
# per day in the range (TODO confirm against nttc.period_dates_writer).
period_dates = nttc.period_dates_writer(ranges=ranges)

# HASHTAG GROUPINGS
# One list of search terms per group; entries are hashtags or plain phrases.

# 'btw' group
btw_list = [
    '#bordercrisis',
    '#bordersecurity',
    '#buildthewall',
    '#caravaninvasion',
    '#illegals',
    '#migrantcaravan',
    '#nationalemergency',
    '#ronilsingh',
]

# 'fbt' group (hashtags first, then free-text names)
fbt_list = [
    '#familyseparation',
    '#familiesbelongtogether',
    '#felipegomez',
    '#keepfamiliestogether',
    '#maquin',
    '#wherearethechildren',
    'jakelin caal',
    'maquín',
    'maquin',
    'felipe alonzo-gomez',
    'felipe gomez',
]

# 'anti' group
anti_list = [
    '#shutdownstories',
    '#trumpshutdown',
    '#noborderwall',
]

# Preview the first five entries generated for period '1'
period_dates['1'][:5]

['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05']

In [4]:
# Using nttc, import infomap community-detected module files in folders in batches,
# then isolate users per period per module

# 1. Read the .ftree files of each hashtag group into a dict keyed by period;
#    each period's 'lines' entry holds the lines of that period's file.
fbt_map_path = '../data/ht_groups/fbt/period_infomap/ftree'
btw_map_path = '../data/ht_groups/btw/period_infomap/ftree'
anti_map_path = '../data/ht_groups/anti/period_infomap/ftree'

# The same 1-2 digit regex is passed for every group; presumably it pulls the
# period number out of each file name (TODO confirm against nttc.batch_map).
fbt_dict_map = nttc.batch_map(regex=r"\d{1,2}", path=fbt_map_path, file_type='ftree')
btw_dict_map = nttc.batch_map(regex=r"\d{1,2}", path=btw_map_path, file_type='ftree')
anti_dict_map = nttc.batch_map(regex=r"\d{1,2}", path=anti_map_path, file_type='ftree')

# Sanity check: show the first three lines stored for fbt period '1'.
# NOTE(review): the 'Indices: ' label is not followed by any value, so it
# prints empty. Confirm what was meant to be shown there.
print(
    '1.\nIndices: ',
    '\n\nFirst 3 lines in file: ',
    fbt_dict_map['1']['lines'][:3],
    '\n\n'
)

# 2. Take full listified .ftree file and write per Period per Module hubs as a Dict
# NOTE(review): each `*_new_dict = *_dict_map` assignment binds a second name to
# the SAME dict (no copy is made); if infomap_hub_maker modifies its argument,
# the corresponding *_dict_map changes as well. Confirm that is acceptable.
# NOTE(review): hub_sample_size=-1 presumably means "no limit" (TODO confirm
# against nttc.infomap_hub_maker).
fbt_new_dict = fbt_dict_map
fbt_dh = nttc.infomap_hub_maker(fbt_new_dict, file_type='ftree', mod_sample_size=10, hub_sample_size=-1)

btw_new_dict = btw_dict_map
btw_dh = nttc.infomap_hub_maker(btw_new_dict, file_type='ftree', mod_sample_size=10, hub_sample_size=-1)

anti_new_dict = anti_dict_map
anti_dh = nttc.infomap_hub_maker(anti_new_dict, file_type='ftree', mod_sample_size=10, hub_sample_size=-1)

# Sanity check: first three entries of hub '1' in period '10' for each group.
# Each entry is a dict with 'module', 'node', 'name' and 'score' keys.
print(
    '2.\nSample hub:\n',
    fbt_dh['10']['info_hub']['1'][:3],
    '\n\n',
    btw_dh['10']['info_hub']['1'][:3],
    '\n\n',
    anti_dh['10']['info_hub']['1'][:3]
)

1.
Indices:  

First 3 lines in file:  ["# '../data/ht_groups/fbt/period_edges/edges/net/edgelists/p1_net.net ../data/ht_groups/fbt/period_infomap/ -N 10 --directed --two-level --clu --map --ftree' -> 5 nodes and 3 links partitioned in 0s from codelength 1.584962501 in one level to codelength 0.666666667 in 2 levels.", '# path flow name node:', '1:1 0.333333 "potus" 4'] 


2.
Sample hub:
 [{'module': '1', 'node': '1', 'name': 'usatoday', 'score': 0.206825}, {'module': '1', 'node': '2', 'name': 'gop', 'score': 0.0970157}, {'module': '1', 'node': '3', 'name': 'bvweir', 'score': 0.0}] 

 [{'module': '1', 'node': '1', 'name': 'realdonaldtrump', 'score': 0.374579}, {'module': '1', 'node': '2', 'name': 'betoorourke', 'score': 0.0112137}, {'module': '1', 'node': '3', 'name': 'lindseygrahamsc', 'score': 0.00078067}] 

 [{'module': '1', 'node': '1', 'name': 'docrocktex26', 'score': 0.0755079}, {'module': '1', 'node': '2', 'name': 'mcspocky', 'score': 0.0755079}, {'module': '1', 'node': '3', 'na

In [5]:
# Create dict of lists per hub with hub usernames
# RQ: What are the top X urls shared among comms per period?
def hub_name_lister(dh):
    """Collect the usernames found in every infomap hub.

    Args:
        dh: Iterable of entries whose first item is a group label and whose
            second item is a dict mapping period -> period data. Period data
            that has an 'info_hub' key maps hub id -> list of dicts, each
            carrying a 'name' key.

    Returns:
        Nested dict shaped {group label: {period: {hub id: [name, ...]}}}.
        Periods whose data lacks an 'info_hub' key are left out.
    """
    names_by_group = {}
    for entry in dh:
        group_label, periods = entry[0], entry[1]
        # Rebuild the period -> hub -> names tree, skipping hub-less periods
        names_by_group[group_label] = {
            period: {
                hub_id: [member['name'] for member in members]
                for hub_id, members in period_data['info_hub'].items()
            }
            for period, period_data in periods.items()
            if 'info_hub' in period_data
        }
    return names_by_group

In [6]:
# Build the nested username lookup for all three hashtag groups; each pair is
# (group label, hub dict returned by nttc.infomap_hub_maker)
hub_lists = hub_name_lister( [ ('btw',btw_dh), ('anti',anti_dh), ('fbt',fbt_dh) ] )

In [7]:
# Usernames collected for hub '1' of period '1' in the 'anti' group
hub_lists['anti']['1']['1']

['senschumer',
 'nancypelosi',
 'hurdonthehill',
 'centerforbiodiv',
 'vp',
 '_political_p',
 'ryandbeam',
 'kimamabarbara',
 'cantabro',
 'emttrish1331',
 'lifieldgg',
 'northtexasdream',
 'jmikeseifert',
 'sunshinejoanie']

In [8]:
# Write each hashtag list as a regex alternation pattern of the form (a|b|c)
import urlcounter as urlc
# Each call yields a (group label, '(term|term|...)' pattern) tuple, as the
# displayed htg_fbt value shows
htg_btw = urlc.regex_lister(btw_list,'btw')
htg_fbt = urlc.regex_lister(fbt_list,'fbt')
htg_anti = urlc.regex_lister(anti_list,'anti')

# Display one result to check the generated pattern
htg_fbt

('fbt',
 '(#familyseparation|#familiesbelongtogether|#felipegomez|#keepfamiliestogether|#maquin|#wherearethechildren|jakelin caal|maquín|maquin|felipe alonzo-gomez|felipe gomez)')

In [9]:
# Run top_urls()
# Pass a copy so df_all itself is not handed to top_urls
cdf = df_all.copy()
# The result is indexed as dict_url_counts[period][group][hub]; each hub entry
# holds 'hub_url_counts', 'hub_domain_counts', 'hub_sample_size' and
# 'hub_tweet_sample_size'.
# NOTE(review): unclear from here whether periods=(1,2) / hubs=(1,2) are an
# inclusive range or an explicit list. Confirm against urlcounter.top_urls.
dict_url_counts = urlc.top_urls(
    df=cdf,
    periods=(1,2),
    hubs=(1,2),
    period_dates=period_dates, 
    list_of_regex=[htg_anti,htg_btw,htg_fbt],
    hl=hub_lists,
    columns=['cleaned_urls', 'retweets_count', 'hashtags', 'username', 'mentions'],
    url_sample_size=50,
    verbose=True # True prints out status messages, False prints nothing
)


Period 1 :  ('anti', '(#shutdownstories|#trumpshutdown|#noborderwall)') : 330
Period 1 Hub 1 Size: 14
Period 1 Hub 1 # of Tweets: 20
Period 1 Hub 1 URLS:
 [('https://noborderwalls.org/', 3), ('https://drive.google.com/file/d/11ZqYEEE7vbSJWmKBgKTXbITzJ5MjmOOA/view', 1), ('https://www.politico.com/story/2018/01/23/chuck-schumer-trump-wall-offer-359156', 1)]
Period 1 Hub 1 DOMAINS:
 [('noborderwalls.org', 3), ('drive.google.com', 1), ('politico.com', 1)]
Period 1 Hub 2 Size: 18
Period 1 Hub 2 # of Tweets: 34
Period 1 Hub 2 URLS:
 [('https://thinkprogress.org/congress-waive-environmental-laws-for-trumps-border-wall-2899b1200b20/', 38), ('https://www.nytimes.com/interactive/2018/01/19/us/politics/government-shutdown-employee-effects.html', 29), ('http://www.bbc.com/news/world-us-canada-43418898', 3)]
Period 1 Hub 2 DOMAINS:
 [('thinkprogress.org', 38), ('nytimes.com', 29), ('twitter.com', 4)]

Period 2 :  ('anti', '(#shutdownstories|#trumpshutdown|#noborderwall)') : 93
Period 2 Hub 1 Size:

In [10]:
# Inspect the per-hub URL and domain counts for the 'anti' group in period '1'
dict_url_counts['1']['anti']

{'1': {'hub_domain_counts': [('noborderwalls.org', 3),
   ('drive.google.com', 1),
   ('politico.com', 1)],
  'hub_sample_size': 14,
  'hub_tweet_sample_size': 20,
  'hub_url_counts': [('https://noborderwalls.org/', 3),
   ('https://drive.google.com/file/d/11ZqYEEE7vbSJWmKBgKTXbITzJ5MjmOOA/view',
    1),
   ('https://www.politico.com/story/2018/01/23/chuck-schumer-trump-wall-offer-359156',
    1)]},
 '2': {'hub_domain_counts': [('thinkprogress.org', 38),
   ('nytimes.com', 29),
   ('twitter.com', 4),
   ('bbc.com', 3)],
  'hub_sample_size': 18,
  'hub_tweet_sample_size': 34,
  'hub_url_counts': [('https://thinkprogress.org/congress-waive-environmental-laws-for-trumps-border-wall-2899b1200b20/',
    38),
   ('https://www.nytimes.com/interactive/2018/01/19/us/politics/government-shutdown-employee-effects.html',
    29),
   ('http://www.bbc.com/news/world-us-canada-43418898', 3),
   ('https://twitter.com/washingtonpost/status/966807193713610752', 1),
   ('https://twitter.com/Earthjustice/