# Sample tweets based on community module or hashtag groupings

If you are using .ftree files, then follow the importing and processing steps below.

If you already have edge data, then proceed to importing your tweet corpus.

## Import .ftree files and process into respective network edge and node data per module

In [None]:
import nttc

# 1. Retrieve the directory of .ftree files and save each line of every file
#    in a list of lists, stored in a per-period Dict.
ftree_path = '../infomap/output/nets/ftree/ftree'

# `regex` is the file-name pattern used within the dedicated directory, e.g.,
# r"\d{1,2}" will match the '1' in p1_ftree.ftree
dict_map = nttc.batch_map(regex=r"\d{1,2}", path=ftree_path, file_type='ftree')

# Print sample ftree modules.
# The slice end has to be relative to the slice start (start + 5), exactly as
# the links preview below does; an absolute end of 5 shows fewer than five
# lines (or none) whenever the section does not begin at line 0.
# Presumably ['indices']['ftree_modules'][0] is the first line of the module
# section, as the printed label suggests -- TODO confirm against nttc.batch_map.
modules_start = dict_map['1']['indices']['ftree_modules'][0]
print(
    '1.\nIndices: ',
    dict_map['1']['indices']['ftree_modules'],
    '\n\nFirst 5 file lines of module section: ',
    dict_map['1']['lines'][modules_start:modules_start + 5],
    '\n\n'
)

# Print sample ftree links for period '1', module '1'.
links_start = dict_map['1']['indices']['ftree_links']['1']['indices'][0]
# `five` is the exclusive end index of the 5-line preview (name kept as-is).
five = links_start + 5
print(
    '1.\nIndices for module 1 links: ',
    dict_map['1']['indices']['ftree_links']['1']['indices'],
    '\n\nFirst 5 lines of period 1, module 1 links section: ',
    dict_map['1']['lines'][links_start:five],
    '\n\n'
)

In [None]:
# Check output: inspect the whole entry stored for period '1', module '1'
# under 'ftree_links'.
dict_map['1']['indices']['ftree_links']['1']

In [None]:
# NOTE: plain assignment does not copy. `copy_dict_map` and `dict_map` are two
# names for the same object, so any in-place change made through one name is
# visible through the other.
copy_dict_map = dict_map
# Process each period's module edge data and store it as a DataFrame.
dict_with_edges = nttc.ftree_edge_maker(copy_dict_map)

In [None]:
# Preview the first 5 entries of 'df_edges' for period '1', module '1'.
dict_with_edges['1']['indices']['ftree_links']['1']['df_edges'][:5]

In [None]:
# Take the full listified .ftree file and write per-Period, per-Module hubs as a Dict.
# NOTE: `new_dict` is another name for `dict_with_edges`, not a copy.
new_dict = dict_with_edges
# NOTE(review): the meaning of mod_sample_size=10 and hub_sample_size=-1 is
# defined by nttc; -1 presumably means "no limit" -- TODO confirm.
dh = nttc.infomap_hub_maker(new_dict, file_type='ftree', mod_sample_size=10, hub_sample_size=-1)
# Show the first 5 entries of the hub data for period '1', module '1'.
print(
    '2.\nSample hub: ',
    dh['1']['info_hub']['1'][:5]
)

In [None]:
# Write edge and node lists per module.
## Arguments: (num of periods, num of modules, Dict of module data from infomap_hub_maker)
## Here: 10 periods, 10 modules, and the hub Dict `dh`.
dict_full = nttc.networks_controller(10,10,dh)

In [None]:
# Test outputs: first 5 entries of the edge list for period '10', module '1'
dict_full['network']['10']['1']['edges'][:5]

In [None]:
# Test outputs: first 5 entries of the node list for period '10', module '1'
dict_full['network']['10']['1']['nodes'][:5]

# CREATE SAMPLED OUTPUTS

## Import tweet corpus

In [None]:
# Load the cleaned, combined tweet CSV into a pandas DataFrame
import csv
from os import listdir
from os.path import join

import pandas as pd

# Locations of the raw collection and of the encoded/cleaned data.
data_path = '../collection/twint/full-combined'
encoded_data_path = '../data/encoded'
# Column names of the collected CSV, kept as a single comma-separated string.
csv_header = 'id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,mentions,urls,photos,replies_count,retweets_count,likes_count,location,hashtags,link,retweet,quote_url,video'

# Columns that must be read as strings rather than with an inferred dtype.
# NOTE(review): 'user_rt' does not appear in `csv_header` above -- confirm it
# exists in the cleaned file.
dtype_dict = dict.fromkeys(
    [
        'id',
        'conversation_id',
        'username',
        'user_id',
        'mentions',
        'tweet',
        'hashtags',
        'link',
        'user_rt',
    ],
    str,
)

__encoded_all_file__ = 'cleaned-all-combined.csv'

df_all = pd.read_csv(
    join(encoded_data_path, __encoded_all_file__),
    delimiter=',',
    dtype=dtype_dict,
)

# Summary statistics of the loaded corpus.
df_all.describe()

In [None]:
# Remove the "Unnamed: 0" column (the name pandas gives a header-less first
# column) when it is present. The guard makes this cell safe to re-run:
# an unconditional `del` raises KeyError once the column is already gone.
if 'Unnamed: 0' in df_all.columns:
    del df_all['Unnamed: 0']
# Preview the first two rows.
df_all[:2]

In [None]:
# Keep only the columns that are passed on to the sampler as the corpus.
df_selected = df_all[[
    'id',
    'date',
    'user_id',
    'username',
    'tweet',
    'mentions',
    'retweets_count',
    'hashtags',
    'link',
]]
# Preview the first two rows.
df_selected.head(2)

## Create desired metadata as per your project: period dates and hashtag groups

In [None]:
# PERIOD DATES
# Each entry is (period label, [start date, end date]) with 'YYYY-MM-DD' strings.
# NOTE(review): 2018-03-31 falls between the end of period '1' (2018-03-30)
# and the start of period '2' (2018-04-01); every other pair of consecutive
# periods is on adjacent days. Confirm whether this gap is intentional.
ranges = [
    ('1', ['2018-01-01', '2018-03-30']),
    ('2', ['2018-04-01', '2018-06-12']),
    ('3', ['2018-06-13', '2018-07-28']),
    ('4', ['2018-07-29', '2018-10-17']),
    ('5', ['2018-10-18', '2018-11-24']),
    ('6', ['2018-11-25', '2018-12-10']),
    ('7', ['2018-12-11', '2018-12-19']),
    ('8', ['2018-12-20', '2018-12-25']),
    ('9', ['2018-12-26', '2019-02-13']),
    ('10', ['2019-02-14', '2019-02-28'])
]

# Build the per-period date data from the ranges above (keyed by period label,
# as the preview at the end of this cell shows).
period_dates = nttc.period_dates_writer(ranges=ranges)

# HASHTAG GROUPINGS
# Hashtags grouped as "conservative" for this project.
conservative_hashtag_list = [
    '#bordercrisis', '#bordersecurity', '#buildthewall',
    '#caravaninvasion', '#illegals',  '#migrantcaravan',
    '#nationalemergency', '#ronilsingh'
]

# Each dict pairs one hashtag with plain-text keyword phrases.
liberal_keyword_list = [ 
    {
        '#felipegomez': ['felipe alonzo-gomez', 'felipe gomez']
    },
    {
        '#maquin': ['jakelin caal', 'maquín', 'maquin' ]
    }
]
# Mixes hashtags with the same plain-text keyword phrases listed in
# `liberal_keyword_list`.
liberal_fbt_list = [
    '#familyseparation', '#familiesbelongtogether',
    '#felipegomez', '#keepfamiliestogether',
    '#maquin', '#noborderwall', '#wherearethechildren',
    'jakelin caal', 'maquín', 'maquin', 'felipe alonzo-gomez', 
    'felipe gomez'
]
# Hashtags about the government shutdown.
liberal_antishutdown_list = [
    '#shutdownstories','#trumpshutdown'
]

# Preview the first 10 entries generated for period '1'.
period_dates['1'][:10]

### Sample hashtag groups

In [None]:
## Sample 1) certain periods, 2) certain hashtags
## Uses sample_type='hashtag_group' with the `liberal_fbt_list` group.
dict_samples = nttc.infomap_content_sampler(
                    dict_full['network'], 
                    sample_size=50,
                    period_dates=period_dates,
                    corpus=df_selected,
                    sample_type='hashtag_group',
                    ht_group=liberal_fbt_list,
                    user_threshold=5,
                    random=False)

In [None]:
# Inspect the sample entry for period '2', module '3'.
dict_samples['2']['3']

### Sample modules

In [None]:
## Sample per module (sample_type='modules'); no hashtag group is used.
## NOTE: this rebinds `dict_samples`, replacing the result of any earlier
## sampler call stored under the same name.
dict_samples = nttc.infomap_content_sampler(
                    dict_full['network'], 
                    sample_size=50,
                    period_dates=period_dates,
                    corpus=df_selected,
                    sample_type='modules',
                    ht_group=None,
                    user_threshold=5,
                    random=False)

In [None]:
# Inspect the sample entry for period '10', module '1'.
dict_samples['10']['1']

## Batch outputs per module

In [None]:
import pandas as pd
from os import listdir
from os.path import join
import csv

# Flatten the nested {period: {module: {'sample': DataFrame, ...}}} structure
# into one list of row dicts, tagging each row with its period and module.
lister = []
for p, modules in dict_samples.items():
    for m, module_data in modules.items():
        print(p, m)
        try:
            # Only this call is expected to fail: a 'sample' value without
            # .to_dict (i.e. not a DataFrame) raises AttributeError.
            records = module_data['sample'].to_dict('records')
        except AttributeError as e:
            # Best-effort: report the problem and move on to the next module.
            print(e)
        else:
            for r in records:
                r['period'] = p
                r['module'] = m
                lister.append(r)
# Preview the first 5 flattened rows.
lister[:5]

In [None]:
# Build one DataFrame from the flattened list of row dicts.
df_full_samples = pd.DataFrame(lister)
# Preview the first 5 rows.
df_full_samples.head(5)

In [None]:
# De-duplicate on tweet 'id', keeping the first occurrence; a tweet sampled
# for several period/module pairs therefore keeps only its first pair.
cleaned_df_full_samples = df_full_samples.drop_duplicates(
    subset=['id'],
    keep='first',
)
# Row counts after and before de-duplication.
rows_after = len(cleaned_df_full_samples)
rows_before = len(df_full_samples)
print(rows_after, rows_before)

In [None]:
# Select and order the columns for the exported sample.
cdf = cleaned_df_full_samples[[
    'period',
    'module',
    'username',
    'tweet',
    'retweets_count',
    'hashtags',
    'link',
    'mentions',
    'date',
    'id',
    'user_id',
]]
# Preview the first two rows.
cdf.head(2)

In [None]:
# Write the de-duplicated sample to CSV without the DataFrame index.
# NOTE(review): the file name refers to the FBT hashtag-group sample, but
# `dict_samples` is assigned by both the hashtag-group and the module sampling
# cells; confirm which sample `cdf` holds when this cell runs.
sample_csv_path = join(
    '../infomap/output/nets/ftree/csv',
    'ftree_fbt_hashtag_groups_tweet_sample.csv',
)
cdf.to_csv(sample_csv_path, sep=',', encoding='utf-8', index=False)

## OUTPUT CSV OF EDGE DATA WITH USERNAMES

In [None]:
# OUTPUT EDGES
# Flatten the per-period, per-module edge data from dict_full['network'] into
# one list of row dicts. The period/module pairs come from `dict_samples`, so
# only pairs present there are exported.
lister_edges = []
for p, modules in dict_samples.items():
    for m in modules:
        try:
            # Only this call is expected to fail: an 'edges' value without
            # .to_dict (i.e. not a DataFrame) raises AttributeError.
            records = dict_full['network'][p][m]['edges'].to_dict('records')
        except AttributeError as e:
            # Best-effort: report the problem and move on to the next module.
            print(e)
        else:
            for r in records:
                r['period'] = p
                r['module'] = m
                lister_edges.append(r)
# Preview the first 5 flattened edge rows.
lister_edges[:5]

In [None]:
# Total number of flattened edge rows across all exported periods and modules.
len(lister_edges)

In [None]:
# Build one DataFrame from the flattened list of edge row dicts.
df_full_edges = pd.DataFrame(lister_edges)
# Preview the first 5 rows.
df_full_edges.head(5)

In [None]:
# Write every flattened edge row (with period and module columns) to CSV,
# without the DataFrame index.
edges_csv_path = join(
    '../infomap/output/nets/ftree/csv',
    'infomap_edges_with_names_all_periods.csv',
)
df_full_edges.to_csv(edges_csv_path, sep=',', encoding='utf-8', index=False)

In [None]:
# Write the full tweet sample (duplicates not removed) to CSV, without the
# DataFrame index.
all_samples_csv_path = join(
    '../infomap/output/nets/ftree/csv',
    'infomap_tweet_sample_all_periods.csv',
)
df_full_samples.to_csv(all_samples_csv_path, sep=',', encoding='utf-8', index=False)