In [1]:
import csv
import pprint
import sys
import time
from collections import Counter
from itertools import combinations

import pandas as pd

In [2]:
def printProgress (iteration, total, prefix = '', suffix = '', decimals = 1, barLength = 50):
    """
    Call in a loop to create terminal progress bar

    Rewrites the current stdout line (carriage return, no newline) with
    ``prefix |bar| percent% suffix``. A newline is written once
    ``iteration == total`` so that later output starts on a fresh line.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        barLength   - Optional  : character length of bar (Int)

    A ``total`` of 0 is reported as 100% complete instead of raising
    ZeroDivisionError. The filled part of the bar is clamped to
    ``[0, barLength]`` so the bar keeps its width when ``iteration`` lies
    outside ``0..total``; the printed percentage itself is not clamped.
    """
    if total:
        fraction = iteration / float(total)
        filled = barLength * iteration / float(total)
    else:
        # Nothing to do counts as finished; this also avoids dividing by zero.
        fraction = 1.0
        filled = float(barLength)

    formatStr = "{0:." + str(decimals) + "f}"
    percent = formatStr.format(100 * fraction)
    # Clamp so that '-' * (barLength - filledLength) never goes negative.
    filledLength = min(max(int(round(filled)), 0), barLength)
    bar = '█' * filledLength + '-' * (barLength - filledLength)
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, bar, percent, '%', suffix))
    if iteration == total:
        sys.stdout.write('\n')
    sys.stdout.flush()

In [3]:
def pairs_count(df, col, group_by):
    """
    Count how often two distinct ``col`` values occur together in a group.

    The frame is reduced to its unique ``(group_by, col)`` rows, sorted by
    both columns. Every group then contributes each unordered pair of its
    values exactly once, the two values appearing in sorted order.

    @params:
        df          - Required  : frame holding both columns (DataFrame)
        col         - Required  : column whose values are paired up (Str)
        group_by    - Required  : column that defines the groups (Str)
    @returns:
        Counter mapping ``str(a) + '%-%' + str(b)`` to the number of groups
        that contain both ``a`` and ``b``.

    Rows whose group key is missing are skipped by ``groupby`` and therefore
    form no pairs. The progress bar is redrawn once per group, not per row.
    """
    unique_rows = df[[group_by, col]].sort_values(by=[group_by, col]).drop_duplicates()
    total = len(unique_rows)
    pair_finder = Counter()
    processed = 0

    # The rows are already sorted; sort=False keeps the groups in that order,
    # and groupby preserves the row order inside every group.
    for _, members in unique_rows.groupby(group_by, sort=False)[col]:
        values = members.tolist()
        pair_finder.update(
            str(first) + '%-%' + str(second)
            for first, second in combinations(values, 2)
            if first != second  # never pair a value with an equal one
        )
        processed += len(values)
        printProgress(processed, total, prefix='Start', suffix='Complete')

    # Rows skipped by groupby are never added to `processed`, so finish the
    # bar explicitly when that happened.
    if processed < total:
        printProgress(total, total, prefix='Start', suffix='Complete')

    return pair_finder


In [4]:
# Load the topic activity records. Later cells read the `topic`, `author_id`
# and `video_id` columns of this frame.
df = pd.read_csv('/Users/jaustinj/repository/network_mapper/data/draft2_topics.csv')
# Number of records loaded, shown as the cell output.
len(df)

333055

In [5]:

# Only keep the most relevant topics. A topic's weight is the number of
# distinct authors seen with it; with the threshold at 0, every topic that has
# at least one author passes the filter.
topic_weights = df.groupby('topic')['author_id'].nunique().to_frame()
print('Initial number of topics: %s' % len(topic_weights))

topic_weights = topic_weights.loc[topic_weights['author_id'].gt(0)]
print('Topics after filtering: %s' % len(topic_weights))

relevant_topics = [*topic_weights.index]
print('Initial Activity records: %s' % len(df))

# Drop every activity record whose topic did not pass the filter.
df = df.loc[df['topic'].isin(relevant_topics)]
print('Activity records after filtering: %s' % len(df))

Initial number of topics: 41956
Topics after filtering: 41956
Initial Activity records: 333055
Activity records after filtering: 333052


In [None]:
# Time the pair count over the filtered records. perf_counter() is a
# monotonic clock meant for measuring intervals, so the result is not
# affected by system clock adjustments while the call is running.
start = time.perf_counter()
pairs = pairs_count(df, col='topic', group_by='video_id')
completion = time.perf_counter() - start  # elapsed seconds
print(completion)

Start |█████████-----------------------------------------| 17.2% Complete

In [None]:
# Number of distinct topic pairs that were counted.
len(pairs)

In [12]:
# Persist the pair counts as one CSV row per topic pair.
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' can emit extra blank lines.
# The explicit UTF-8 encoding matches the default pd.read_csv uses when this
# file is read back.
with open('/Users/jaustinj/repository/network_mapper/data/topic_output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Topic1', 'Topic2', 'Count']) #write header
    for key, count in pairs.items():
        # Keys have the form '<topic1>%-%<topic2>'. A topic that itself
        # contains '%-%' makes this unpacking raise ValueError.
        topic1, topic2 = key.split('%-%')
        writer.writerow([topic1, topic2, count])

In [16]:
# Reload the pair counts from disk (columns: Topic1, Topic2, Count).
# NOTE(review): with read_csv's default NA handling, topic names such as
# "NA", "null" or "nan" would presumably come back as NaN rather than as
# text — confirm that no such topic occurs.
pairs_df = pd.read_csv('/Users/jaustinj/repository/network_mapper/data/topic_output.csv')
# Number of topic pairs read, shown as the cell output.
len(pairs_df)

In [17]:
# For every topic, count the pair rows it appears in, whether as Topic1 or
# as Topic2.
topic_connections = Counter(pairs_df['Topic1'].tolist())
topic_connections.update(pairs_df['Topic2'].tolist())

# One record per topic, turned into a frame with `topic` and `count` columns.
tc = pd.DataFrame(
    [dict(topic=name, count=occurrences) for name, occurrences in topic_connections.items()]
)
len(tc)

41956

In [20]:
# Order topics from most to least connected and renumber the rows from 0,
# discarding the old index instead of keeping it as a column.
tc = tc.sort_values(by='count', ascending=False).reset_index(drop=True)
tc.head(10)

Unnamed: 0,count,topic
0,34134,US Marines
1,29547,United States Navy
2,26818,Military
3,26370,United States Navy SEALs
4,24424,Soldier
5,23976,United States Air Force
6,23483,Recruit training
7,23237,United States Army
8,22009,United States Marine Corps Recruit Training
9,21735,Drill instructor


In [19]:
# Save the per-topic connection counts to CSV.
tc.to_csv('/Users/jaustinj/repository/network_mapper/data/topic_connections.csv')

In [28]:
# Rank topics by their distinct-author count, highest first, and save the
# result as the node weights file.
topic_weights = topic_weights.sort_values(by='author_id',ascending=False)
topic_weights.to_csv('/Users/jaustinj/repository/network_mapper/data/node_weights.csv')