# 1.4 Data converted
- Convert data to network nodes and edges for visualization in Gephi

### Data Structure:
```
collated_data: {
|   [page_name_1]: {
|   |   comments: {
|   |   |   [post_id_1]: {
|   |   |   |   [comment_id_1]: {
|   |   |   |   |   'Commenter': -
|   |   |   |   |   'Comment': -
|   |   |   |   |   'Comment_User_Tagging': -
|   |   |   |   |   'Comment_User_Tagging_Link': -
|   |   |   |   },
|   |   |   |   ...
|   |   |   },
|   |   |   [post_id_2]: {
|   |   |   |   ...
|   |   |   },
|   |   |   ...
|   |   },
|   |   posts: {
|   |   |   [post_id_1]: {
|   |   |   |   'Date': -
|   |   |   |   'Content': -
|   |   |   |   'Reactions': {
|   |   |   |   |   'Total_Count': -
|   |   |   |   |   'Likes': -
|   |   |   |   |   'Haha': -
|   |   |   |   |   'Love': -
|   |   |   |   |   'Wow': -
|   |   |   |   |   'Sigh': -
|   |   |   |   |   'Grrr': -
|   |   |   |   },
|   |   |   |   'Url': -
|   |   |   },
|   |   |   [post_id_2]: {
|   |   |   |   ...
|   |   |   },
|   |   |   ...
|   |   }
|   },
|   [page_name_2]: {
|   |   ...
|   },
|   ...
}
```

In [42]:
# Static configuration: root folders for the scraped input data and the Gephi output
ROOT_DATA_FOLDER = './Data/'
ROOT_GEPHI_DATA_FOLDER = './Gephi_Data/'

# Facebook pages covered by this notebook
page_list = ['DollarsAndSense', 'MortgageConsultancy', 'Seedly', 'WokeManSalary']

# One (initially empty) data container per page, keyed by page name
collated_data = {page_name: {} for page_name in page_list}

In [43]:
import csv
import traceback
import pandas as pd

In [44]:
def read_comments_file(filepath):
    """Read a comments CSV file into a nested dictionary.

    The first row of the file must be a header row containing the columns
    'Post_ID', 'User', 'Comment', 'Comment_ID', 'User_Tagging' and
    'User_Tagging_Link' (in any order).

    Args:
        filepath: Path of the UTF-8 encoded, comma-delimited comments file.

    Returns:
        dict: {post_id: {comment_id: {'Commenter', 'Comment',
        'Comment_User_Tagging', 'Comment_User_Tagging_Link'}}}, with all
        values kept as the strings read from the file. An empty file yields
        an empty dict; completely blank rows are skipped.

    Raises:
        ValueError: If one of the expected header columns is missing.
    """
    data = {}

    # Open the path that was passed in (not a notebook-level global).
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filepath, mode='r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        # Get header row. Usually the top of the csv file
        headers = next(csv_reader, None)
        if headers is None:
            # Empty file: nothing to read
            return data

        # Retrieve index of the respective headers
        post_id_index = headers.index('Post_ID')
        commenter_index = headers.index('User')
        comment_index = headers.index('Comment')
        comment_id_index = headers.index('Comment_ID')
        comment_user_tagging_index = headers.index('User_Tagging')
        comment_user_tagging_link_index = headers.index('User_Tagging_Link')

        # Retrieve content
        for row in csv_reader:
            if not row:
                # csv.reader returns [] for a blank line
                continue

            # setdefault creates the per-post dict on the first appearance of
            # a post_id and reuses it on every subsequent appearance.
            data.setdefault(row[post_id_index], {})[row[comment_id_index]] = {
                'Commenter': row[commenter_index],
                'Comment': row[comment_index],
                'Comment_User_Tagging': row[comment_user_tagging_index],
                'Comment_User_Tagging_Link': row[comment_user_tagging_link_index]
            }

    return data
    
def read_posts_file(filepath):
    """Read a posts CSV file into a dictionary keyed by post id.

    The first row of the file must be a header row containing the columns
    'post_id', 'date', 'text', 'reactions', 'likes', 'ahah', 'love', 'wow',
    'sigh', 'grrr' and 'url' (in any order).

    Args:
        filepath: Path of the ISO-8859-1 encoded, comma-delimited posts file.

    Returns:
        dict: {post_id: {'Date', 'Content', 'Reactions': {'Total_Count',
        'Likes', 'Haha', 'Love', 'Wow', 'Sigh', 'Grrr'}, 'Url'}}, with all
        values kept as the strings read from the file. An empty file yields
        an empty dict; completely blank rows are skipped.

    Raises:
        ValueError: If one of the expected header columns is missing.
    """
    data = {}

    # Open the path that was passed in (not a notebook-level global).
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filepath, mode='r', encoding='ISO-8859-1', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        # Get header row. Usually the top of the csv file
        headers = next(csv_reader, None)
        if headers is None:
            # Empty file: nothing to read
            return data

        # Retrieve index of the respective headers
        post_id_index = headers.index('post_id')
        date_index = headers.index('date')
        content_index = headers.index('text')
        total_reactions_index = headers.index('reactions')
        likes_index = headers.index('likes')
        haha_index = headers.index('ahah')  # the source column is spelled 'ahah'
        love_index = headers.index('love')
        wow_index = headers.index('wow')
        sigh_index = headers.index('sigh')
        grrr_index = headers.index('grrr')
        url_index = headers.index('url')

        # Retrieve contents
        for row in csv_reader:
            if not row:
                # csv.reader returns [] for a blank line
                continue

            data[row[post_id_index]] = {
                'Date': row[date_index],
                'Content': row[content_index],
                'Reactions': {
                    'Total_Count': row[total_reactions_index],
                    'Likes': row[likes_index],
                    'Haha': row[haha_index],
                    'Love': row[love_index],
                    'Wow': row[wow_index],
                    'Sigh': row[sigh_index],
                    'Grrr': row[grrr_index]
                },
                'Url': row[url_index]
            }

    return data

In [45]:
# Read every page's comments and posts files and store the parsed
# dictionaries under collated_data[page]['comments'] / ['posts'].
for page in page_list:
    comments_filepath = ''.join([ROOT_DATA_FOLDER, page, '/', page, '_comments.txt'])
    posts_filepath = ''.join([ROOT_DATA_FOLDER, page, '/', page, '_posts.txt'])

    collated_data[page]['comments'] = read_comments_file(comments_filepath)
    collated_data[page]['posts'] = read_posts_file(posts_filepath)

## 1.4.1 Extract User Tagging Data
- Collated across all pages
- Per respective page

In [46]:
# For each page, count the comments whose 'Comment_User_Tagging' field is
# non-empty (i.e. comments that tag at least one user).
user_taggings_count = [
    sum(
        1
        for comments in collated_data[page]['comments'].values()
        for comment_content in comments.values()
        if comment_content['Comment_User_Tagging'] != ''
    )
    for page in page_list
]

# Build the table columns as new lists: appending the 'Total' row must not
# mutate page_list or user_taggings_count (the counts list was previously
# aliased by the table and ended up containing its own sum).
d = {
    'Page Name': page_list + ['Total'],
    'No. of User Taggings': user_taggings_count + [sum(user_taggings_count)],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Page Name,No. of User Taggings
0,DollarsAndSense,241
1,MortgageConsultancy,0
2,Seedly,221
3,WokeManSalary,1019
4,Total,1481


In [47]:
# Total (User) Comments with user taggings = 1481
# One table row per comment that tags at least one user; the ', '-separated
# tag names and profile links are split into lists.
d = {'Commenter': [], 'Comment_User_Taggings': [], 'User_Profile_Links': []}

for page in page_list:
    for comments in collated_data[page]['comments'].values():
        for comment_content in comments.values():
            tagged_names = comment_content['Comment_User_Tagging']
            if tagged_names == '':
                continue  # this comment tags nobody

            d['Commenter'].append(comment_content['Commenter'])
            d['Comment_User_Taggings'].append(tagged_names.split(', '))
            d['User_Profile_Links'].append(
                comment_content['Comment_User_Tagging_Link'].split(', ')
            )

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Taggings,User_Profile_Links
0,Baoyi Tan,['Odelia Tan'],['/profile.php?id=638616238&refid=52&ref=104&__tn__=R']
1,Keith Chia,['Teo Liang Wei'],['/liangweionearth?refid=52&ref=104&__tn__=R']
2,Stella Wai,['Derrick Lim Chu Rui'],['/derricklcr?refid=52&ref=104&__tn__=R']
3,Kei Ng,['Eileen Gwee'],['/eileengwee94?refid=52&ref=104&__tn__=R']
4,Jerdenson Von Einzbern,['Mayumi Yanami Isabelle'],['/mayumi.y.ii?refid=52&ref=104&__tn__=R']
5,Celest Neo,['Glenn Chow'],['/peacockcockcocksocute?refid=52&ref=104&__tn__=R']
6,Kapil Dtani,"['Dan', 'Jade']","['/danstoneman?refid=52&ref=104&__tn__=R', '/jade.halford?refid=52&ref=104&__tn__=R']"
7,Kylee Thio,['Takahiro Tan'],['/takahiro.tan.7?refid=52&ref=104&__tn__=R']
8,Winifred Liang,['Matthew Chaisit Tan'],['/matthewtanljc?refid=52&ref=104&__tn__=R']
9,Jasmine Tan,['Shawn Lee'],['/shawn.lee.1029?refid=52&ref=104&__tn__=R']


In [51]:
# Unique (User) Comments with user taggings = 1337
# Across all pages: commenter -> {tagged user -> number of times tagged}
commenter_taggings_hash = {}

for page in page_list:
    for comments in collated_data[page]['comments'].values():
        for comment_content in comments.values():
            tagged_names = comment_content['Comment_User_Tagging']
            if tagged_names == '':
                continue  # this comment tags nobody

            # Reuse the commenter's counter dict, creating it on first sight
            tag_counts = commenter_taggings_hash.setdefault(comment_content['Commenter'], {})

            for user_tagging in tagged_names.split(', '):
                tag_counts[user_tagging] = tag_counts.get(user_tagging, 0) + 1

In [52]:
# Flatten commenter_taggings_hash into one row per (commenter, tagged user) pair.
# All three columns walk the nested dict in the same order, so rows line up.
d = {
    'Commenter': [
        commenter
        for commenter, user_taggings in commenter_taggings_hash.items()
        for _ in user_taggings
    ],
    'Comment_User_Tagging': [
        user_tagging
        for user_taggings in commenter_taggings_hash.values()
        for user_tagging in user_taggings
    ],
    'Count': [
        count
        for user_taggings in commenter_taggings_hash.values()
        for count in user_taggings.values()
    ],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Tagging,Count
0,Baoyi Tan,Odelia Tan,1
1,Keith Chia,Teo Liang Wei,1
2,Stella Wai,Derrick Lim Chu Rui,1
3,Kei Ng,Eileen Gwee,1
4,Jerdenson Von Einzbern,Mayumi Yanami Isabelle,1
5,Celest Neo,Glenn Chow,1
6,Kapil Dtani,Dan,1
7,Kapil Dtani,Jade,1
8,Kylee Thio,Takahiro Tan,1
9,Winifred Liang,Matthew Chaisit Tan,1


In [58]:
# Per page: page -> commenter -> {tagged user -> number of times tagged}
page_commenter_taggings_hash = {}

for page in page_list:
    page_commenter_taggings_hash[page] = {}

    for comments in collated_data[page]['comments'].values():
        for comment_content in comments.values():
            tagged_names = comment_content['Comment_User_Tagging']
            if tagged_names == '':
                continue  # this comment tags nobody

            # Reuse the commenter's counter dict, creating it on first sight
            tag_counts = page_commenter_taggings_hash[page].setdefault(
                comment_content['Commenter'], {}
            )

            for user_tagging in tagged_names.split(', '):
                tag_counts[user_tagging] = tag_counts.get(user_tagging, 0) + 1

# Report the number of distinct tagging commenters per page
for page_name, commenters in page_commenter_taggings_hash.items():
    print(page_name, len(commenters))

DollarsAndSense 222
MortgageConsultancy 0
Seedly 165
WokeManSalary 976


In [56]:
# DollarsAndSense
# Flatten this page's tagging counts into one row per (commenter, tagged user) pair.
page_taggings = page_commenter_taggings_hash['DollarsAndSense']

d = {
    'Commenter': [
        commenter
        for commenter, user_taggings in page_taggings.items()
        for _ in user_taggings
    ],
    'Comment_User_Tagging': [
        user_tagging
        for user_taggings in page_taggings.values()
        for user_tagging in user_taggings
    ],
    'Count': [
        count
        for user_taggings in page_taggings.values()
        for count in user_taggings.values()
    ],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Tagging,Count
0,Baoyi Tan,Odelia Tan,1
1,Keith Chia,Teo Liang Wei,1
2,Stella Wai,Derrick Lim Chu Rui,1
3,Kei Ng,Eileen Gwee,1
4,Jerdenson Von Einzbern,Mayumi Yanami Isabelle,1
5,Celest Neo,Glenn Chow,1
6,Kapil Dtani,Dan,1
7,Kapil Dtani,Jade,1
8,Kylee Thio,Takahiro Tan,1
9,Winifred Liang,Matthew Chaisit Tan,1


In [59]:
# MortgageConsultancy
# Flatten this page's tagging counts into one row per (commenter, tagged user) pair.
page_taggings = page_commenter_taggings_hash['MortgageConsultancy']

d = {
    'Commenter': [
        commenter
        for commenter, user_taggings in page_taggings.items()
        for _ in user_taggings
    ],
    'Comment_User_Tagging': [
        user_tagging
        for user_taggings in page_taggings.values()
        for user_tagging in user_taggings
    ],
    'Count': [
        count
        for user_taggings in page_taggings.values()
        for count in user_taggings.values()
    ],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Tagging,Count


In [60]:
# Seedly
# Flatten this page's tagging counts into one row per (commenter, tagged user) pair.
page_taggings = page_commenter_taggings_hash['Seedly']

d = {
    'Commenter': [
        commenter
        for commenter, user_taggings in page_taggings.items()
        for _ in user_taggings
    ],
    'Comment_User_Tagging': [
        user_tagging
        for user_taggings in page_taggings.values()
        for user_tagging in user_taggings
    ],
    'Count': [
        count
        for user_taggings in page_taggings.values()
        for count in user_taggings.values()
    ],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Tagging,Count
0,Zulius Lorando,K Shanmugam Sc,1
1,Lenney Leong,Brett Le Ming,1
2,Natalie Lee,Jimmy Quek,2
3,Natalie Lee,Wei Liang,1
4,Dacia Kee,Michelle Min,1
5,Ann Gee,Anthea Chow,1
6,Manjeet Baig,Sehr Kashif,1
7,Eudy Leong,Leong Jin Kai,1
8,Eudy Leong,Leong Wan Xuan,1
9,Yeo Jie Ying,Dennis Zehao,1


In [61]:
# WokeManSalary
# Flatten this page's tagging counts into one row per (commenter, tagged user) pair.
page_taggings = page_commenter_taggings_hash['WokeManSalary']

d = {
    'Commenter': [
        commenter
        for commenter, user_taggings in page_taggings.items()
        for _ in user_taggings
    ],
    'Comment_User_Tagging': [
        user_tagging
        for user_taggings in page_taggings.values()
        for user_tagging in user_taggings
    ],
    'Count': [
        count
        for user_taggings in page_taggings.values()
        for count in user_taggings.values()
    ],
}

df = pd.DataFrame(data=d)
df.style

Unnamed: 0,Commenter,Comment_User_Tagging,Count
0,Isaac Derwin Nicolas Macabuhay,Jane Olavire,1
1,Leigh Deliva,Daxteezy,1
2,Aaron Loh Xian Wei,Ken Lim,1
3,Teo Chek Yew,#staywoke,1
4,April Grace Tangag,Rodel Tangag,1
5,Chong JenFoong,Shu Ting Amelia,1
6,James Lim,The Woke Salaryman,1
7,Rezwan Hasan,ফাইজুস সালেহীন,1
8,Aubrey Flora,Raven Roi Laumoc,1
9,Zatriani Zade Nobleza,Chivee Botz Røa,1


## 1.4.2 Write User Tagging Data

In [36]:
def write_nodes(writer, node):
    """Write one GML ``node`` record to ``writer``.

    Args:
        writer: Object with a ``write(str)`` method (e.g. an open text file).
        node: Mapping with the keys 'id' and 'label'.

    The label is emitted as a double-quoted GML string. A literal double
    quote inside the label would terminate that string early and corrupt the
    record, so it is written as the ``&quot;`` entity instead.
    """
    # str() so that a non-string label (e.g. a number) does not raise
    label = str(node['label']).replace('"', '&quot;')

    writer.write('  node\n')
    writer.write('  [\n')
    writer.write('    id ' + str(node['id']) + '\n')
    writer.write('    label "' + label + '"\n')
    writer.write('  ]\n')
    
def write_edges(writer, edge):
    """Write one GML ``edge`` record to ``writer``.

    Args:
        writer: Object with a ``write(str)`` method (e.g. an open text file).
        edge: Mapping with the keys 'source', 'target' and 'value'; each
            value is converted with ``str()`` before being written.
    """
    record_lines = (
        '  edge\n',
        '  [\n',
        '    source ' + str(edge['source']) + '\n',
        '    target ' + str(edge['target']) + '\n',
        '    value ' + str(edge['value']) + '\n',
        '  ]\n',
    )
    for line in record_lines:
        writer.write(line)

### Collated User Tagging Data

In [37]:
# Assign every distinct user a sequential integer id, in order of first appearance.
# NOTE(review): commenter_list and user_tagging_list are not defined in any
# cell visible here - presumably they come from an earlier or removed cell;
# verify this cell survives Restart Kernel -> Run All.
user_dict = {}

for user in commenter_list + user_tagging_list:
    # The next free id always equals the number of users registered so far
    user_dict.setdefault(user, len(user_dict))

index = len(user_dict)

In [38]:
# One node record per user: the integer id plus the user's name as label
node_list = [
    {'id': user_id, 'label': user}
    for user, user_id in user_dict.items()
]

print('Number of nodes:', len(node_list))

Number of nodes: 2907


In [39]:
# One edge per (commenter, tagged user) pair, carrying the tagging count as its value
edge_list = [
    {
        'source': user_dict[commenter],
        'target': user_dict[user_tagging],
        'value': count
    }
    for commenter, user_taggings in commenter_taggings_hash.items()
    for user_tagging, count in user_taggings.items()
]

print('Number of edges:', len(edge_list))

Number of edges: 1610


In [40]:
filename = 'Facebook_Comments_User_Tagging_Collated.gml'

# Write the graph as UTF-8 explicitly: node labels are user names and can be
# non-ASCII, while the default text encoding depends on the platform.
# The with-block closes the file even if one of the writes fails.
with open(ROOT_GEPHI_DATA_FOLDER + filename, "w", encoding="utf-8") as output_file:
    output_file.write("graph\n")
    output_file.write("[\n")

    for node in node_list:
        write_nodes(output_file, node)

    for edge in edge_list:
        write_edges(output_file, edge)

    output_file.write("]\n")

print('Writing of', filename, 'Done!')

Writing of Facebook_Comments_User_Tagging_Collated.gml Done!
