#### 1. Importing necessary libraries 

In [1]:
import pandas as pd

#### 2. Reading the data file into a dataframe and storing inside a variable

In [2]:
main_df = pd.ExcelFile('HugoTikTokComments.xlsx')
#Display the list of worksheets in the file
len(main_df.sheet_names)

209

In [3]:
#Example of data in one of the worksheets in a 2d format
main_df.parse(sheet_name = "2020-09-11") 

Unnamed: 0,Now,Tue Mar 22 2022 20:38:24 GMT+0800 (Singapore Standard Time),Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Post URL,https://www.tiktok.com/@hugo.nerman/video/6871...,,,,,,,,,
1,Publisher Nickname,Hugo Nerman,,,,,,,,,
2,Publisher URL,https://www.tiktok.com/@hugo.nerman,,,,,,,,,
3,Publisher @,hugo.nerman,,,,,,,,,
4,Publish Time,11-9-2020,,,,,,,,,
5,Post Likes,2666,,,,,,,,,
6,Description,"First TikTok, don’t let it flop. #cars #racing...",,,,,,,,,
7,Number of 1st level comments,19,,,,,,,,,
8,Number of 2nd level comments,11,,,,,,,,,
9,"Total Comments (actual, in this list, rendered...",30,,,,,,,,,


#### 3. Reading each worksheet into a dataframe and storing inside an array

In [6]:
frames = []
#Looping through each worksheet and selecting only four columns
for i in range(len(main_df.sheet_names)):
    temp_variable = pd.DataFrame(main_df.parse(sheet_name = main_df.sheet_names[i]),columns=['Unnamed: 2','Unnamed: 6','Unnamed: 8','Unnamed: 10'])
    #Renaming the columns with proper names
    temp_variable.rename(columns = {'Unnamed: 2':'Username', 'Unnamed: 6':'No_of_likes', 'Unnamed: 8':'Second_Level_Comment', 'Unnamed: 10':'No_of_replies'}, inplace = True)
    frames.append(temp_variable)

In [7]:
#Example of data in one of the worksheets in a 2d format
frames[0]

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


#### 4. Combining all worksheets into one dataframe

In [8]:
combined_df = pd.concat(frames, join="inner", sort=False)
combined_df

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
531,thembhakau,0,No,0
532,eoinmcgowan_11,1,No,0
533,antony.dimauro,1,No,0
534,doggochoppa,0,No,0


#### 5.  Removing empty rows and old headers

In [65]:
#Removing rows with no values in Username column
combined_df.dropna(subset=['Username'], inplace=True)

#Removing rows with 'User @' in Username column
unique_df = combined_df[combined_df.Username != 'User @']

#unique_df1 = unique_df[unique_df.Second_Level_Comment != 'Yes']

unique_df

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies
13,stl.mbl,7,No,0
14,_lasselosgodis_,6,No,1
15,hugo.nerman,12,Yes,---
16,the_backstaber,2,No,0
17,harrisonnn.d,2,No,3
...,...,...,...,...
531,thembhakau,0,No,0
532,eoinmcgowan_11,1,No,0
533,antony.dimauro,1,No,0
534,doggochoppa,0,No,0


#### 6.  Removing duplicated Usernames and accumulating No_of_Likes

In [67]:
#Extracting duplicates and then grouping them together
duplicates = pd.concat(g for _, g in unique_df.groupby('Username') if len(g) > 1)
duplicates

#Accumulating No_of_Likes
...

Ellipsis

In [68]:
duplicates['No_of_likes'] = duplicates['No_of_likes'].astype(int)
duplicates['Total_likes'] = duplicates.groupby(['Username'])['No_of_likes'].transform('sum')

In [69]:
temp_dict = {}

for index, row in duplicates.iterrows():
  if row['Username'] not in temp_dict:
    temp_dict[row['Username']] = row['Second_Level_Comment']
  elif (row['Username'] not in temp_dict) and (temp_dict[row['Username']] != "No"):
    temp_dict[row['Username']] = row['Second_Level_Comment']

for k,v in temp_dict.items():
  if v == 'Yes':
    temp_dict[k] = 2
  else:
    temp_dict[k] = 1

# duplicates = duplicates.assign(degree_level = temp_dict[duplicates['Second_Level_Comment']])
duplicates['degree_level'] = duplicates.apply(
    lambda row: temp_dict[row.Username], axis=1)

duplicates

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies,Total_likes,degree_level
18,..ipe,0,Yes,---,4,2
17,..ipe,4,No,1,4,2
272,..ipe,0,No,0,4,2
27,.ayayayayayayayayayaya,1,Yes,---,3,2
25,.ayayayayayayayayayaya,2,Yes,---,3,2
...,...,...,...,...,...,...
14,zschulz,3,Yes,---,4,1
78,zvenzon,0,No,0,4,1
22,zvenzon,4,No,0,4,1
40,zzz_364,0,No,0,0,1


In [71]:
new_df = duplicates.drop_duplicates(subset=['Username'])
new_df

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies,Total_likes,degree_level
18,..ipe,0,Yes,---,4,2
27,.ayayayayayayayayayaya,1,Yes,---,3,2
16,.davoman,4,Yes,---,9,2
51,.deannaaa.x,1,No,0,2,1
574,.dropcake,0,No,2,0,1
...,...,...,...,...,...,...
24,zoeyeracing,6,No,0,115,1
58,zs1kr,0,No,0,0,1
124,zschulz,1,No,0,4,1
78,zvenzon,0,No,0,4,1


In [73]:
#Drops all duplicates
unique_df.sort_values("Username", inplace = True)
unique_df.drop_duplicates(subset ="Username",keep = False, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [75]:
unique_df['Total_likes'] = unique_df.groupby(['Username'])['No_of_likes'].transform('sum')
unique_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['Total_likes'] = unique_df.groupby(['Username'])['No_of_likes'].transform('sum')


Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies,Total_likes
183,.......owen,0,No,0,0
75,......nahman,0,No,0,0
36,....drip,3,Yes,---,3
13,..auzzy,8,No,0,8
97,..cashmoneybaby,0,No,0,0
...,...,...,...,...,...
65,zyjami,0,No,0,0
198,zyruz,0,No,0,0
257,zyusuf322,0,No,0,0
284,zzz_zad,1,Yes,---,1


In [76]:
temp_dict = {}

for index, row in unique_df.iterrows():
  if row['Username'] not in temp_dict:
    temp_dict[row['Username']] = row['Second_Level_Comment']
  elif (row['Username'] not in temp_dict) and (temp_dict[row['Username']] != "No"):
    temp_dict[row['Username']] = row['Second_Level_Comment']

for k,v in temp_dict.items():
  if v == 'Yes':
    temp_dict[k] = 2
  else:
    temp_dict[k] = 1

# duplicates = duplicates.assign(degree_level = temp_dict[duplicates['Second_Level_Comment']])
unique_df['degree_level'] = unique_df.apply(
    lambda row: temp_dict[row.Username], axis=1)

unique_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['degree_level'] = unique_df.apply(


Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies,Total_likes,degree_level
183,.......owen,0,No,0,0,1
75,......nahman,0,No,0,0,1
36,....drip,3,Yes,---,3,2
13,..auzzy,8,No,0,8,1
97,..cashmoneybaby,0,No,0,0,1
...,...,...,...,...,...,...
65,zyjami,0,No,0,0,1
198,zyruz,0,No,0,0,1
257,zyusuf322,0,No,0,0,1
284,zzz_zad,1,Yes,---,1,2


In [77]:
#Adding them back to the original dataframe
frames = [unique_df, new_df]
  
result = pd.concat(frames)
result

Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies,Total_likes,degree_level
183,.......owen,0,No,0,0,1
75,......nahman,0,No,0,0,1
36,....drip,3,Yes,---,3,2
13,..auzzy,8,No,0,8,1
97,..cashmoneybaby,0,No,0,0,1
...,...,...,...,...,...,...
24,zoeyeracing,6,No,0,115,1
58,zs1kr,0,No,0,0,1
124,zschulz,1,No,0,4,1
78,zvenzon,0,No,0,4,1


In [78]:
# result.drop("No_of_likes", axis=1, inplace=True)
result.drop("Second_Level_Comment", axis=1, inplace=True)


In [79]:
result

Unnamed: 0,Username,No_of_likes,No_of_replies,Total_likes,degree_level
183,.......owen,0,0,0,1
75,......nahman,0,0,0,1
36,....drip,3,---,3,2
13,..auzzy,8,0,8,1
97,..cashmoneybaby,0,0,0,1
...,...,...,...,...,...
24,zoeyeracing,6,0,115,1
58,zs1kr,0,0,0,1
124,zschulz,1,0,4,1
78,zvenzon,0,0,4,1


In [80]:
result.to_csv('network_result.csv', index=False)

In [81]:
import sys, time

def progress(v):
    v = str(v)
    sys.stdout.flush()
    sys.stdout.write('\r')
    sys.stdout.flush()
    sys.stdout.write(v)

In [83]:
f = open("output.gml", "w")
#helpers
s = " "
ss = s+s
sss = s+s+s
ssss = s+s+s+s
nl = "\n"

#loop helpers
added = []
ind = 0

#Root node
f.write("graph"+nl)
f.write("["+nl)

#Write an edge
def write_edge(r):
    f.write( ss + "edge" + nl)
    f.write( ss + "[" + nl)
    f.write( ssss + "source" + s + '"' + str(r['AUTHOR_ID']) + '"' + nl)
    f.write( ssss + "target" + s + '"' + str(r['CO-AUTHOR_ID']) + '"' + nl)
    f.write( ssss + "value" + s + str(r['NO_OF_BOOKS']) + '"' + nl)
    f.write( ss + "]"+ nl)

#Write a node
def write_node(r):
    f.write( ss + "node" + nl)
    f.write( ss + "[" + nl)
    f.write( ssss + "id" + s + '"' + str(ind) + '"' + nl)
    f.write( ssss + "label" + s + '"' + str(r['Username']) + '"' + nl)
    f.write( ssss + "value" + s + '"' + str(r['Total_likes']) + '"' + nl)
    f.write( ss + "]"+ nl)

#Generate nodes
for i, r in result.iterrows():
    #increment, as index not reliable
    ind += 1
    #Check for duplicates
    if (r['Username'] not in added):
        #Add to list
        added.append(r['Username'])
        write_node(r)
    #print the progress    
    progress(ind)

print(nl+"Printing nodes over")

#flush index
ind = 0    
#Generate edges    
for i, r in df.iterrows():
    #increment, as index not reliable
    ind += 1
    if(r['AUTHOR_ID'] < r['CO-AUTHOR_ID']):
        write_edge(r)
    #print the progress            
    progress(ind)

print(nl+"Printing nodes and edges over")

#closing node
f.write("]"+nl)
f.close()

1500

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



3358

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



5213

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



7168

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



8964

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



10887

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



12756

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



14630

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



16364

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [66]:
#unique_df.sort_values("Username", inplace=True)
#unique_df.drop_duplicates(subset = ["Username"],keep = 'first')

#unique_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Username,No_of_likes,Second_Level_Comment,No_of_replies
183,.......owen,0,No,0
75,......nahman,0,No,0
36,....drip,3,Yes,---
13,..auzzy,8,No,0
97,..cashmoneybaby,0,No,0
...,...,...,...,...
257,zyusuf322,0,No,0
40,zzz_364,0,No,0
73,zzz_364,0,No,0
284,zzz_zad,1,Yes,---
