In [15]:
'''
Task 1.1
Read the file soc-redditHyperlinks-title.tsv once 
and create a tab-separated dictionary file dict.tsv with two columns. 

The second column should include 
    the sorted distinct set of distinct names of subreddits
     that appear either as SOURCE_SUBREDDIT or as TARGET_SUBREDDIT. 
The first column gives a unique integer identifier to each name, 
    based on their order, starting from 0. 
'''
import pandas as pd
# Load the raw hyperlink table; the file is tab-delimited.
filename = "soc-redditHyperlinks-title.tsv"
df = pd.read_csv(filename, sep='\t')

# Every name that occurs on either end of a link, de-duplicated and put in
# ascending order.
source_names = set(df['SOURCE_SUBREDDIT'].unique())
target_names = set(df['TARGET_SUBREDDIT'].unique())
subreddits = sorted(source_names | target_names)

# A name's identifier is its position in the sorted list (0, 1, 2, ...).
subreddit_df = pd.DataFrame({
    'id': range(len(subreddits)),
    'subreddit': subreddits,
})

# Emit "<id><TAB><name>" rows, without a header line or the frame index.
subreddit_df.to_csv('dict.tsv', sep='\t', index=False, header=False)


In [16]:
'''
Task 1.2
Read the file soc-redditHyperlinks-title.tsv once more and 
create a tab-separated graph file with adjacency lists graph.tsv as follows. 

    Each line of graph.tsv should contain a subreddit identifier 
     that has at least one outgoing edge in soc-redditHyperlinks-title.tsv. 
    Next to the identifier, after a tab, there should be space-separated triples of the format: 
     timestamp, destination-id, sentiment, one for each outgoing interaction
     from the source subreddit to the other (target) subreddits. 
      Timestamp is the unix epoch time that corresponds to the timestamp of the interaction and it is sensitive to the timezone. 
       For example, in the timezone of Hong Kong, timestamp 1456025511 corresponds to Sunday, 21 February 2016 11:31:51 GMT+08:00. 
    For each line of graph.tsv, the outgoing interactions should be sorted 
     (primarily by timestamp, secondarily by target subreddit) and 
     duplicate interactions should be eliminated.
'''
import pandas as pd
import pytz

# Second pass over the raw link table (tab-delimited), as Task 1.2 asks for.
df = pd.read_csv(filename, delimiter='\t')
# Empty template carrying the column layout of the edge table; the per-source
# edge frames are concatenated onto it further down in this cell.
graph_df = pd.DataFrame(
    columns=['subreddit', 'timestamp', 'destination_id', 'emotion'],
)



def process_group(group):
    """Build the de-duplicated, sorted outgoing-edge table for one source subreddit.

    Intended to be called through ``DataFrame.groupby(...).apply``, which sets
    ``group.name`` to the key of the group being processed.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of the link table that share one source subreddit. Must provide
        the columns ``TIMESTAMP``, ``TARGET_SUBREDDIT`` and ``LINK_SENTIMENT``.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit``, ``timestamp``, ``destination_id``, ``emotion``.
        ``timestamp`` holds float epoch seconds computed as if the timestamp
        text were UTC (no timezone offset is applied here). Rows are unique
        and ordered by timestamp, then by target subreddit.
    """
    outgoing_edges = (
        group[['TIMESTAMP', 'TARGET_SUBREDDIT', 'LINK_SENTIMENT']]
        .drop_duplicates()
        .rename(columns={
            'TIMESTAMP': 'timestamp',
            'TARGET_SUBREDDIT': 'destination_id',
            'LINK_SENTIMENT': 'emotion',
        })
    )

    # Vectorised equivalent of pd.Timestamp(x).timestamp() for naive values:
    # seconds elapsed since 1970-01-01 00:00:00.
    unix_epoch = pd.Timestamp('1970-01-01')
    outgoing_edges = outgoing_edges.assign(
        timestamp=(pd.to_datetime(outgoing_edges['timestamp']) - unix_epoch)
        / pd.Timedelta(seconds=1)
    )

    # The task requires each adjacency list to be ordered primarily by
    # timestamp and secondarily by target subreddit; the input row order does
    # not guarantee that.
    outgoing_edges = outgoing_edges.sort_values(['timestamp', 'destination_id'])

    outgoing_edges.insert(0, 'subreddit', group.name)
    return outgoing_edges

# Seconds by which UTC+08:00 (the Hong Kong offset used by the task's example)
# runs ahead of UTC.
HK_UTC_OFFSET_SECONDS = 8 * 60 * 60

grouped = df.groupby('SOURCE_SUBREDDIT')
# Build the edge table straight from the per-source frames: concatenating onto
# an empty placeholder frame raises a FutureWarning in recent pandas.
graph_df = grouped.apply(process_group).reset_index(drop=True)
# Guarantee the expected column layout even when there are no groups at all.
graph_df = graph_df.reindex(columns=['subreddit', 'timestamp', 'destination_id', 'emotion'])

# Convert to hk zone: process_group computed epoch seconds as if the timestamp
# text were UTC, so remove the +08:00 offset to get the real epoch time.
graph_df['timestamp'] = graph_df['timestamp'] - HK_UTC_OFFSET_SECONDS
# Convert to string: whole seconds rendered as text, e.g. '1456025511'.
graph_df['timestamp'] = graph_df['timestamp'].astype('int64').astype(str)

print(graph_df)

               subreddit   timestamp   destination_id emotion
0                    007  1456025511     daniel_craig       1
1          07thexpansion  1399709325     visualnovels       1
2       098f6bcd4621d373  1486537717        askreddit       1
3             0________0  1477368034    todayilearned       1
4                0magick  1482323500           occult       1
...                  ...         ...              ...     ...
562627         zyramains  1483350482  leagueoflegends       1
562628         zyramains  1486003374  leagueoflegends       1
562629         zyramains  1491027357   summonerschool       1
562630              zyzz  1401737952     bodybuilding       1
562631              zyzz  1465406908       girlsmirin       1

[562632 rows x 4 columns]


In [17]:
# NOTE: this cell rewrites graph_df in place, replacing names with ids. Run it
# exactly once after the cell that builds graph_df; running it again would look
# the ids up as if they were names and turn them into -1.

# Read the dict.tsv file and create a dictionary mapping subreddit names to subreddit ids.
# Names are read verbatim as strings so that a name that looks like a number or
# like a missing-value token (e.g. "null", "nan") still matches its own entry.
dict_df = pd.read_csv(
    'dict.tsv',
    sep='\t',
    header=None,
    names=['id', 'subreddit'],
    dtype={'id': 'int64', 'subreddit': str},
    keep_default_na=False,
)
subreddit_dict = dict(zip(dict_df['subreddit'], dict_df['id']))

# Convert the data type of subreddit and destination_id columns to match the data type of the keys in subreddit_dict
graph_df['subreddit'] = graph_df['subreddit'].astype(str)
graph_df['destination_id'] = graph_df['destination_id'].astype(str)

# Replace the subreddit names with the subreddit ids, and fill NaN values with -1.
# A single unmapped name makes map() return a float column, so cast back to
# int64; otherwise every id would later be written as e.g. "3242.0".
graph_df['subreddit'] = graph_df['subreddit'].map(subreddit_dict).fillna(-1).astype('int64')
graph_df['destination_id'] = graph_df['destination_id'].map(subreddit_dict).fillna(-1).astype('int64')

print(graph_df)

        subreddit   timestamp  destination_id emotion
0               0  1456025511           11584       1
1               1  1399709325           51374       1
2               3  1486537717            3242       1
3               4  1477368034           48212       1
4               5  1482323500           32872       1
...           ...         ...             ...     ...
562627      54072  1483350482           25860       1
562628      54072  1486003374           25860       1
562629      54072  1491027357           44951       1
562630      54073  1401737952            6242       1
562631      54073  1465406908           19155       1

[562632 rows x 4 columns]


In [18]:
# Sanity check: show the column labels of graph_df.
print(graph_df.columns)

Index(['subreddit', 'timestamp', 'destination_id', 'emotion'], dtype='object')


In [22]:
def create_adjacency_list(graph_df, output_file):
    """Write one adjacency-list line per source subreddit to ``output_file``.

    Each line holds the source identifier, a tab, and then that source's
    outgoing edges as space-separated ``timestamp,destination_id,emotion``
    triples. Within a line, duplicate triples are written once and the triples
    are ordered by timestamp, then by destination id. Lines appear in the
    order in which each source first occurs in ``graph_df``.

    Parameters
    ----------
    graph_df : pandas.DataFrame
        Edge table with the columns ``subreddit``, ``timestamp``,
        ``destination_id`` and ``emotion``. Every value is written using its
        ``str()`` form.
    output_file : str
        Path of the file to create (an existing file is overwritten).
    """

    def _sortable(text):
        # Numeric-looking fields are ordered by value; anything else (including
        # "nan") is ordered as plain text after all the numbers.
        try:
            number = float(text)
        except ValueError:
            return (1, 0.0, text)
        if number != number:
            return (1, 0.0, text)
        return (0, number, '')

    columns = ['subreddit', 'timestamp', 'destination_id', 'emotion']
    adjacency_list = {}

    # Build the adjacency list. itertuples keeps each column's own dtype
    # (iterrows upcasts values across a row) and is much faster on large frames.
    # A set per source removes duplicate interactions.
    for subreddit, timestamp, destination_id, emotion in graph_df[columns].itertuples(index=False, name=None):
        edge = (str(timestamp), str(destination_id), str(emotion))
        adjacency_list.setdefault(str(subreddit), set()).add(edge)

    # Write the adjacency list to the file.
    with open(output_file, 'w') as f:
        for subreddit, edges in adjacency_list.items():
            # Primary key: timestamp; secondary key: destination id. When the
            # ids come from dict.tsv they were assigned in sorted-name order,
            # so id order matches target-name order. The sentiment is only a
            # tie-breaker that keeps the output deterministic.
            ordered = sorted(
                edges,
                key=lambda edge: (_sortable(edge[0]), _sortable(edge[1]), edge[2]),
            )
            # Triples are separated by single spaces, as the task requires.
            destinations = ' '.join(','.join(edge) for edge in ordered)
            f.write(f"{subreddit}\t{destinations}\n")

# Write the edges held in graph_df to the adjacency-list file graph.tsv.
create_adjacency_list(graph_df, 'graph.tsv')