In [1]:
import datetime
import functools
import glob
import os
import pickle

import networkx as nx

In [2]:
# Output directory, kept WITH a trailing separator because the rest of the
# notebook builds file paths as save_path + "<file name>".
# os.path.join yields "save_graphs\\heterogenous_graph\\" on Windows (the
# original value) and a valid path on systems where "\\" is not a separator.
save_path = os.path.join("save_graphs", "heterogenous_graph", "")

# {'$' + original tweet's user ID: [(reply tweet ID, sixth column of the reply row), ...]}
# NOTE(review): the sixth column looks like the tweet's creation time - confirm against the data files.
in_reply_to_user_dict = {}

# {original tweet's ID: [IDs of the tweets that were replies to that tweet]}
in_reply_to_tweet_dict = {}

# text dumps of the two dictionaries above
user_dict_file = save_path + "in_reply_to_user_dict.txt"
tweet_dict_file = save_path + "in_reply_to_tweet_dict.txt"

In [3]:
def total_records(file_name):
    """
    Count the lines of ``file_name`` and return that count.

    Every line is counted, so a header line (if the file has one) is part
    of the total. The tweet files hold a single tweet per row, each with a
    unique tweet id.
    """
    with open(file_name, "r") as handle:
        return sum(1 for _ in handle)

In [4]:
def printl(text):
    """
    Append ``text`` as one line to heterogenous-graph-logs.txt in ``save_path``.

    The line starts with the current local time (``strftime('%c')``)
    followed by three spaces. The log file is opened in append mode on
    every call.
    """
    log_path = save_path+"heterogenous-graph-logs.txt"
    with open(log_path,"a") as log_file:
        stamp = datetime.datetime.now().strftime('%c')
        entry = f"{stamp}   {text}"
        log_file.write(entry + "\n")
        
def write_dicts():
    """
    Dump both reply dictionaries to their text files, replacing old content.

    ``in_reply_to_user_dict`` goes to ``user_dict_file`` first, then
    ``in_reply_to_tweet_dict`` goes to ``tweet_dict_file``. Each key produces
    one line of the form ``key<TAB>str(value)``.
    """
    def _dump(path, mapping):
        # one "key<TAB>value" line per entry, in the mapping's iteration order
        with open(path,"w") as out:
            for key, replies in mapping.items():
                out.write(key+"\t"+str(replies))
                out.write("\n")

    _dump(user_dict_file, in_reply_to_user_dict)
    _dump(tweet_dict_file, in_reply_to_tweet_dict)

def look_at_all_files(func):
    """
    Decorator: call ``func`` once for every matching .tsv file in a directory.

    The decorated function must be called with at least two positional
    arguments, ``(directory, purpose)``. ``directory`` is concatenated
    verbatim with the file pattern, so it has to end with a path separator;
    ``purpose`` is only written to the log. For each matching file ``func``
    is called with the same arguments plus the keyword argument
    ``file_name`` set to that file's path.

    Files are visited in sorted path order so that repeated runs process
    them in the same sequence. Return values of ``func`` are discarded and
    the wrapper itself returns ``None``.
    """
    @functools.wraps(func)  # keep the decorated function's name and docstring
    def inner(*args, **kwargs):
        printl(f"looking at .tsv files in directory {args[0]} for [{args[1]}]")
        # NOTE(review): in a glob pattern "[hydrated-tweets]" is a character
        # class matching ONE character (a, w, y or anything in d-t), not the
        # literal prefix "hydrated-tweets". Any .tsv file whose name starts
        # with one of those characters is therefore picked up. Left unchanged
        # because the intended file naming cannot be confirmed from here.
        files = sorted(glob.glob(args[0]+'[hydrated-tweets]*.tsv'))
        for file in files:
            kwargs["file_name"]=file
            func(*args, **kwargs)
    return inner

In [5]:
# Edges in the graph connect:
# a tweet to its original tweet (the tweet is a reply to the original tweet)
# a tweet to the user who posted it
# a tweet to the original user (the tweet is a reply to a tweet by the original user)
# a tweet to another tweet (common-user relation: a tweet posted in
#     reply to the original user is related to the other tweets of that user)
# a user to the original user (the user replied to the original user)


@look_at_all_files
def add_all_nodes(directory, purpose, **kwargs):
    """
    Add the nodes and reply edges found in one tsv tweet file to the graph.

    The ``look_at_all_files`` decorator calls this once per tsv file in
    ``directory`` and supplies the ``file_name`` keyword. For every row it
    i) adds the tweet as a node (node key: the tweet ID)
    ii) adds the posting user as a node (node key: '$' + user ID, so user
        nodes are kept apart from tweet nodes)
    iii) adds the tweet-user, tweet-original user, user-original user and
         original tweet-tweet edges
    iv) records the reply in ``in_reply_to_user_dict`` and
        ``in_reply_to_tweet_dict``

    Columns read from each tab-separated row: 0 = tweet ID, 2 = ID of the
    user who posted the tweet, 3 = ID of the tweet replied to ("" if none),
    4 = ID of the user replied to ("" if none), 5 = value stored next to
    the tweet ID in ``in_reply_to_user_dict``.

    calling format
    add_all_nodes(directory, purpose, graph=input_graph)
    """

    file_name = kwargs["file_name"]
    graph = kwargs["graph"]
    printl(f"total records in {file_name} are {total_records(file_name)}")

    with open(file_name,"r") as node_list:
        for rows in node_list:

            # drop the line terminator so the last column never carries "\n"
            cols = rows.rstrip("\n").split("\t")
            if cols[0] == "tweet_id":
                continue  # header row

            tweet_id = cols[0]
            user_node = '$'+cols[2]
            graph.add_node(tweet_id) # add tweet node
            graph.add_node(user_node) # add user node

            # tweet to user who posted the tweet (based on user who made the tweet relation)
            graph.add_edge(tweet_id, user_node)

            if cols[4] != "":
                original_user_node = '$'+cols[4]

                # tweet to original user (based on tweet made as a reply to original tweet by original user)
                graph.add_edge(tweet_id, original_user_node)

                # user to original user (based on reply to original user relation)
                # Both ends must be '$'-prefixed user nodes; using the bare
                # user ID here would create a stray node that is not the
                # user node added above.
                graph.add_edge(user_node, original_user_node)

                in_reply_to_user_dict.setdefault(original_user_node, []).append((tweet_id, cols[5]))

            if cols[3] != "":
                # if the represented Tweet is a reply, this field contains the
                # string representation of the original Tweet's ID.
                # The graph is undirected, so this is an edge between the
                # current tweet ID (later) and in_reply_to_status_id_str (earlier);
                # the original tweet's node is added automatically if not present.

                # tweet to original tweet (based on reply to original tweet relation)
                graph.add_edge(cols[3], tweet_id)

                in_reply_to_tweet_dict.setdefault(cols[3], []).append(tweet_id)
            

In [6]:
@look_at_all_files
def make_edges_based_user(directory, purpose, **kwargs):
    """
    Link tweets that are related through a common user.

    Called once per tsv tweet file by the ``look_at_all_files`` decorator,
    which supplies ``file_name``. For every tweet row, the tweet is
    connected to each tweet that ``in_reply_to_user_dict`` lists as a reply
    to the row's author (key: '$' + the author's user ID, i.e. the
    in_reply_to_user_id_str of the replying tweets).

    A progress line is logged every 20000 lines read (the header line is
    included in that count).

    calling format
    make_edges_based_user(directory, purpose, graph=input_graph)
    """
    file_name = kwargs["file_name"]
    graph = kwargs["graph"]

    with open(file_name,"r") as tweet_rows:
        for line_no, line in enumerate(tweet_rows, start=1):
            if line_no%20000 == 0:
                printl(f"making edges - have read {line_no} entries in {file_name}")

            fields = line.split("\t")
            if fields[0] == "tweet_id":
                # header row
                continue

            author_tweet = fields[0]  # tweet ID of a tweet by this user
            # entries are (reply tweet ID, sixth column) tuples; only the ID is needed
            for reply in in_reply_to_user_dict.get('$'+fields[2], ()):
                graph.add_edge(reply[0], author_tweet)

In [7]:
# Single undirected graph (nx.Graph) filled by the cells below; it holds both
# tweet-ID nodes and '$'-prefixed user-ID nodes.
G = nx.Graph()

In [8]:
# Build the graph in two passes over the tsv files of the data directory.

# The directory string is concatenated with the file pattern, so it must end
# with a separator; os.path.join("data", "") gives "data\\" on Windows (the
# original value) and a valid path on other systems.
data_dir = os.path.join("data", "")

# pass 1: iterate through each day's tsv file and add a node for each tweet
# id and each user id, plus the reply edges
add_all_nodes(data_dir, "adding all nodes", graph=G)
printl("added the tweet nodes")
# write the dicts to disk
write_dicts()

printl(f"the Total number of entries from all tsv files in in_reply_to_user_dict keys is {len(in_reply_to_user_dict)}")
printl(f"the Total number of entries from all tsv files in in_reply_to_tweet_dict keys is {len(in_reply_to_tweet_dict)}")

# pass 2: make edges based on common users
# (G is an undirected nx.Graph and no timestamps are compared, so these edges
# carry no direction)
make_edges_based_user(data_dir, "make edges based on common users", graph=G)
printl("added edges among tweet nodes based on user relation")

In [9]:
# nx.write_gpickle was removed in NetworkX 3.0. Pickling the graph directly
# works on every NetworkX version and produces a plain pickle file that can
# be read back with pickle.load.
with open(save_path+"heterogenous_graph.gpickle", "wb") as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)