In [1]:
import datetime
import functools
import glob
import os
import pickle

import networkx as nx

In [2]:
# Directory prefix (with trailing separator) for every artefact this notebook
# writes. Built with os.path.join so it is a real directory path on any
# platform; on Windows it evaluates to the same "save_graphs\\time_graph\\"
# the notebook used before.
save_path = os.path.join("save_graphs", "time_graph", "")

# {"original_tweet's user ID": [(reply tweet ID, reply created_at string), ...]}
# one entry per tweet that was a reply to a tweet by this user
in_reply_to_user_dict = {}

# {"original Tweet's ID": [IDs of the tweets that were replies to the original tweet]}
in_reply_to_tweet_dict = {}

# Output files; save_path already ends with a separator, so plain
# concatenation yields a file inside that directory.
user_dict_file = save_path+"in_reply_to_user_dict_delete.txt"
tweet_dict_file = save_path+"in_reply_to_tweet_dict_delete.txt"
edges_record = save_path+"edge_record.txt"

In [3]:
# Tweet IDs (one per line of check_connected_components.txt); add_tweet_nodes
# only adds tweets whose ID is in this set.
tweet_set = set()
with open(save_path+"check_connected_components.txt","r") as check_connected:
    # Strip only the line terminator. Slicing with [:-1] would cut the last
    # digit off the final ID whenever the file does not end with a newline.
    tweet_set.update(element.rstrip("\n") for element in check_connected)
len(tweet_set)

577710

In [4]:
# Spot check: confirm that one specific tweet ID was loaded into tweet_set.
# Prints nothing when the ID is absent.
if '1364413096819896320' in tweet_set:
    print("yes present")

yes present


In [5]:
def total_records(file_name):
    """
    Count the rows of a text file.

    The tweet files hold one tweet per row, so the row count is the number
    of records. Every line is counted, including a header line if the file
    has one; an empty file gives 0.
    """
    with open(file_name,"r") as handle:
        return sum(1 for _ in handle)

In [6]:
def printl(text):
    """
    Append one line to time-graph-logs.txt (under save_path): the current
    local date/time formatted with '%c', three spaces, then `text`.
    """
    with open(save_path+"time-graph-logs.txt","a") as log_file:
        stamp = datetime.datetime.now().strftime('%c')
        log_file.write(f"{stamp}   {text}" + "\n")
        
def write_dicts():
    """
    Dump both reply look-up dicts to disk, overwriting any earlier dump.

    in_reply_to_user_dict goes to user_dict_file and in_reply_to_tweet_dict
    to tweet_dict_file. Each key becomes one line: the key, a tab, then the
    str() of the list stored under it.
    """
    def dump(path, mapping):
        # one "<key>\t<list repr>" line per entry
        with open(path,"w") as out:
            for key, replies in mapping.items():
                out.write(key+"\t"+str(replies)+"\n")

    dump(user_dict_file, in_reply_to_user_dict)
    dump(tweet_dict_file, in_reply_to_tweet_dict)

def get_int_timestamp(dt_string):
    """
    Convert a created_at string such as "Wed Oct 10 20:19:24 +0000 2018"
    into integer seconds since the Unix epoch.

    The string is split on single spaces: tokens 0-2 are weekday, month and
    day, token 3 is the HH:MM:SS time, token 4 is read as the UTC offset and
    token 5 is the year.

    The offset is honoured so the result is an absolute instant. Dropping it
    and calling .timestamp() on a naive datetime would interpret the value in
    the machine's local zone, making results depend on where the notebook
    runs and distorting comparisons around DST changes. If token 4 is not a
    parseable offset, the time is taken to be UTC.

    Raises ValueError for a malformed date and IndexError when fewer than
    six tokens are present.
    """
    dt_items = dt_string.split(" ")
    date_string = "-".join((dt_items[0], dt_items[1], dt_items[2], dt_items[5], dt_items[3]))
    date_object = datetime.datetime.strptime(date_string, "%a-%b-%d-%Y-%H:%M:%S")
    try:
        offset = datetime.datetime.strptime(dt_items[4], "%z").tzinfo
    except ValueError:
        # not an offset token: fall back to UTC rather than the local zone
        offset = datetime.timezone.utc
    return int(date_object.replace(tzinfo=offset).timestamp())

def look_at_all_files(func):
    """
    Decorator: run `func` once for every matching .tsv file in a directory.

    The decorated function must be called with at least two positional
    arguments: args[0] is the directory prefix (including its trailing
    separator, because the glob pattern is appended directly) and args[1]
    is a label that is only written to the log. For each matching file,
    `func` receives the original arguments plus the keyword argument
    `file_name` set to that file's path.

    The wrapper returns None; return values of `func` are discarded.
    """
    @functools.wraps(func)  # keep func's __name__ and docstring on the wrapper
    def inner(*args, **kwargs):
        printl(f"looking at .tsv files in directory {args[0]} for [{args[1]}]")
        # NOTE(review): in glob syntax "[hydrated-tweets]" is a character class
        # matching ONE leading character (a, d-t, w or y), not the literal
        # prefix "hydrated-tweets". Left unchanged because the real file names
        # are not visible here - confirm which was intended.
        # sorted() makes the processing order reproducible; glob.glob returns
        # files in arbitrary order.
        files = sorted(glob.glob(args[0]+'[hydrated-tweets]*.tsv'))
        for file in files:
            kwargs["file_name"]=file
            func(*args, **kwargs)
    return inner

def add_edge_record(source,target,first="no", time=None, tweet_relation=None):
    """
    Append one tab-separated row to the edge record file (edges_record).

    Exactly one kind of row is written, checked in this order:
    - first == "yes": the header row; source and target are ignored
    - time given: source, target, time, "none"
    - tweet_relation given: source, target, "none", tweet_relation
    When none of these applies, nothing is written (the file is still
    opened in append mode, so it is created if missing).
    """
    with open(edges_record,"a") as edge:
        if first=="yes":
            columns = ("source", "destination", "time", "tweet_relation")
        elif time is not None:
            columns = (source, target, time, "none")
        elif tweet_relation is not None:
            columns = (source, target, "none", tweet_relation)
        else:
            return
        edge.write("\t".join(columns) + "\n")
            
# Write the header row of the edge record file. With first="yes" the source
# and target arguments ("a", "b") are ignored. The file is opened in append
# mode, so re-running this cell appends another header row.
add_edge_record("a","b",first="yes")

In [7]:
@look_at_all_files
def add_tweet_nodes(directory, purpose, **kwargs):
    """
    Runs once per tsv tweet file in the data folder (the decorator supplies
    kwargs["file_name"]). Every tweet whose ID is in tweet_set becomes a node
    of the graph, and reply relations are recorded as edges and in the two
    in_reply_to_* dicts.

    calling format
    add_tweet_nodes(directory, purpose, graph="input_graph")
    """
    tsv_path = kwargs["file_name"]
    target_graph = kwargs["graph"]
    printl(f"total records in {tsv_path} are {total_records(tsv_path)}")

    with open(tsv_path,"r") as tsv_rows:
        for line in tsv_rows:
            fields = line.split("\t")
            tweet_id = fields[0]
            # skip the header row and every tweet outside tweet_set
            if tweet_id == "tweet_id" or tweet_id not in tweet_set:
                continue
            target_graph.add_node(tweet_id)

            replied_user = fields[4]
            if replied_user != "":
                # remember this reply (ID and field 5, later parsed as its
                # created-at time) under the user it replies to
                in_reply_to_user_dict.setdefault(replied_user, []).append((tweet_id, fields[5]))

            replied_tweet = fields[3]
            if replied_tweet != "":
                # The tweet is a reply: field 3 holds the ID of the original tweet.
                # The edge runs from the original tweet (earlier) to this reply (later);
                # add_edge creates the original tweet's node if it is not in the graph yet.
                target_graph.add_edge(replied_tweet, tweet_id)
                add_edge_record(replied_tweet, tweet_id, tweet_relation="yes")
                in_reply_to_tweet_dict.setdefault(replied_tweet, []).append(tweet_id)

In [8]:
@look_at_all_files
def make_edges_based_user(directory, purpose, **kwargs):
    """
    it will go over all the tsv tweet files in data folder and add
    edges based on common user between tweets

    For every row whose author (column 2) is a key of in_reply_to_user_dict,
    i.e. a user who has been replied to, the row's tweet is linked with each
    recorded reply to that user. The edge is directed by created-at time,
    from the earlier tweet to the later one. Pairs with identical timestamps
    get no edge and are appended to same_created_time_tweets_delete.txt.

    calling format
    make_edges_based_user(directory, purpose, graph="input_graph")
    """
    file_name = kwargs["file_name"]
    graph = kwargs["graph"]

    with open(file_name,"r") as node_list:
        for count, rows in enumerate(node_list, start=1):
            if count%20000 == 0:
                printl(f"making edges - have read {count} entries in {file_name}")
            cols = rows.split("\t")
            if cols[0] == "tweet_id":
                continue
            # only authors that some recorded tweet replied to are of interest
            if cols[2] not in in_reply_to_user_dict:
                continue

            tweet_1 = cols[0]#tweet ID of tweet by user
            tweet_1_created_at = cols[5]#created at time of tweet by user
            # parse once per row instead of once (or twice) per comparison
            tweet_1_timestamp = get_int_timestamp(tweet_1_created_at)

            for tweet_2, tweet_2_created_at in in_reply_to_user_dict[cols[2]]:
                tweet_2_timestamp = get_int_timestamp(tweet_2_created_at)
                if tweet_1_timestamp > tweet_2_timestamp:
                    #tweet_1 is at later time
                    #edge from earlier to later
                    graph.add_edge(tweet_2, tweet_1)
                    add_edge_record(tweet_2, tweet_1, time=str(tweet_2_created_at)+"$"+str(tweet_1_created_at))
                elif tweet_1_timestamp == tweet_2_timestamp:
                    #same creation time: no direction can be chosen
                    #do not add edge, only log the pair
                    with open(save_path+"same_created_time_tweets_delete.txt","a") as same_time_tweets:
                        same_time_tweets.write(f"tweet 1 - {tweet_1}    tweet 2 - {tweet_2}    created_at_time1 - {tweet_1_created_at}    created_at_time2 - {tweet_2_created_at}")
                        same_time_tweets.write("\n")
                else:
                    #tweet 2 at later time
                    graph.add_edge(tweet_1, tweet_2)
                    add_edge_record(tweet_1, tweet_2, time=str(tweet_1_created_at)+"$"+str(tweet_2_created_at))

In [9]:
# Empty directed graph; the cells below add tweet IDs as nodes and add
# directed edges between them.
G = nx.DiGraph()

In [None]:
# add to graph all the tweet nodes

# iterate through each day's tsv file and add node for each tweet id str
# add edges from original tweets which were replied to, to the replying tweet nodes
# NOTE(review): this pass reads "data1\\" while the edge pass below reads
# "data\\" - confirm the two directories are meant to differ.
add_tweet_nodes("data1\\", "adding tweet nodes", graph=G)
printl("added the tweet nodes")
# write the dicts to disk
write_dicts()

printl(f"the Total number of entries from all tsv files in in_reply_to_user_dict keys is {len(in_reply_to_user_dict.keys())}")
printl(f"the Total number of entries from all tsv files in in_reply_to_tweet_dict keys is {len(in_reply_to_tweet_dict.keys())}")


# make edges based on common users
# directed based on created at time
make_edges_based_user("data\\", "make edges based on common users", graph=G)
printl("added edges among tweet nodes based on user relation")

# Count the pairs that make_edges_based_user logged as having identical
# creation times. It writes them to "same_created_time_tweets_delete.txt",
# so that is the file to count. The file only exists once a tie has been
# logged, hence the existence check.
same_time_log = save_path+"same_created_time_tweets_delete.txt"
same_time_count = total_records(same_time_log) if os.path.exists(same_time_log) else 0
printl(f"the tweets which were being connected based on common user relation but were dropped due to same creation time entry - {same_time_count}")

In [10]:
# Persist the graph as a pickle. nx.write_gpickle was removed in NetworkX 3.0;
# pickling directly with the highest protocol is the documented replacement
# and works on both old and new NetworkX versions.
with open(save_path+"time_graph_delete.gpickle", "wb") as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)