In [None]:
# Libs
import csv
import os
import time

from twython import Twython, TwythonError, TwythonRateLimitError

import libs.preprocessor as tweet_preproc

# Shared preprocessor instance; its to_datetime() method is used further down
# to parse the created_at value of each looked-up tweet.
twitterPreprocessor = tweet_preproc.TwitterPreprocessor()

In [None]:
# Number of rows in a file
def file_len(fname):
    """Count the lines of a text file, print the count and return it.

    Args:
        fname: path of the file to read.

    Returns:
        int: number of lines in the file; 0 for an empty file.
    """
    with open(fname) as f:
        # Summing over the iterator also covers the empty-file case, where a
        # loop variable would never be bound.
        nbrOfLines = sum(1 for _ in f)

    print("Nbr of lines : " + str(nbrOfLines))

    return nbrOfLines

In [None]:
# API credentials live in a local `credentials` module; the star import is
# expected to provide CONSUMER_KEY, CONSUMER_SECRET, oauth_token and oauth_secret.
from credentials import *
# Twython client built from those four values; used by traverse_ids for status lookups.
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, oauth_token, oauth_secret)

# Traverse Source Datasets

The `election-filter*.txt` datasets contain only tweet IDs, so we have no idea when these tweets were posted. For this reason, we step through the IDs of every dataset at a regular increment and query the `created_at` attribute of each sampled tweet.

In [None]:
def traverse_ids(filename,
                 database_name,
                 nb_requests=500):  # less than 900 because many tweet lookup fail
    """Sample tweet ids from a file at a regular step and record their creation dates.

    Every ``lookup_inc``-th line of ``filename`` is looked up through the
    module-level ``twitter`` client. For each successful lookup one
    ``row,created_at`` line is written to ``database_name``, where ``row`` is
    the 0-based line index of the id in ``filename``. When a lookup raises a
    ``TwythonError`` the following line is tried instead; on a
    ``TwythonRateLimitError`` the function additionally sleeps before going on.

    Args:
        filename: path of a text file holding one tweet id per line.
        database_name: path of the CSV file to create (overwritten if present).
        nb_requests: approximate number of regularly spaced lookups to attempt.
    """
    print(" --- " + filename + " --- ")

    # Count number of lines in file
    linesCount = file_len(filename)

    # Step between two sampled rows. Clamped to 1: for a file with far fewer
    # lines than nb_requests the rounded ratio is 0, and `% 0` would raise.
    lookup_inc = max(1, round(linesCount / (1.0 * nb_requests)))

    # Pause after a rate-limit error: 15 min 30 s
    rate_limit_pause = 930

    # Both files are managed by `with`, so they are closed even if a lookup
    # raises an unexpected exception.
    with open(filename, "r") as id_file:
        with open(database_name, 'w+', newline='', encoding="utf-8") as csvfile:

            # First line: the headers
            csvfile.write("row,created_at\n")

            retryNext = False
            for row_counter, row in enumerate(id_file):

                # Only query every lookup_inc-th row, or the row right after a failure
                if row_counter % lookup_inc != 0 and not retryNext:
                    continue

                retryNext = False
                tweet_id = str(row).strip()

                try:
                    status = twitter.show_status(id=tweet_id)

                    # Parse created_at and append the sample to the CSV
                    created_at = twitterPreprocessor.to_datetime(status['created_at'])
                    csvfile.write(str(row_counter) + "," + str(created_at) + "\n")

                except TwythonError as e:

                    # Lookup failed: fall back to the next id in the file
                    retryNext = True

                    if isinstance(e, TwythonRateLimitError):
                        print("sleeping for 15 min and 30 seconds")
                        time.sleep(rate_limit_pause)

In [None]:
# Source id files, one entry per election-filter collection
collections = [
    "sources/general/election-filter1.txt",
    "sources/general/election-filter2.txt",
    "sources/general/election-filter3.txt",
    "sources/general/election-filter4.txt",
    "sources/general/election-filter5.txt",
    "sources/general/election-filter6.txt",
]

In [None]:
# Index (0-based) into `collections` of the id file handled by this run
collection_id = 0

# Output file for the sampled rows; output files are numbered from 1
database_name = "sources/general/traversed/election-filter" + str(collection_id + 1) + ".txt"

traverse_ids(collections[collection_id], database_name, nb_requests=500)

# Merge Traverses

In [None]:
from glob import glob


def merge_traverse_files(filenames, output_name):
    """Concatenate per-collection traverse CSVs into one CSV with a `file` column.

    The first line of every input file is treated as a header and dropped.
    Every remaining non-blank line is written to ``output_name`` prefixed with
    the base name of the file it came from.

    Args:
        filenames: iterable of paths to ``row,created_at`` CSV files.
        output_name: path of the merged CSV to create (overwritten if present).
    """
    # Create the destination directory if needed so the open() below cannot
    # fail on a fresh checkout.
    out_dir = os.path.dirname(output_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8 on both sides instead of the platform default encoding.
    with open(output_name, 'w', encoding="utf-8") as outfile:

        # Write header
        outfile.write("file,row,created_at\n")

        for fname in filenames:
            # basename handles the platform's path separator, unlike split("/")
            name = os.path.basename(fname)

            with open(fname, encoding="utf-8") as infile:

                # Drop the per-file header line (no-op on an empty file)
                next(infile, None)

                for line in infile:
                    line = line.strip()

                    # A blank line would otherwise become a malformed "name," row
                    if not line:
                        continue

                    outfile.write(name + "," + line + "\n")


# Grab all the traversed files; sorted because glob returns them in arbitrary order.
# NOTE(review): the traverse cell writes to sources/general/traversed/election-filterN.txt,
# while this pattern reads from a traverse/ sub-directory - confirm the files are moved there.
filenames = sorted(glob('sources/general/traversed/traverse/*'))

# Merged output file
output_name = "databases/all_traverse.csv"

merge_traverse_files(filenames, output_name)
                
                


## Order by created_at

In [None]:
# Path of the CSV that receives the merged rows once they have been sorted
ordered_output_name = "databases/all_traverse_ordered.csv"

def x_order(x):
    """Build the sort key of a merged traverse record: date first, then row.

    Args:
        x: mapping with a 'created_at' string that starts with a
           ``YYYY-MM-DD`` date (a time part following a space or ``T`` is
           ignored) and a 'row' value convertible to ``int``.

    Returns:
        int: ``YYYYMMDD * 10**12 + row``. Keys compare by date and, within
        the same date, by row, as long as row is below 10**12.
    """
    x_row = int(x['row'])

    # Keep only the date; drop a time component if one is present
    date_part = x['created_at'].strip().replace("T", " ").split(" ", 1)[0]
    parts = date_part.split("-")
    x_createdAt = int(parts[2]) + int(parts[1]) * 100 + int(parts[0]) * 10000

    # Exact integer key. Adding row / 10**12 to the date as a float rounds the
    # row away: near 2e7 the float spacing is about 3.7e-9, so small or close
    # row numbers would all compare equal.
    orderX = x_createdAt * 10**12 + x_row

    return orderX
    

# Read the merged CSV completely before touching the output, so that a missing
# or unreadable input does not leave a truncated output file behind.
with open(output_name, 'r', newline='', encoding="utf-8") as csvfile:

    # init reader
    reader = csv.reader(csvfile, delimiter=',')

    # Taking the header of the file + the index of useful columns:
    header = next(reader)
    ind_createdAt = header.index('created_at')
    ind_row = header.index('row')
    ind_file = header.index('file')

    # One dict per data row; csv.reader yields [] for blank lines, which are skipped
    allData = [
        {
            "created_at": row[ind_createdAt],
            "row": row[ind_row],
            "file": row[ind_file],
        }
        for row in reader
        if row
    ]

# Sort by date, then by row
sortedData = sorted(allData, key=x_order)

# Written as UTF-8 to match the encoding used when reading the input
with open(ordered_output_name, 'w', encoding="utf-8") as outfile:

    # Write header
    outfile.write("file,row,created_at\n")

    # Write to output file
    for datum in sortedData:
        outfile.write(datum['file'] + "," + datum['row'] + "," + datum['created_at'] + "\n")