# Part 6: Discussions – Potential Concerns and Issues

This notebook walks through the log files to determine how many tweets were accessed, inaccessible, etc. The results are for use in my Discussions/Limitations section.

In [9]:
import os 
import pandas as pd
import requests

In [2]:
def getFileContents(path):
    '''
    Reads a text file and returns its contents as a list of lines.

    Parameters:
        path: path of the file to read (opened in text mode, read-only).

    Returns:
        list of str, one entry per line, with line endings preserved
        (exactly what file.readlines() yields).
    '''
    # The context manager guarantees the handle is closed, even when reading
    # raises; previously the file object was opened and never closed.
    with open(path, 'r') as file:
        return file.readlines()

def getTweetCount(line1, line2):
    '''
    Extracts the tweet counts and the tweet file name from a pair of log lines.

    Both lines are split on whitespace. From line1 the tokens at positions 8
    and 11 are taken as the "obtained" and "accessed" counts; from line2 the
    token at position 5 is taken as the file name.

    Returns:
        (obtained, accessed, fileName) -- all three are strings, exactly as
        they appear in the log text.
    '''
    countTokens = line1.split()
    tweetsObtained = countTokens[8]
    tweetsAccessed = countTokens[11]

    fileName = line2.split()[5]

    return tweetsObtained, tweetsAccessed, fileName

In [3]:
# file = 'collect-05142020-05172020.log'
# basePath = '/students/jw10/cs315/NOT_FOR_USE/local-collection-logs'
# path = f'{basePath}/{file}'
# logArr = getFileContents(path)

def getTweetAttainment(contentsArr):
    '''
    Parses the lines of one log file into per-tweet-file counts.

    The lines are consumed two at a time and each pair is handed to
    getTweetCount, which yields the tweets obtained, the tweets accessed and
    the tweet file name for that pair.

    Parameters:
        contentsArr: list of log lines (e.g. the result of getFileContents).

    Returns:
        (allObtained, allAccessed, files): three parallel lists with one
        entry per complete pair of lines. An empty input gives three empty
        lists.
    '''
    allObtained = []
    allAccessed = []
    files = []

    # Stop one short of the end so that a trailing, unpaired line (an odd
    # number of lines in the log) is skipped instead of raising IndexError
    # on contentsArr[i + 1].
    for i in range(0, len(contentsArr) - 1, 2):
        obtained, accessed, fileName = getTweetCount(contentsArr[i], contentsArr[i + 1])
        allObtained.append(obtained)
        allAccessed.append(accessed)
        files.append(fileName)

    return allObtained, allAccessed, files

In [4]:
basePath = '/students/jw10/cs315/NOT_FOR_USE/local-collection-logs'

# Running totals across every collection log, kept as three parallel lists.
totalTransportTweets, totalTweetsAccessed, totalFiles = [], [], []

for file in os.listdir(basePath):
    # The general collector log does not follow the same format as the
    # per-period collection logs, so it is left out.
    if file != 'general-collector-script.log':
        print(file)
        logArr = getFileContents(f'{basePath}/{file}')
        allObtained, allAccessed, files = getTweetAttainment(logArr)
        totalTransportTweets += allObtained
        totalTweetsAccessed += allAccessed
        totalFiles += files

collect-05142020-05172020.log
collect-08172020-08202020.log
collect-11182021-11212021.log
collect-04062020-04122020.log
collect-03282022-04032022.log
collect-10042021-10102021.log
collect-09212020-09232020.log
collect-10082020-10112020.log
collect-08122021-08152021.log
collect-03102022-03132022.log
collect-06072021-06132021.log
collect-02142022-02202022.log
collect-02032022-01062022.log
collect-03142022-03202022.log
collect-12272022-01022022.log
collect-09022021-09052021.log
collect-06102021-06132021.log
collect-10212021-10242021.log
collect-12232021-12262021.log
collect-07052021-07112021.log
collect-04232020-04262020.log
collect-10052020-10072020.log
collect-12062021-12122021.log
collect-08302021-09052021.log
collect-07302020-08012020.log
collect-12202021-12262021.log
collect-07292021-08012021.log
collect-03172022-03202022.log
collect-02172022-02202022.log
collect-09102020-09132020.log
collect-06242021-06272021.log
collect-03252020-03272020.log
collect-09072020-09092020.log
collect-03

In [5]:
# One row per tweet file: its name plus the two counts parsed from the logs.
df = pd.DataFrame(
    data=list(zip(totalFiles, totalTransportTweets, totalTweetsAccessed)),
    columns=['File', 'Tweets Obtained', 'Tweets Accessed'],
)
df

Unnamed: 0,File,Tweets Obtained,Tweets Accessed
0,covid-mobility-tweet-starting-2020-5-14~00:54:...,2,49
1,covid-mobility-tweet-starting-2020-5-14~01:18:...,1,43
2,covid-mobility-tweet-starting-2020-5-14~03:08:...,1,62
3,covid-mobility-tweet-starting-2020-5-14~04:25:...,1,54
4,covid-mobility-tweet-starting-2020-5-14~05:32:...,2,56
...,...,...,...
1808,covid-mobility-tweet-starting-2020-6-19~02:16:...,7,270
1809,covid-mobility-tweet-starting-2020-6-19~03:09:...,3,241
1810,covid-mobility-tweet-starting-2020-6-19~04:10:...,2,219
1811,covid-mobility-tweet-starting-2020-6-19~05:05:...,5,247


In [6]:
def getCOVIDURL(file):
    '''
    Builds the raw GitHub URL of the echen102/COVID-19-TweetIDs ID file that
    corresponds to a collected tweet file.

    Parameters:
        file: tweet file name shaped like
              'covid-mobility-tweet-starting-2020-5-15~03:50:03.jsonl', i.e.
              four dash-separated prefix words followed by
              'YYYY-M-D~hh:mm:ss.jsonl'.

    Returns:
        '<baseURL>/YYYY-MM/coronavirus-tweet-id-YYYY-MM-DD-hh.txt' with month,
        day and hour each written as two digits.

    Raises:
        IndexError or ValueError when the name does not follow that layout
        (too few dash-separated parts, no single '~', or a non-numeric
        month/day).
    '''
    baseURL = 'https://raw.githubusercontent.com/echen102/COVID-19-TweetIDs/master'
    fileArr = file.split('-')
    YYYY, MM, end = fileArr[4], fileArr[5], fileArr[6]
    dd, end = end.split('~')
    hh = end.split(':')[0]

    # Normalise to exactly two digits. Going through int() means an input
    # that is already zero-padded ('05') stays '05'; the previous manual
    # padding prepended another zero and produced '005'.
    MM_0 = f'{int(MM):02d}'
    dd_0 = f'{int(dd):02d}'
    # The hour was previously passed through untouched; pad it as well so a
    # single-digit hour still yields the two-digit form used in the URL.
    hh_0 = hh.zfill(2)

    return f'{baseURL}/{YYYY}-{MM_0}/coronavirus-tweet-id-{YYYY}-{MM_0}-{dd_0}-{hh_0}.txt'
    

# Show one full file name (row label 20) to check the naming pattern that getCOVIDURL parses.
print(df['File'][20])

covid-mobility-tweet-starting-2020-5-15~03:50:03.jsonl


In [7]:
# Hand-written example of a tweet-ID file URL, for visual comparison with the
# generated URL below. NOTE(review): `test` is not referenced by any other visible cell.
test = 'https://raw.githubusercontent.com/echen102/COVID-19-TweetIDs/master/2022-03/coronavirus-tweet-id-2022-03-01-00.txt'

# Spot-check the URL built for the sample file name printed earlier (row label 20).
getCOVIDURL(df['File'][20])

'https://raw.githubusercontent.com/echen102/COVID-19-TweetIDs/master/2020-05/coronavirus-tweet-id-2020-05-15-03.txt'

In [8]:
# Derive, for every tweet file, the URL of the matching tweet-ID list.
df['COVIDURL'] = df['File'].apply(getCOVIDURL)

In [10]:
def getNumberOfTweetIDsFrom(txtFile, timeout=30):
    '''
    Counts the tweet IDs listed in a remote text file.

    Parameters:
        txtFile: URL of the raw text file (requested from GitHub).
        timeout: seconds to wait for the server before giving up. Without a
                 timeout a single stalled connection would block the whole
                 DataFrame.apply indefinitely.

    Returns:
        Number of whitespace-separated tokens in the response body, or 0 when
        the response status is anything other than 200.

    Raises:
        Whatever requests.get raises on connection failure or timeout.
    '''
    response = requests.get(txtFile, timeout=timeout)  # requests raw file from Github

    # Best-effort: a missing or unavailable file counts as zero IDs.
    if response.status_code != 200:
        return 0

    content = response.content.decode('utf-8')
    return len(content.split())

# One HTTP request per row: counts the IDs in each row's tweet-ID file (0 when the request does not return 200).
df['Total Tweets'] = df['COVIDURL'].apply(getNumberOfTweetIDsFrom)

In [23]:
# The counts were sliced out of log text by getTweetCount, so they are strings; cast them before doing arithmetic.
df['Tweets Accessed'] = df['Tweets Accessed'].astype(int)
df['Tweets Obtained'] = df['Tweets Obtained'].astype(int)

In [24]:
# 'Tweets Accessed' as a percentage of 'Total Tweets'.
# NOTE(review): 'Total Tweets' is 0 for any row whose download did not return HTTP 200,
# which would make this ratio infinite -- confirm no such rows exist.
df['% Tweets Accessed'] = df['Tweets Accessed']/df['Total Tweets'] * 100
# 'Tweets Obtained' as a percentage of 'Tweets Accessed'.
df['% Tweets Obtained from Accessible'] = df['Tweets Obtained']/df['Tweets Accessed'] * 100

In [25]:
# Display the frame with the URL, total and percentage columns added.
df

Unnamed: 0,File,Tweets Obtained,Tweets Accessed,COVIDURL,Total Tweets,% Tweets Accessed,% Tweets Obtained from Accessible
0,covid-mobility-tweet-starting-2020-5-14~00:54:...,2,49,https://raw.githubusercontent.com/echen102/COV...,45851,0.106868,4.081633
1,covid-mobility-tweet-starting-2020-5-14~01:18:...,1,43,https://raw.githubusercontent.com/echen102/COV...,37560,0.114483,2.325581
2,covid-mobility-tweet-starting-2020-5-14~03:08:...,1,62,https://raw.githubusercontent.com/echen102/COV...,38373,0.161572,1.612903
3,covid-mobility-tweet-starting-2020-5-14~04:25:...,1,54,https://raw.githubusercontent.com/echen102/COV...,42411,0.127325,1.851852
4,covid-mobility-tweet-starting-2020-5-14~05:32:...,2,56,https://raw.githubusercontent.com/echen102/COV...,48382,0.115746,3.571429
...,...,...,...,...,...,...,...
1808,covid-mobility-tweet-starting-2020-6-19~02:16:...,7,270,https://raw.githubusercontent.com/echen102/COV...,179841,0.150133,2.592593
1809,covid-mobility-tweet-starting-2020-6-19~03:09:...,3,241,https://raw.githubusercontent.com/echen102/COV...,178864,0.134739,1.244813
1810,covid-mobility-tweet-starting-2020-6-19~04:10:...,2,219,https://raw.githubusercontent.com/echen102/COV...,175824,0.124556,0.913242
1811,covid-mobility-tweet-starting-2020-6-19~05:05:...,5,247,https://raw.githubusercontent.com/echen102/COV...,171328,0.144168,2.024291


In [26]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,Tweets Obtained,Tweets Accessed,Total Tweets,% Tweets Accessed,% Tweets Obtained from Accessible
count,1813.0,1813.0,1813.0,1813.0,1813.0
mean,3.971318,183.114175,97346.168229,0.187658,2.338414
std,3.152709,103.150101,45943.229419,0.066136,1.407694
min,1.0,17.0,12254.0,0.043435,0.277778
25%,2.0,102.0,56530.0,0.144069,1.363636
50%,3.0,170.0,87664.0,0.169059,2.057613
75%,5.0,242.0,132231.0,0.212153,2.941176
max,31.0,767.0,180357.0,0.58096,11.646586
