In [61]:
import time
import datetime as dt
import pandas as pd
import json
from selenium.webdriver import Chrome
from selenium.webdriver.common.action_chains import ActionChains

In [105]:
browser = Chrome()
url = 'https://web.archive.org/web/20190617005258/https://old.reddit.com/r/worldnews/'
browser.get(url)

In [9]:
headline_elements = browser.find_elements_by_css_selector('p.title > a')[1:]
vote_elements = browser.find_elements_by_css_selector('div.midcol.unvoted > div.score.unvoted')[1:]
comment_elements = browser.find_elements_by_css_selector('li.first > a')
time_elements = browser.find_elements_by_css_selector('time')[1:]

In [45]:
headlines = [headline.text for headline in headline_elements]
votes = [vote.get_attribute('title') for vote in vote_elements]
comments = [comment.text for comment in comment_elements]
ages = [time.text for time in time_elements]
datetimes = [time.get_attribute('datetime') for time in time_elements]     
datetimes = [text.replace('T', ' ') for text in datetimes]     # Removing 'T' in date time
datetimes = [text[:-6] for text in datetimes]       # Removing trailing milliseconds
datetimes = [datetime.datetime.strptime(text, '%Y-%m-%d %H:%M:%S') 
                                 for text in datetimes]   #  coerce to dt object

In [46]:
pd.DataFrame([headlines, votes, comments, ages, datetimes]).T

Unnamed: 0,0,1,2,3,4
0,Porn Sites in Hong Kong Shut Down to Encourage...,48936,1359 comments,4 hours ago,2019-06-16 20:01:38
1,Massive power cut hits all of Argentina and Ur...,25546,1183 comments,13 hours ago,2019-06-16 11:19:45
2,The UK has now committed to the most aggressiv...,8674,514 comments,12 hours ago,2019-06-16 12:44:06
3,Japan demands more proof from U.S. that Iran a...,19052,1876 comments,15 hours ago,2019-06-16 09:02:32
4,Hongkongers march in their thousands against e...,40652,1668 comments,16 hours ago,2019-06-16 08:01:32
5,Hong Kong protesters coordinate tech-savvy eff...,2685,163 comments,11 hours ago,2019-06-16 13:42:30
6,Boris Johnson failed to protect biodiversity h...,3797,181 comments,13 hours ago,2019-06-16 11:39:06
7,Many fliers say they will avoid Boeing’s 737 M...,1990,583 comments,10 hours ago,2019-06-16 14:10:14
8,Nearly 43% of the new members of the India's l...,1588,70 comments,10 hours ago,2019-06-16 13:55:22
9,Boeing CEO concedes 'mistake' with planes in 2...,958,140 comments,8 hours ago,2019-06-16 16:36:28


### This is promising, but lets go ahead and transform date time to 'day of week' and 'hour of day'

In [47]:
type(datetimes[0])

datetime.datetime

In [52]:
test_dummy = datetimes[0]

In [58]:
print(datetimes[0])
datetimes[0].weekday()    # June 16th is sunday, wkday nums appear to be 0-6, mon-sun

2019-06-16 20:01:38


6

In [59]:
days_of_week = [datetime.weekday() for datetime in datetimes]
hours_posted = [datetime.hour for datetime in datetimes]

In [60]:
pd.DataFrame([days_of_week, hours_posted]).T

Unnamed: 0,0,1
0,6,20
1,6,11
2,6,12
3,6,9
4,6,8
5,6,13
6,6,11
7,6,14
8,6,13
9,6,16


# SUCCESS - Successfully extracted weekday and hour of post

In [83]:
temp_json_dummy_list = []
for headline, vote, comment, age, weekday, hour in zip(
                                            headlines, votes, comments,
                                            ages, days_of_week, hours_posted):
    temp_json_dummy_list.append({'text':headline,
           'score': vote,
           'comment_count': comment,
           'age': age,
           'weekday': weekday,
           'hour_posted': hour})

In [84]:
len(temp_json_dummy_list)

25

In [82]:
for line in temp_json_dummy_list[:5]:
    print(line, '\n')

{'text': 'Porn Sites in Hong Kong Shut Down to Encourage People to Protest', 'score': '48936', 'comment_count': '1359 comments', 'age': '4 hours ago', 'weekday': 6, 'hour_posted': 20} 

{'text': 'Massive power cut hits all of Argentina and Uruguay - 50 million people without electricity', 'score': '25546', 'comment_count': '1183 comments', 'age': '13 hours ago', 'weekday': 6, 'hour_posted': 11} 

{'text': 'The UK has now committed to the most aggressive climate target in the world: The country also went two whole weeks without burning coal for electricity.', 'score': '8674', 'comment_count': '514 comments', 'age': '12 hours ago', 'weekday': 6, 'hour_posted': 12} 

{'text': 'Japan demands more proof from U.S. that Iran attacked tankers', 'score': '19052', 'comment_count': '1876 comments', 'age': '15 hours ago', 'weekday': 6, 'hour_posted': 9} 

{'text': 'Hongkongers march in their thousands against extradition bill again, calling for leader to resign', 'score': '40652', 'comment_count':

In [100]:
with open('test.json', 'w') as f:
    json.dump(temp_json_dummy_list[0], fp=f)

In [101]:
!head test.json

{"text": "Porn Sites in Hong Kong Shut Down to Encourage People to Protest", "score": "48936", "comment_count": "1359 comments", "age": "4 hours ago", "weekday": 6, "hour_posted": 20}

In [102]:
with open('test.json', 'a') as f:
    for line in temp_json_dummy_list:
        json.dump(line, f)

In [104]:
!head test.json

{"text": "Porn Sites in Hong Kong Shut Down to Encourage People to Protest", "score": "48936", "comment_count": "1359 comments", "age": "4 hours ago", "weekday": 6, "hour_posted": 20}{"text": "Porn Sites in Hong Kong Shut Down to Encourage People to Protest", "score": "48936", "comment_count": "1359 comments", "age": "4 hours ago", "weekday": 6, "hour_posted": 20}{"text": "Massive power cut hits all of Argentina and Uruguay - 50 million people without electricity", "score": "25546", "comment_count": "1183 comments", "age": "13 hours ago", "weekday": 6, "hour_posted": 11}{"text": "The UK has now committed to the most aggressive climate target in the world: The country also went two whole weeks without burning coal for electricity.", "score": "8674", "comment_count": "514 comments", "age": "12 hours ago", "weekday": 6, "hour_posted": 12}{"text": "Japan demands more proof from U.S. that Iran attacked tankers", "score": "19052", "comment_count": "1876 comments", "age": "15 hours ago", "wee

In [141]:
# pd.read_json(open('test.json', 'r', encoding='utf8'),
#              lines=True)      # Chucks an error ValueError: Unexpected character found when decoding array value (2)

# STALLED OUT LETS TRY CSV

In [146]:
temp_csv = ['text, score, comment_count, age, day_of_week, hour_posted,\n']
for headline, vote, comment, age, weekday, hour in zip(
                                            headlines, votes, comments,
                                            ages, days_of_week, hours_posted):
    temp_csv.append(f"""{headline}, {vote}, {comment}, {age}, {weekday}, {hour}\n""")

In [149]:
with open('test.csv', 'w') as f:
    for line in temp_csv:
        f.write(line)

In [151]:
!vd test.csv

[23B[m[37m[40m[1mtest| saul.pw/VisiData v1.5.2 | opening test as csv | Ctrl+H [m[37m[40m [m[37m[40m[1m         0   0%[m[32m[40m [m[37m[40m[1m [24;63H[m[m[37m[40m[H[C[m[37m[40m[1m[7m[4mtext[1;21H[m[34m[40m|[m[37m[40m[1m[4m score              [m[34m[40m|[m[37m[40m[1m[4m comment_count      [m[34m[40m|[m[37m[40m[1m[4m age           [m[34m[40m>
[m[37m[40m[1m[7m Porn Sites in Hong…[m[m[37m[40m[7m| 48936              | 1359 comments      | 4 hours ago    
[m[m[37m[40m[1m Massive power cut …[m[m[37m[40m| 25546[3;42H| 1183 comments[6C| 13 hours ago
[m[37m[40m[1m The UK has now com…[m[m[37m[40m| 8674[4;42H| 514 comments[7C| 12 hours ago
[m[37m[40m[1m Japan demands more…[m[m[37m[40m| 19052[5;42H| 1876 comments[6C| 15 hours ago
[m[37m[40m[1m Hongkongers march …[m[m[37m[40m| calling for leader…| 40652[6;63H| 1668 comments
[m[37m[40m[1m Hong Kong proteste…[m[m[37m[40m| 2685[7;42H| 1

### Appears to have got caught up on commas in the text, lets try tsv

In [147]:
temp_tsv = ['text\tscore\tcomment_count\tage\tday_of_week\thour_posted\t\n']
for headline, vote, comment, age, weekday, hour in zip(
                                            headlines, votes, comments,
                                            ages, days_of_week, hours_posted):
    temp_tsv.append(f"""{headline}\t{vote}\t{comment}\t{age}\t{weekday}\t{hour}\n""")

In [148]:
for line in temp_tsv:
    print(line)

text	score	comment_count	age	day_of_week	hour_posted	

Porn Sites in Hong Kong Shut Down to Encourage People to Protest	48936	1359 comments	4 hours ago	6	20

Massive power cut hits all of Argentina and Uruguay - 50 million people without electricity	25546	1183 comments	13 hours ago	6	11

The UK has now committed to the most aggressive climate target in the world: The country also went two whole weeks without burning coal for electricity.	8674	514 comments	12 hours ago	6	12

Japan demands more proof from U.S. that Iran attacked tankers	19052	1876 comments	15 hours ago	6	9

Hongkongers march in their thousands against extradition bill again, calling for leader to resign	40652	1668 comments	16 hours ago	6	8

Hong Kong protesters coordinate tech-savvy effort to beat Chinese state surveillance	2685	163 comments	11 hours ago	6	13

Boris Johnson failed to protect biodiversity hotspot, says UN expert - “Unfortunately, I wasn’t able to get Boris to protect Georgia and the Sandwich Islands. Just

In [144]:
with open('test.tsv', 'w') as f:
    for line in temp_tsv:
        f.write(line)

In [152]:
!vd test.tsv

[23B[m[37m[40m[1mtest| saul.pw/VisiData v1.5.2 | opening test as tsv | Ctrl+H [m[37m[40m [m[37m[40m[1m         0   0%[m[32m[40m [m[37m[40m[1m…[24;63H[m[m[37m[40m[H[C[m[37m[40m[1m[7m[4mtext[1;21H[m[34m[40m|[m[37m[40m[1m[4m score   [m[34m[40m|[m[37m[40m[1m[4m comment_count   [m[34m[40m|[m[37m[40m[1m[4m age            [m[34m[40m|[m[37m[40m[1m[4m day_of_week[m[34m[40m>
[m[37m[40m[1m[7m Porn Sites in Hong…[m[m[37m[40m[7m| 48936   | 1359 comments   | 4 hours ago    | 6           
[m[m[37m[40m[1m Massive power cut …[m[m[37m[40m| 25546   | 1183 comments   | 13 hours ago   | 6
[m[37m[40m[1m The UK has now com…[m[m[37m[40m| 8674    | 514 comments    | 12 hours ago   | 6
[m[37m[40m[1m Japan demands more…[m[m[37m[40m| 19052   | 1876 comments   | 15 hours ago   | 6
[m[37m[40m[1m Hongkongers march …[m[m[37m[40m| 40652   | 1668 comments   | 16 hours ago   | 6
[m[37m[40m[1m Hong Kong prot