scrap.py
from bs4 import BeautifulSoup
import requests
import re
import json
import os
import validators
from dateutil import parser
from dateutil import tz
from datetime import datetime, timedelta
import pytz
from github import Github
from airtable import Airtable
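
# Third-party dependencies (assumed package names, for reference): beautifulsoup4,
# requests, validators, python-dateutil, pytz, PyGithub, and airtable-python-wrapper
# (which exposes `from airtable import Airtable`).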

def FindUrl(string):
    # extract every URL-looking substring from the given text
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    url = re.findall(regex, string)
    return [x[0] for x in url]
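
# Illustrative example (not part of the original script):
# FindUrl("see https://example.com/a and www.test.org/b") should return
# ['https://example.com/a', 'www.test.org/b'].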

# send a browser-like User-Agent with every request
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})

def GetClubhouse(url):
    print(url)
    page = requests.get(url, headers=headers, allow_redirects=False)
    print(page.status_code)
    if page.status_code == 200:
        soup = BeautifulSoup(page.content, "html.parser")
        # title
        title = soup.find("meta", property="og:title")["content"]
        # event url
        url = soup.find("meta", property="og:url")["content"]
        # date
        PACIFIC = tz.gettz('America/Los_Angeles')
        to_zone = tz.gettz('Europe/Warsaw')
        timezone_info = {"PST": PACIFIC, "PDT": PACIFIC}
        date = soup.find('div', class_='ml-1')
        date = re.sub(' +', ' ', date.text.replace('\n', ''))
        date = date.replace('(', '')
        date = date.replace(')', '')
        date = parser.parse(date, tzinfos=timezone_info)
        date = date.astimezone(to_zone)
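        # Illustrative: for page text like "Sunday, February 7 6:00 PM PST" (hypothetical value),
        # parser.parse(..., tzinfos=timezone_info) resolves the "PST"/"PDT" abbreviation to
        # America/Los_Angeles, and astimezone(to_zone) converts the result to Europe/Warsaw.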
        # speakers name
        speakers = soup.find('div', class_='text-sm font-thin mt-2').find('em')
        speakers = [x.strip() for x in speakers.text.replace('w/', '').split(',')]
        # avatars
        results = soup.find_all('div', class_='px-1')
        avatar_img_urls = FindUrl(str(results))
        # description
        details = soup.find('div', class_='text-sm font-thin mt-2')
        description = details.text.replace(details.find('em').text, '').replace('—', '').strip()
        event = {"title": title, "url": url, "date": date.isoformat(), "speakers": speakers, "avatars": avatar_img_urls, "description": description}
        return event
    else:
        return None

## main ;-)
urls = []
# collect candidate event URLs from the titles of open GitHub issues...
g = Github(os.getenv('GITHUB_TOKEN'))
repo = g.get_repo("kaluzaaa/clubhouse-calendar")
open_issues = repo.get_issues(state='open')
for issue in open_issues:
    if validators.url(issue.title):
        urls.append(issue.title)
# ...and from the URL column of the Airtable 'Auditions' table
airtable = Airtable(os.getenv('AT_BASE_ID'), 'Auditions', os.getenv('AT_API_KEY'))
for item in airtable.get_all(fields='URL'):
    urls.append(item['fields']['URL'])
# drop duplicates while keeping the original order
urls = list(dict.fromkeys(urls))
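# Illustrative example (not part of the original script): list(dict.fromkeys(['a', 'b', 'a']))
# evaluates to ['a', 'b'], i.e. duplicates are removed while insertion order is preserved.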
# scrape every collected URL; pages that do not return HTTP 200 are skipped
events = []
for url in urls:
    event = GetClubhouse(url)
    if event is not None:
        events.append(event)
# write the scraped events to _data/events.json
with open('_data/events.json', 'w') as outfile:
    json.dump(events, outfile, ensure_ascii=False, indent=2)
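# Illustrative shape of _data/events.json (field values are hypothetical placeholders):
# [
#   {
#     "title": "...",
#     "url": "...",
#     "date": "2021-02-07T20:00:00+01:00",
#     "speakers": ["...", "..."],
#     "avatars": ["...", "..."],
#     "description": "..."
#   }
# ]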
# upsert each event into Airtable, matching existing rows on the URL column
for event in events:
    update = airtable.update_by_field('URL', event['url'], {'Hosts': ', '.join(event['speakers']), 'Description': event['description'], 'Audition Name': event['title'], 'Time': event['date']})
    if not update:
        insert = airtable.insert({'URL': event['url'], 'Hosts': ', '.join(event['speakers']), 'Description': event['description'], 'Audition Name': event['title'], 'Time': event['date']})
        print('Insert: ', insert)
    else:
        print('Update: ', update)
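# The same field mapping is built twice above; an equivalent sketch (assuming the same
# airtable-python-wrapper calls used by this script) would be:
#
#     fields = {'Hosts': ', '.join(event['speakers']),
#               'Description': event['description'],
#               'Audition Name': event['title'],
#               'Time': event['date']}
#     update = airtable.update_by_field('URL', event['url'], fields)
#     if not update:
#         print('Insert: ', airtable.insert({'URL': event['url'], **fields}))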
# close GitHub issues for events that started more than 4 hours ago
now = datetime.now(pytz.utc)
for issue in open_issues:
    if validators.url(issue.title):
        for event in events:
            if event['url'] == issue.title:
                if parser.parse(event['date']) + timedelta(hours=4) < now:
                    issue.edit(state='closed')