In [117]:
import requests
from pprint import pprint
import pickle
from tqdm import tqdm
import re

In [175]:
def get_info(name = "Kanye West"):
    info = {}

    url = 'https://query.wikidata.org/sparql'
    query = """
        SELECT DISTINCT ?item ?birthLocation ?birthLocationLabel ?Twitter_username ?Instagram_username ?occupation ?occupationLabel ?date_of_birth ?sex_or_gender ?sex_or_genderLabel ?country_of_citizenship ?country_of_citizenshipLabel ?image 
        WHERE {
            """ + f'?item rdfs:label "{name}"@en ;' + """ 
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }

            OPTIONAL { ?item wdt:P19   ?birthLocation. }
            OPTIONAL { ?item wdt:P2002 ?Twitter_username. }
            OPTIONAL { ?item wdt:P2003 ?Instagram_username. }
            OPTIONAL { ?item wdt:P106  ?occupation. }
            OPTIONAL { ?item wdt:P569  ?date_of_birth. }
            OPTIONAL { ?item wdt:P21   ?sex_or_gender. }
            OPTIONAL { ?item wdt:P27   ?country_of_citizenship. }
            OPTIONAL { ?item wdt:P18   ?image. }
        }
    """
    r = requests.get(url, params = {'format': 'json', 'query': query})
    data = r.json()  

    if len(data['results']['bindings']) == 0:
        raise Exception(f"No results for {name}")

    b = data['results']['bindings']
    person = set([x['item']['value'] for x in b])

    if len(person) > 1:
        raise Exception(f"Multiple person results for {name}")

    optionals = ['Twitter_username', 'Instagram_username', 'occupation', 'date_of_birth', 'sex_or_gender', 'country_of_citizenship', 'image', 'birthLocation', 'birthLocationLabel']
    info = {}
    for opt in optionals:
        all_values = list(set([res.get(opt)['value'] for res in b if opt in res]))
        info[opt] = all_values

    info['personUri'] = list(person)[0]
    info['person'] = name

    return info

In [66]:
CACHE = "./jre-episodes.pickle"
with open(CACHE, "rb") as f:
    episodes = pickle.load(f)

In [111]:
names = []
for ep in episodes:
    # name = re.findall(r"- ([A-Z].*)", ep.title)
    name = re.findall(r"#\d\d?\d?\d? ?-?-? .*?([A-Z].*)", ep.title)
    skip_best = "Best of " in ep.title
    skip_toon = "JRE Toon" in ep.title
    skip_fight_companion = "Fight Companion" in ep.title
    skip_mma = "JRE MMA Show" in ep.title
    skip_from_jre = "from Joe Rogan" in ep.title or "from JRE" in ep.title
    skip_questions = "Questions Everything" in ep.title
    if skip_best or skip_toon or skip_fight_companion or skip_from_jre or skip_mma or skip_questions:
        # print(ep.title)
        # "from Joe Rogan Experience" not in ep.title
        continue
    elif len(name) == 0:
        # NOT CAPTURED
        # print(ep.title)
        continue
    else:
        name = name[0]
        if not ep.number:
            continue
        
        # if "Part 2" in name:
        #     print(name)
        # else:
        #     # print(ep.title)
        names.append((ep, name))

nums = [e.number for e, n in names]
for i in range(1539):
    if i+1 not in nums:
        print("missing", i+1)

missing 4
missing 8
missing 10
missing 22
missing 23
missing 32
missing 36
missing 41
missing 42
missing 43
missing 48
missing 49
missing 50
missing 51
missing 52
missing 56
missing 57
missing 60
missing 61
missing 62
missing 63
missing 64
missing 65
missing 66
missing 67
missing 68
missing 69
missing 70
missing 78
missing 81
missing 85
missing 86
missing 92
missing 97
missing 99
missing 104
missing 108
missing 120
missing 128
missing 149
missing 160
missing 164
missing 168
missing 169
missing 174
missing 182
missing 183
missing 191
missing 195
missing 198
missing 210
missing 213
missing 223
missing 225
missing 229
missing 230
missing 233
missing 238
missing 242
missing 243
missing 247
missing 248
missing 257
missing 265
missing 287
missing 294
missing 296
missing 302
missing 303
missing 306
missing 307
missing 309
missing 311
missing 313
missing 315
missing 318
missing 320
missing 326
missing 327
missing 341
missing 342
missing 343
missing 353
missing 364
missing 370
missing 378
missi

In [177]:
guest_data = []

In [196]:
import time
errors = []

# Super hacky but populate guest data 
# with some basic info like ig and twitter username, and the eps they were in

for ep, name_ in tqdm(names):
    # print(ep.description)
    names2 = [name_]
    if "," in name_:
        names2 = name_.split(",")
    elif "&" in name_:
        names2 = name_.split("&")

    for name in names2:
        hits = [g for g in guest_data if g['person'].lower() == name.lower()]
        if len(hits) > 0:
            if hits[0].get('episodes') is None:
                hits[0]['episodes'] = []
            hits[0]['episodes'] = hits[0].get('episodes', []).append(ep.video_id)
            continue
        try:
            time.sleep(.75)
            info = get_info(str(name).strip())
            info['episodes'] = [ep.video_id]
            guest_data.append(info)
        except Exception as e:
            print(name, e)
            errors.append((name, e))

|███████▎  | 1053/1438 [11:02<05:18,  1.21it/s]David Seaman Multiple person results for David Seaman
Brendan Schaub  Expecting value: line 1 column 1 (char 0)
 74%|███████▎  | 1057/1438 [11:05<05:52,  1.08it/s]Mark Kendall Multiple person results for Mark Kendall
Rob MacCachren  Expecting value: line 1 column 1 (char 0)
 74%|███████▍  | 1061/1438 [11:09<06:33,  1.04s/it] Josh Wickerham Expecting value: line 1 column 1 (char 0)
 74%|███████▍  | 1062/1438 [11:10<06:34,  1.05s/it]Honey Honey Multiple person results for Honey Honey
 74%|███████▍  | 1065/1438 [11:11<05:17,  1.18it/s]Jim Jefferies Multiple person results for Jim Jefferies
 74%|███████▍  | 1066/1438 [11:12<05:34,  1.11it/s]Shane Smith Multiple person results for Shane Smith
 74%|███████▍  | 1068/1438 [11:13<04:41,  1.31it/s]Dr. Carl Hart Expecting value: line 1 column 1 (char 0)
 74%|███████▍  | 1069/1438 [11:14<05:17,  1.16it/s] Christopher Ryan Multiple person results for Christopher Ryan
 74%|███████▍  | 1070/1438 [11:15<0

In [198]:
import json
with open("../data/jre/guests.json", "w") as f:
    f.write(json.dumps(guest_data))

In [199]:
# len(set([n for e, n in names]))
# len(errors), len(guest_data)
errors[:100]

# get_info(str('Duncan Trussell'))

[('Douglas Murray', Exception('Multiple person results for Douglas Murray')),
 ('Edward Snowden', Exception('Multiple person results for Edward Snowden')),
 ('Tim Kennedy', Exception('Multiple person results for Tim Kennedy')),
 ('Ron White', Exception('Multiple person results for Ron White')),
 ('Adam Curry', Exception('Multiple person results for Adam Curry')),
 ('Mike Tyson', Exception('Multiple person results for Mike Tyson')),
 ('Ali Macofsky', Exception('No results for Ali Macofsky')),
 ('Tim Dillon', Exception('Multiple person results for Tim Dillon')),
 (' Brian Redban', Exception('No results for Brian Redban')),
 ('Rob Lowe', Exception('Multiple person results for Rob Lowe')),
 ('Josh Dubin ', Exception('No results for Josh Dubin')),
 ('Dr. Debra Soh', Exception('No results for Dr. Debra Soh')),
 ('Mike Baker', Exception('Multiple person results for Mike Baker')),
 ('Nancy Panza', Exception('No results for Nancy Panza')),
 ('Post Malone', Exception('Multiple person results for

In [166]:
x = {}
x.get('value', {'test':[]})['test']
x['a'] = set(['ets', 'st'])
x
# di = {'value':[]}

{'a': {'ets', 'st'}}