# Data collection - Wikipeople

In [1]:
from wikipeople import wikipeople as wp
import pandas as pd
import json
import re

## Retrieve guest data from Wikidata with wikipeople

In [2]:
def get_guest_info(guest):
    """Look up basic biographical facts about a guest on Wikidata.

    The guest name is resolved to a Wikidata entity with
    ``wp.search_wikidata`` and four facts are then fetched for that entity.
    The lookup is best-effort: anything that cannot be retrieved is reported
    as ``None`` rather than raising, so one bad guest cannot abort a long
    scraping run.

    Parameters
    ----------
    guest : str
        Name to search for. Blank or non-string values are not searched.

    Returns
    -------
    dict
        Always contains exactly the keys ``'gender'``, ``'career'``,
        ``'birth'`` and ``'country'``. A value is ``None`` when the guest was
        not found or that particular lookup failed.
    """
    # Start from a complete record so every return path has the same keys.
    wiki_info = {'gender': None, 'career': None, 'birth': None, 'country': None}

    # Nothing to search for (e.g. a title that cleaned down to an empty name).
    if not isinstance(guest, str) or not guest.strip():
        return wiki_info

    try:
        wikidata_id = wp.search_wikidata(guest)
    except Exception:
        # Search failed (network error, malformed response, ...): no data.
        return wiki_info

    if not wikidata_id:
        # The search ran but returned no matching entity.
        return wiki_info

    # Wikidata properties: P21 = sex or gender, P106 = occupation,
    # P27 = country of citizenship. Element [1] of get_property's result is
    # kept; judging by the values stored downstream ("male", "actor", ...)
    # it is the human-readable label.
    lookups = {
        'gender': lambda: wp.get_property(wikidata_id, 'P21')[1],
        'career': lambda: wp.get_property(wikidata_id, 'P106')[1],
        'birth': lambda: wp.get_date_of_birth(wikidata_id),
        'country': lambda: wp.get_property(wikidata_id, 'P27')[1],
    }
    for field, fetch in lookups.items():
        try:
            wiki_info[field] = fetch()
        except Exception:
            # Keep the other fields even if this one lookup fails.
            wiki_info[field] = None

    return wiki_info

In [3]:
# Load the episode/guest table, using the CSV column that pandas labels
# 'Unnamed: 0' (a header-less column) as the row index.
df = pd.read_csv("Guests.csv", index_col='Unnamed: 0')

In [4]:
def clean_guest_name(name_list):
    """Reduce raw episode titles to bare guest names suitable for searching.

    For each entry the function keeps the text before the first comma,
    removes a fixed set of titles/boilerplate, strips digits, turns hyphens
    into spaces, drops ``(``, ``)`` and ``/``, and finally collapses every
    run of whitespace to a single space.

    Parameters
    ----------
    name_list : iterable
        Raw title strings (e.g. a pandas Series). Non-string entries such as
        NaN or None are treated as missing.

    Returns
    -------
    list of str
        One cleaned name per input entry, in the same order. Entries that
        are missing, or that contain nothing but removable text, become
        ``""``.
    """
    # Titles and boilerplate to remove. Matching is case-sensitive and the
    # text is removed wherever it occurs in the name, not only at the start.
    prefixes = ["Dr ", "Rt Hon ", "Rt Hon. ", "Sir ", "Prof ", "Professor ", "Commander ", "Commissioner ", "Desert Island Discs"]

    clean_list = []
    for name in name_list:
        # Missing values (NaN/None) have no string methods; emit an empty name.
        if not isinstance(name, str):
            clean_list.append("")
            continue

        # Only keep the text before the first comma
        name = name.split(',')[0].strip()
        # Remove specific prefixes
        for prefix in prefixes:
            name = name.replace(prefix, "")
        # Remove numbers
        name = re.sub(r'\d+', '', name)
        # Replace "-" with " "
        name = name.replace("-", " ")
        # Remove "(", ")" and "/"
        name = name.replace("(", "").replace(")", "").replace("/", "")
        # The removals above can leave gaps ("A - B" -> "A   B"); collapse
        # them and trim leading/trailing whitespace in one step.
        name = " ".join(name.split())

        clean_list.append(name)

    return clean_list


In [5]:
# Add a `name` column holding the cleaned guest name derived from the raw
# `guests` text, then display the resulting frame.
df['name'] = clean_guest_name(df['guests'])
df

Unnamed: 0,time,duration,book,luxury,favourite,availibility,number,guests,links,index,year,name
0,2023-06-04,42 minutes,,,,False,0,04/06/2023,https://www.bbc.co.uk/programmes/m001mly1,0,2023,
1,2023-05-28,35 minutes,,,,False,0,"Ronnie O'Sullivan, snooker player",https://www.bbc.co.uk/programmes/m001mc30,1,2023,Ronnie O'Sullivan
2,2023-05-26,43 minutes,,,,False,8,Desert Island Discs - Professor Sharon Peacock...,https://www.bbc.co.uk/programmes/m001m4nx,2,2023,Sharon Peacock
3,2023-05-19,44 minutes,,,,False,8,"Simon Pegg, actor",https://www.bbc.co.uk/programmes/m001ly7j,3,2023,Simon Pegg
4,2023-05-12,37 minutes,,,,False,8,"Professor Peter Hennessy, historian",https://www.bbc.co.uk/programmes/m001lr3k,4,2023,Peter Hennessy
...,...,...,...,...,...,...,...,...,...,...,...,...
3355,1942-02-26,30 minutes,,,,False,7,Pat Kirkwood,https://www.bbc.co.uk/programmes/p009y0n8,3355,1942,Pat Kirkwood
3356,1942-02-19,30 minutes,,,,False,8,C B Cochran,https://www.bbc.co.uk/programmes/p009y0nd,3356,1942,C B Cochran
3357,1942-02-12,30 minutes,,,,False,8,Commander Campbell,https://www.bbc.co.uk/programmes/p009y0nh,3357,1942,Campbell
3358,1942-02-05,30 minutes,,,,False,7,James Agate,https://www.bbc.co.uk/programmes/p009y0nl,3358,1942,James Agate


In [None]:
# Number of guests to look up. None processes every row of `df`, which is what
# the cells below expect when they read wiki_data.json back and merge it with
# the full table. Set a small integer (e.g. 5) for a quick smoke test.
MAX_GUESTS = None

guests_to_fetch = df.iloc[:MAX_GUESTS]

# Write one JSON object per line (JSON Lines). Opening with 'w' replaces any
# existing wiki_data.json, and each record is written as soon as it is fetched.
with open('wiki_data.json', 'w') as file:
    for guest, index in zip(guests_to_fetch['name'], guests_to_fetch['index']):
        print(f"Processing {guest}")

        # `index` is kept in each record so it can be joined back onto `df`.
        data = {'guests': guest, 'index': index}
        data.update(get_guest_info(guest))

        file.write(json.dumps(data))
        file.write('\n')
        print(f"Data {index} written to file")

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
Processing Carl Djerassi
Data 860 written to file
Processing Paul Gambaccini
Data 861 written to file
Processing Philip Pullman
Data 862 written to file
Processing Dame Alicia Markova
Data 863 written to file
Processing Timothy Spall
Data 864 written to file
Processing Brian May
Data 865 written to file
Processing Alan Titchmarsh
Data 866 written to file
Processing Minette Walters
Data 867 written to file
Processing Countess Elizabeth Longford
Data 868 written to file
Processing Jan Morris
Data 869 written to file
Processing Leonard Rosoman
Data 870 written to file
Processing Sue Johnston
Data 871 written to file
Processing Suggs
Data 872 written to file
Processing Aaron Klug
Data 873 written to file
Processing Jude Kelly
Data 874 written to file
Processing Betty Jackson
Data 875 written to file
Processing Wayne Marshall
Data 876 written to file
Processing Christopher Bland
Data 877 written to file
Processing Fiona Reynolds
Data 878 written to f

## Overview of the Wikipeople data

In [6]:
# Read the scraped records back from the JSON Lines file (one JSON object per
# line, hence lines=True) and display them.
df_update = pd.read_json('wiki_data.json', lines = True)
df_update

Unnamed: 0,guests,index,gender,career,birth,country
0,,0,,,,
1,Ronnie O'Sullivan,1,male,pool player,+1975-12-05,United Kingdom
2,Sharon Peacock,2,female,microbiologist,+1959-03-24,
3,Simon Pegg,3,male,actor,+1970-02-14,United Kingdom
4,Peter Hennessy,4,male,politician,+1947-03-28,United Kingdom
...,...,...,...,...,...,...
3355,Pat Kirkwood,3355,male,engineer,+1927-12-22,United States of America
3356,C B Cochran,3356,,,,
3357,Campbell,3357,,,,
3358,James Agate,3358,male,journalist,+1877-09-09,United Kingdom


In [7]:
# Attach the Wikidata fields to the episode table by matching rows on the
# shared `index` column. The scraped `guests` column is dropped first because
# `df` already carries its own `guests` column.
bbc_wiki = df.merge(df_update.drop(columns='guests'), on='index')

In [8]:
# Display the merged table.
bbc_wiki

Unnamed: 0,time,duration,book,luxury,favourite,availibility,number,guests,links,index,year,name,gender,career,birth,country
0,2023-06-04,42 minutes,,,,False,0,04/06/2023,https://www.bbc.co.uk/programmes/m001mly1,0,2023,,,,,
1,2023-05-28,35 minutes,,,,False,0,"Ronnie O'Sullivan, snooker player",https://www.bbc.co.uk/programmes/m001mc30,1,2023,Ronnie O'Sullivan,male,pool player,+1975-12-05,United Kingdom
2,2023-05-26,43 minutes,,,,False,8,Desert Island Discs - Professor Sharon Peacock...,https://www.bbc.co.uk/programmes/m001m4nx,2,2023,Sharon Peacock,female,microbiologist,+1959-03-24,
3,2023-05-19,44 minutes,,,,False,8,"Simon Pegg, actor",https://www.bbc.co.uk/programmes/m001ly7j,3,2023,Simon Pegg,male,actor,+1970-02-14,United Kingdom
4,2023-05-12,37 minutes,,,,False,8,"Professor Peter Hennessy, historian",https://www.bbc.co.uk/programmes/m001lr3k,4,2023,Peter Hennessy,male,politician,+1947-03-28,United Kingdom
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3355,1942-02-26,30 minutes,,,,False,7,Pat Kirkwood,https://www.bbc.co.uk/programmes/p009y0n8,3355,1942,Pat Kirkwood,male,engineer,+1927-12-22,United States of America
3356,1942-02-19,30 minutes,,,,False,8,C B Cochran,https://www.bbc.co.uk/programmes/p009y0nd,3356,1942,C B Cochran,,,,
3357,1942-02-12,30 minutes,,,,False,8,Commander Campbell,https://www.bbc.co.uk/programmes/p009y0nh,3357,1942,Campbell,,,,
3358,1942-02-05,30 minutes,,,,False,7,James Agate,https://www.bbc.co.uk/programmes/p009y0nl,3358,1942,James Agate,male,journalist,+1877-09-09,United Kingdom


In [9]:
# Remove the first case (the row whose `index` is 0) because this episode
# hasn't been broadcast. Selecting on the `index` column instead of slicing by
# position keeps this cell idempotent: re-running it does not drop a further
# episode each time.
bbc_wiki = bbc_wiki[bbc_wiki['index'] != 0]

In [10]:
# Save the merged table for later analysis.
bbc_wiki.to_csv("bbc_wiki.csv")