In [1]:
# import packages and configure pandas display

import pandas as pd
pd.options.display.max_rows = 999  # raise pandas' display row limit to 999
from bs4 import BeautifulSoup
import requests
import re

In [6]:
# put together everything from before

def _strip_venue_prefix(raw_venue):
    """Remove the leading "at" joiner from a venue label and trim whitespace."""
    # str.strip('at ') would remove any of the characters 'a', 't', ' ' from
    # BOTH ends (e.g. "at The Vault" -> "The Vaul"), so anchor on the prefix.
    return re.sub(r'^at\b\s*', '', raw_venue.strip())


def _parse_attendee_count(raw_text):
    """Return the first integer found in text like "1,234 Attending", else NaN."""
    match = re.search(r'\d[\d,]*', raw_text)
    if match is None:
        return float('nan')
    return int(match.group().replace(',', ''))


def scrape_events(events_page_url):
    """Scrape an events listing page into a DataFrame, one row per event.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Every
    ``<li>`` inside the element with ``id="items"`` is visited in document
    order:

    * a row containing an element whose class is ``eventDate date`` sets the
      "current date", which applies to the event rows that follow it;
    * a row containing an ``<h1 class="event-title">`` is an event; its name
      comes from the heading's first ``<a>``, its venue from the heading's
      first ``<span>`` (minus the leading "at"), and its attendee count from
      the row's ``<p class="attending">`` when present.

    Parameters
    ----------
    events_page_url : str
        URL of the listing page to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns ``Event_Name``, ``Venue``, ``Event_Date`` and
        ``Number_of_Attendees`` with a fresh 0..n-1 index. Missing values are
        real missing values (None/NaN), not the string ``'NaN'``;
        ``Number_of_Attendees`` is always float64.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no element with ``id="items"``.
    """
    # A timeout keeps a stalled server from hanging the notebook forever.
    page = requests.get(events_page_url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    container = soup.find(id='items')
    if container is None:
        raise ValueError(
            "no element with id 'items' found at {!r}".format(events_page_url)
        )

    records = []
    current_date = None  # stays None for events listed before any date row

    # Read every field from the row it belongs to, so names, venues and
    # attendee counts can never drift out of alignment with each other.
    for row in container.find_all('li'):
        date_tag = row.find(class_='eventDate date')
        if date_tag is not None:
            date_link = row.find('a')
            date_source = date_link if date_link is not None else date_tag
            # Here a character-set strip IS intended: trim spaces and slashes.
            current_date = date_source.get_text().strip(' /')

        title = row.find('h1', class_='event-title')
        if title is None:
            continue  # date headers and other non-event rows

        name_link = title.find('a')
        venue_span = title.find('span')
        attending = row.find('p', class_='attending')

        records.append({
            'Event_Name': (name_link if name_link is not None else title).get_text(),
            'Venue': (
                _strip_venue_prefix(venue_span.get_text())
                if venue_span is not None else None
            ),
            'Event_Date': current_date,
            'Number_of_Attendees': (
                _parse_attendee_count(attending.get_text())
                if attending is not None else float('nan')
            ),
        })

    df = pd.DataFrame(
        records,
        columns=["Event_Name", "Venue", "Event_Date", "Number_of_Attendees"],
    )
    # Keep one dtype whether or not any count is missing on this page.
    df["Number_of_Attendees"] = df["Number_of_Attendees"].astype('float64')

    return df

In [7]:
# Run the scraper on the listings page; the returned DataFrame is the cell's displayed output.
scrape_events('https://www.residentadvisor.net/events')

Unnamed: 0,Event_Name,Venue,Event_Date,Number_of_Attendees
0,Music Sans Frontiers,The Social,"Mon, 18 May 2020",1.0
1,Shy b2b Cat All Night Long,TBA - London,"Tue, 19 May 2020",1.0
2,Yves Tumor & Its Band,Electric Brixton,"Wed, 20 May 2020",25.0
3,Despacio,The Roundhouse,"Thu, 21 May 2020",17.0
4,Techno RE-Imagine:06,Canavan's Peckham Pool Club,"Thu, 21 May 2020",2.0
5,[POSTPONED] - Dispatch Recordings,E1 London,"Fri, 22 May 2020",99.0
6,"Human Traffic Live, Opening Night with Pete Tong",Printworks,"Fri, 22 May 2020",24.0
7,Katermukke Showcase: London,Night Tales,"Fri, 22 May 2020",8.0
8,The Fleetwood Mac Summer Rooftop Party 2020,The Prince of Wales,"Fri, 22 May 2020",2.0
9,All Points East 2020,Victoria Park,"Fri, 22 May 2020",96.0
