In [1]:
import requests
from requests.auth import HTTPDigestAuth
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Test fetching

In [17]:
# access API
url = "https://rails-prod.quivr.be/graphql"

# use authorisation through my account (TT)
headers = {'Authorization': 'Bearer eyJhY2Nlc3MtdG9rZW4iOiJieUJrb1o5VWNfZ2tDb2RINVZaOUtBIiwidG9rZW4tdHlwZSI6IkJlYXJlciIsImNsaWVudCI6Ii1XTFJoWVRUWWJzYkw5Ml9LTmJPdkEiLCJleHBpcnkiOiIyMDAwMDIwNzExIiwidWlkIjoidGFtYXMudHJvbWJpdGFzQGt1bGV1dmVuLmJlIn0=',
           'Access-Token': 'byBkoZ9Uc_gkCodH5VZ9KA',
}

# define event ID for test
body2 = {"operationName":"EventDetailed",
        "variables":{"id": 1707},
        # select info we need for filtering
        "query":"query EventDetailed($id: ID!) {eventSpecification(id: $id) {userData {id, title, startTime, endTime, url, tags {name}, locations {address, name, latitude, longitude}}}}"}
        
# use POST method to fetch data
response = requests.post(url, headers=headers, json=body2)
print(response.status_code)
print(response.text)

200
{"data":{"eventSpecification":null}}


# Final version of the scraper

In [None]:
# scraping on 21/05/2023 until event ID 1699
url = "https://rails-prod.quivr.be/graphql"
headers = {'Authorization': 'Bearer eyJhY2Nlc3MtdG9rZW4iOiJieUJrb1o5VWNfZ2tDb2RINVZaOUtBIiwidG9rZW4tdHlwZSI6IkJlYXJlciIsImNsaWVudCI6Ii1XTFJoWVRUWWJzYkw5Ml9LTmJPdkEiLCJleHBpcnkiOiIyMDAwMDIwNzExIiwidWlkIjoidGFtYXMudHJvbWJpdGFzQGt1bGV1dmVuLmJlIn0=',
           'Access-Token': 'byBkoZ9Uc_gkCodH5VZ9KA',
}

IDs = list(range(1, 1700))
results = pd.DataFrame()

for ID in IDs:
    body = {
        "operationName": "EventDetailed",
        "variables": {"id": ID},
        "query": "query EventDetailed($id: ID!) {eventSpecification(id: $id) {userData {id, title, startTime, endTime, url, tags {name}, locations {address, name, latitude, longitude}}}}"
    }
    
    response = requests.post(url, headers=headers, json=body)
    
    # error checks + parsing
    if response.status_code == 200:
        data = response.json()
        if data["data"]["eventSpecification"]!=None and data["data"]["eventSpecification"]["userData"]!=None:
            event_data = data["data"]["eventSpecification"]["userData"]
            
            # Extract the tags as a comma-separated string
            if len(data['data']['eventSpecification']['userData']['locations'])==0:
                tags = np.nan
            else:
                tags = ', '.join([tag['name'] for tag in event_data['tags']])
            # Extract the locations information
            locations = data['data']['eventSpecification']['userData']['locations']
            location_data = []
            # check if location is empty
            if len(locations)==0:
                print("empty locations")
                location_data.append({
                        'address': np.nan,
                        'name': np.nan,
                        'latitude': np.nan,
                        'longitude': np.nan
                    })
            else:
                for loc in locations:
                    location_data.append({
                        'address': loc['address'],
                        'name': loc['name'],
                        'latitude': loc['latitude'],
                        'longitude': loc['longitude']
                    })
            result = pd.DataFrame({
                'id': [data['data']['eventSpecification']['userData']['id']],
                'title': [data['data']['eventSpecification']['userData']['title']],
                'startTime': [data['data']['eventSpecification']['userData']['startTime']],
                'endTime': [data['data']['eventSpecification']['userData']['endTime']],
                'url': [data['data']['eventSpecification']['userData']['url']],
                'tags': [tags],
                'address': [loc['address'] for loc in location_data],
                'name': [loc['name'] for loc in location_data],
                'latitude': [loc['latitude'] for loc in location_data],
                'longitude': [loc['longitude'] for loc in location_data]
            })

            # concatenate results
            results = pd.concat([results, result], ignore_index=True)
            print(results.tail())
        else:
            print(f"Error fetching data for ID {ID}. EventSpecification=None")
    else:
        print(f"Error fetching data for ID {ID}. Status code: {response.status_code}")

# Print the DataFrame
print(results)
# save it to .csv
results.to_csv('raw_data_fetched.csv', sep='\t', index=False)


In [3]:
# scraping on 28/05/2023 until from event ID 1700 to 1705
url = "https://rails-prod.quivr.be/graphql"
headers = {'Authorization': 'Bearer eyJhY2Nlc3MtdG9rZW4iOiJieUJrb1o5VWNfZ2tDb2RINVZaOUtBIiwidG9rZW4tdHlwZSI6IkJlYXJlciIsImNsaWVudCI6Ii1XTFJoWVRUWWJzYkw5Ml9LTmJPdkEiLCJleHBpcnkiOiIyMDAwMDIwNzExIiwidWlkIjoidGFtYXMudHJvbWJpdGFzQGt1bGV1dmVuLmJlIn0=',
           'Access-Token': 'byBkoZ9Uc_gkCodH5VZ9KA',
}

IDs = list(range(1700, 1706))
results2 = pd.DataFrame()

for ID in IDs:
    body = {
        "operationName": "EventDetailed",
        "variables": {"id": ID},
        "query": "query EventDetailed($id: ID!) {eventSpecification(id: $id) {userData {id, title, startTime, endTime, url, tags {name}, locations {address, name, latitude, longitude}}}}"
    }
    
    response = requests.post(url, headers=headers, json=body)
    
    if response.status_code == 200:
        data = response.json()
        if data["data"]["eventSpecification"]!=None and data["data"]["eventSpecification"]["userData"]!=None:
            event_data = data["data"]["eventSpecification"]["userData"]
            
            # Extract the tags as a comma-separated string
            if len(data['data']['eventSpecification']['userData']['locations'])==0:
                tags = np.nan
            else:
                tags = ', '.join([tag['name'] for tag in event_data['tags']])
            # Extract the locations information
            locations = data['data']['eventSpecification']['userData']['locations']
            location_data = []
            if len(locations)==0:
                print("empty locations")
                location_data.append({
                        'address': np.nan,
                        'name': np.nan,
                        'latitude': np.nan,
                        'longitude': np.nan
                    })
            else:
                for loc in locations:
                    location_data.append({
                        'address': loc['address'],
                        'name': loc['name'],
                        'latitude': loc['latitude'],
                        'longitude': loc['longitude']
                    })
            result = pd.DataFrame({
                'id': [data['data']['eventSpecification']['userData']['id']],
                'title': [data['data']['eventSpecification']['userData']['title']],
                'startTime': [data['data']['eventSpecification']['userData']['startTime']],
                'endTime': [data['data']['eventSpecification']['userData']['endTime']],
                'url': [data['data']['eventSpecification']['userData']['url']],
                'tags': [tags],
                'address': [loc['address'] for loc in location_data],
                'name': [loc['name'] for loc in location_data],
                'latitude': [loc['latitude'] for loc in location_data],
                'longitude': [loc['longitude'] for loc in location_data]
            })

            # concatenate results
            results2 = pd.concat([results2, result], ignore_index=True)
            print(results2.tail())
        else:
            print(f"Error fetching data for ID {ID}. EventSpecification=None")
    else:
        print(f"Error fetching data for ID {ID}. Status code: {response.status_code}")

# Print the DataFrame
print(results2)
# save it to .csv
results2.to_csv('raw_data_fetched_2.csv', sep='\t', index=False)


     id                      title             startTime  \
0  1700  Closing Evening IFTf 2023  2023-05-24T17:45:30Z   

                endTime                                               url  \
0  2023-05-24T21:00:38Z  https://www.facebook.com/events/1308278623452852   

             tags                                 address  \
0  Culture, Party  Andreas Vesaliusstraat 11, 3000 Leuven   

                   name latitude longitude  
0  Aula Vesalius (VESA)     None      None  
     id                      title             startTime  \
0  1700  Closing Evening IFTf 2023  2023-05-24T17:45:30Z   
1  1701        Student Stuff Swap   2023-06-23T11:00:23Z   

                endTime                                               url  \
0  2023-05-24T21:00:38Z  https://www.facebook.com/events/1308278623452852   
1  2023-07-01T15:00:28Z                         https://fb.me/e/14Vhr7lgP   

                     tags                                 address  \
0          Culture, Party  An

# Shape and filter the data

In [None]:
# Convert 'startTime' column to datetime
results['startTime'] = pd.to_datetime(results['startTime'])
#results = pd.read_csv('raw_data_fetched.csv', sep='\t')

# Filter the DataFrame based on the condition
filter_date = results[(results['startTime'].dt.year >= 2022) & (results['startTime'].dt.year <= 2022)]

# Print the filtered DataFrame
print(filter_date.head())
filter_date.to_csv('filter_date.csv', sep='\t', index=False)
len(filter_date)

In [None]:
# Define the list of tags to filter
tags_to_filter = ['Cantus', 'Culture', 'First-year students', 'International', 'Party', 'Sport']

# Filter the DataFrame based on the condition
filter_date_tags = filter_date[filter_date['tags'].str.contains('|'.join(tags_to_filter), na=False)]

print(filter_date_tags.head())
len(filter_date_tags)
filter_date.to_csv('filter_date_tags.csv', sep='\t', index=False)

In [None]:
filter_date_tags = pd.read_csv('filter_date_tags.csv', sep='\t')
print(filter_date_tags.columns)
filter_date_tags['address'].unique()

In [None]:
# filter for location
def get_city_from_address(address):
    api_key = "AIzaSyDrmY-DrdEMZ8TnxuT84j9fqJy7DCireu8"
    api_url = f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={api_key}"

    # Send a GET request to the Geocoding API
    response = requests.get(api_url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        results = data.get('results', [])

        # Check if any results were found
        if len(results) > 0:
            city = None
            for component in results[0]['address_components']:
                if 'locality' in component['types']:
                    city = component['long_name']
                    break
            return city

    print("Error retrieving city for address:", address)
    return None

addresses = filter_date_tags['address']
filter_date_tags_city = filter_date_tags.copy()
filter_date_tags_city['city'] = ''

for i, address in enumerate(addresses):
    city = get_city_from_address(address)
    if city:
        filter_date_tags_city.at[i, 'city'] = city

print(filter_date_tags_city.head())

In [16]:
filter_date_tags_city.to_csv('filter_date_tags_city_raw.csv', sep='\t', index=False)
print(filter_date_tags_city['city'].unique())
filtered_city = filter_date_tags_city[filter_date_tags_city['city'] == 'Leuven']
print(filtered_city['city'].unique())

['Morzine' 'Vienna' 'Leuven' '' 'Oud-Heverlee' 'Capelle aan den IJssel'
 'Bruxelles' 'Maastricht' 'Balen' 'Brussels' 'Dilsen-Stokkem' 'Willebroek'
 'Gent' 'Geetbets' 'Lommel' 'Genk' 'Herent' 'Kasterlee' 'Den Burg'
 'Boortmeerbeek' 'Maarkedal' 'Diepenbeek' 'Holsbeek' 'Antwerpen'
 'Heist-op-den-Berg' 'Aarschot' 'Arnhem' 'Machelen' 'Enschede'
 'Saint-Gilles']
['Leuven']


In [17]:
filtered_city.to_csv('filter_date_tags_city.csv', sep='\t', index=False)

In [19]:
print(filtered_city['city'].isna().sum())
print(len(filtered_city))

0
404


In [20]:
filtered_city.head()

Unnamed: 0,id,title,startTime,endTime,url,tags,address,name,latitude,longitude,city
2,464,BEST Leuven x Twipe - Hackathon,2022-03-24 17:30:00+00:00,2022-03-24T19:30:00Z,https://fb.me/e/1vsfVR8mm,,"Gaston Geenslaan 8, 3000 Leuven",Twipe,,,Leuven
4,483,Interfacultair Welsprekendheidstoernooi 2022,2022-03-03 19:00:00+00:00,2022-03-03T22:00:00Z,https://www.facebook.com/events/45191322960811...,"Education, Culture","Andreas Vesaliusstraat 11, 3000 Leuven",Aula Vesalius,50.875235,4.708047,Leuven
9,492,Hacking maakleerplek,2022-03-09 16:00:00+00:00,2022-03-09T21:00:00Z,https://www.eventbrite.be/e/hacking-maakleerpl...,"Education, International, Workshop, Career",Stapelhuisstraat 13/15,maakleerplek,50.88658,4.705075,Leuven
10,493,Torchlight Walk,2022-02-10 17:00:00+00:00,2022-02-10T20:45:00Z,https://fb.me/e/1FCKGDfXh,"Sport, International, First-year students","Tervuursevest 101/bus 1502, 3001 Leuven",Universitair Sportcentrum KULeuven,,,Leuven
11,494,Four of a kind,2022-02-07 19:30:00+00:00,2022-02-07T21:30:00Z,https://fb.me/e/1CyC8EPqg,"Quiz, International, First-year students","Andreas Vesaliusstraat 34, 3000 Leuven",Pangaea KU Leuven lounge,,,Leuven


In [36]:
# convert 'startTime' and 'endTime' columns to datetime
filtered_city['startTime'] = pd.to_datetime(filtered_city['startTime'])
filtered_city['endTime'] = pd.to_datetime(filtered_city['endTime'])

# Set the start and end times for the range
start_time = datetime(2022, 1, 1, 0, 0)
end_time = datetime(2022, 12, 31, 23, 59)

# Create a list of hourly time ranges
time_range = pd.date_range(start=start_time, end=end_time, freq='H').strftime('%Y-%m-%d %H:%M:%S')

# Initialize an empty DataFrame to store the results
aggregated_events = pd.DataFrame(columns=['Time', 'Events'])
aggregated_events['Time'] = pd.to_datetime(aggregated_events['Time'])
# Iterate over the time ranges
for i in range(len(time_range) - 1):
    start = time_range[i]
    end = time_range[i + 1]
    

    # Filter the original DataFrame for events within the current time range
    filtered_data = filtered_city[(filtered_city['startTime'] <= end) & (filtered_city['endTime'] > start)]
    
    # Count the number of events in the current time range
    event_count = len(filtered_data)
    
    # Add a new row to the new DataFrame
    aggregated_events.loc[i] = [start, event_count]

# Fill any missing hours with 0 events
aggregated_events = aggregated_events.fillna(0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_city['startTime'] = pd.to_datetime(filtered_city['startTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_city['endTime'] = pd.to_datetime(filtered_city['endTime'])


                     Time  Events
0     2022-01-01 00:00:00       0
1     2022-01-01 01:00:00       0
2     2022-01-01 02:00:00       0
3     2022-01-01 03:00:00       0
4     2022-01-01 04:00:00       0
...                   ...     ...
8754  2022-12-31 18:00:00       0
8755  2022-12-31 19:00:00       0
8756  2022-12-31 20:00:00       0
8757  2022-12-31 21:00:00       0
8758  2022-12-31 22:00:00       0

[8759 rows x 2 columns]


In [11]:
aggregated_events.to_csv('shaped_filter_tags_city2_EXAM.csv', sep='\t', index=False)

In [37]:
aggregated_events.to_csv('shaped_filter_date_tags_city.csv', sep='\t', index=False)
print(aggregated_events.isna().sum())
print(len(aggregated_events))

Time      0
Events    0
dtype: int64
8759


In [10]:
# Repeat the same steps for results2
results2 = pd.read_csv('raw_data_fetched_2.csv', sep='\t')
# Set time format
results2['startTime'] = pd.to_datetime(results2['startTime'])
# Filter for tags
tags_to_filter = ['Cantus', 'Culture', 'First-year students', 'International', 'Party', 'Sport']
filter_tags = results2[results2['tags'].str.contains('|'.join(tags_to_filter), na=False)]
# Filter for location
def get_city_from_address(address):
    api_key = "AIzaSyDrmY-DrdEMZ8TnxuT84j9fqJy7DCireu8"
    api_url = f"https://maps.googleapis.com/maps/api/geocode/json?address={address}&key={api_key}"

    # Send a GET request to the Geocoding API
    response = requests.get(api_url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        results = data.get('results', [])

        # Check if any results were found
        if len(results) > 0:
            city = "None"
            for component in results[0]['address_components']:
                if 'locality' in component['types']:
                    city = component['long_name']
                    break
            return city

    print("Error retrieving city for address:", address)
    return "Leuven" # if it is not specified, assume it usually takes place in Leuven

addresses = filter_tags['address']
filter_tags_city = filter_tags.copy()
filter_tags_city['city'] = ''

for i, address in enumerate(addresses):
    city = get_city_from_address(address)
    if city:
        filter_tags_city.at[i, 'city'] = city
filter_tags_city = filter_tags_city[filter_tags_city['city'] == 'Leuven']
# save filtered event database
filter_tags_city.to_csv('data_fetched_filter_tags_city_EXAM.csv', sep='\t', index=False)
# Aggregate event data for appropriate format for the model
## Convert 'startTime' and 'endTime' columns to datetime
filter_tags_city['startTime'] = pd.to_datetime(filter_tags_city['startTime'])
filter_tags_city['endTime'] = pd.to_datetime(filter_tags_city['endTime'])
## Set the start and end times for the range
start_time = datetime(2023, 1, 1, 0, 0)
end_time = datetime(2023, 12, 31, 23, 59)
## Create a list of hourly time ranges
time_range = pd.date_range(start=start_time, end=end_time, freq='H').strftime('%Y-%m-%d %H:%M:%S')
##Initialize an empty DataFrame to store the results
aggregated_events = pd.DataFrame(columns=['Time', 'Events'])
aggregated_events['Time'] = pd.to_datetime(aggregated_events['Time'])
## Iterate over the time ranges
for i in range(len(time_range) - 1):
    start = time_range[i]
    end = time_range[i + 1]
    # Filter the original DataFrame for events within the current time range
    filtered_data = filter_tags_city[(filter_tags_city['startTime'] <= end) & (filter_tags_city['endTime'] > start)]
    # Count the number of events in the current time range
    event_count = len(filtered_data) 
    # Add a new row to the new DataFrame
    aggregated_events.loc[i] = [start, event_count]
## Fill any missing hours with 0 events
aggregated_events = aggregated_events.fillna(0)
# Save "database"
aggregated_events.to_csv('shaped_filter_tags_city2_EXAM.csv', sep='\t', index=False)

Error retrieving city for address: nan
Error retrieving city for address: nan
Error retrieving city for address: nan
Error retrieving city for address: nan
Error retrieving city for address: nan
                  Time  Events
0  2023-01-01 00:00:00       0
1  2023-01-01 01:00:00       0
2  2023-01-01 02:00:00       0
3  2023-01-01 03:00:00       0
4  2023-01-01 04:00:00       0


In [14]:
aggregated_events['Events'].value_counts()

0    8557
1     183
2      19
Name: Events, dtype: int64