# Web Crawling of GDELT data from 2013 - 2015
GDELT makes all data from 2015 to the present accessible via Google BigQuery. For the data from 1979-2013 there is a master file available that compresses the event data for that timeframe. For the period from 2013-2015, however, the data for each individual day has to be downloaded separately from the GDELT website. This is what this notebook does.

In [1]:
import os
import re
import requests
import zipfile

The links are extracted from the GDELT Website.

In [2]:
# Text files holding the download links, one file per year
path_2014 = "./web_crawl_links/2014.txt"
path_2015 = "./web_crawl_links/2015.txt"

# Load the raw text of each link file
with open(path_2014, 'r') as file:
    data_2014 = file.read()
with open(path_2015, 'r') as file:
    data_2015 = file.read()

# Every run of non-whitespace characters ending in ".zip" is a link ending
pattern = r'([^\s]+\.zip)'
endings_2014, endings_2015 = (
    re.findall(pattern, text) for text in (data_2014, data_2015)
)

# Prefix each link ending with the base address to get the full download URL
base_url = "http://data.gdeltproject.org/events/"
urls_2014 = [f"{base_url}{ending}" for ending in endings_2014]
urls_2015 = [f"{base_url}{ending}" for ending in endings_2015]

# Show the resulting URL lists
print(urls_2014)
print(urls_2015)

['http://data.gdeltproject.org/events/20141231.export.CSV.zip', 'http://data.gdeltproject.org/events/20141230.export.CSV.zip', 'http://data.gdeltproject.org/events/20141229.export.CSV.zip', 'http://data.gdeltproject.org/events/20141228.export.CSV.zip', 'http://data.gdeltproject.org/events/20141227.export.CSV.zip', 'http://data.gdeltproject.org/events/20141226.export.CSV.zip', 'http://data.gdeltproject.org/events/20141225.export.CSV.zip', 'http://data.gdeltproject.org/events/20141224.export.CSV.zip', 'http://data.gdeltproject.org/events/20141223.export.CSV.zip', 'http://data.gdeltproject.org/events/20141222.export.CSV.zip', 'http://data.gdeltproject.org/events/20141221.export.CSV.zip', 'http://data.gdeltproject.org/events/20141220.export.CSV.zip', 'http://data.gdeltproject.org/events/20141219.export.CSV.zip', 'http://data.gdeltproject.org/events/20141218.export.CSV.zip', 'http://data.gdeltproject.org/events/20141217.export.CSV.zip', 'http://data.gdeltproject.org/events/20141216.export.C

Now the extracted links are used to scrape the data from the GDELT servers and save it in the downloaded_files folder. WARNING: This may take up to 30 minutes, depending on your internet connection.

In [3]:
# Download every daily GDELT archive, unpack it into the downloaded_files
# folder and remove the temporary zip file and extraction folder afterwards.
years = [urls_2014, urls_2015]

# Get the current working directory
current_dir = os.getcwd()

# Create a folder to save downloaded files if it doesn't exist.
# This does not depend on the URL, so it is done once before the loop.
folder_name = 'downloaded_files'
folder_path = os.path.join(current_dir, folder_name)
os.makedirs(folder_path, exist_ok=True)

# URLs that could not be downloaded or unpacked; reported after the loop
failed_urls = []

for year in years:
    for url in year:
        # Determine the file name from the URL
        file_name = url.split('/')[-1]

        # Specify the file path where the downloaded zip folder should be saved
        zip_file_path = os.path.join(folder_path, file_name)

        # Download the zip folder. The timeout keeps a stalled connection from
        # blocking the crawl forever, and raise_for_status() prevents an HTTP
        # error page from being stored as if it were a zip archive.
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as error:
            print(f"Download failed for {url}: {error}")
            failed_urls.append(url)
            continue

        # Save the downloaded zip folder to the specified location
        with open(zip_file_path, 'wb') as file:
            file.write(response.content)

        # Extract the contents of the zip folder. The target folder is created
        # up front so that an archive without entries still leaves a folder to
        # list and remove below.
        extract_folder_path = os.path.join(folder_path, file_name.split('.')[0])
        os.makedirs(extract_folder_path, exist_ok=True)
        try:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(extract_folder_path)
        except zipfile.BadZipFile as error:
            # Skip a corrupt archive instead of aborting the whole crawl;
            # rmdir is only attempted when nothing was extracted.
            print(f"Could not extract {zip_file_path}: {error}")
            failed_urls.append(url)
            os.remove(zip_file_path)
            if not os.listdir(extract_folder_path):
                os.rmdir(extract_folder_path)
            continue

        # Get the files inside the extracted folder
        files_in_extracted_folder = os.listdir(extract_folder_path)
        if files_in_extracted_folder:
            # Move every extracted entry, not just the first one, so that the
            # extraction folder is empty when it is removed below.
            for extracted_name in files_in_extracted_folder:
                file_to_save = os.path.join(extract_folder_path, extracted_name)

                # Specify the file path where the extracted file should be saved
                saved_file_path = os.path.join(folder_path, extracted_name)

                # os.replace overwrites an existing file on every platform,
                # whereas os.rename raises FileExistsError on Windows when the
                # file is already there (e.g. on a re-run after an interruption).
                os.replace(file_to_save, saved_file_path)
                print(f"File downloaded and saved to: {saved_file_path}")
        else:
            print("No files found inside the extracted folder.")

        # Clean up the extracted folder
        os.rmdir(extract_folder_path)
        print(f"Zip folder downloaded and extracted to: {extract_folder_path}")

        # Delete the downloaded zip folder
        os.remove(zip_file_path)
        print(f"Zip folder deleted: {zip_file_path}")

# Summarize the URLs that were skipped so they can be retried
if failed_urls:
    print(f"{len(failed_urls)} file(s) could not be processed:")
    for failed_url in failed_urls:
        print(failed_url)

File downloaded and saved to: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperformer\data_collection\downloaded_files\20141231.export.CSV
Zip folder downloaded and extracted to: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperformer\data_collection\downloaded_files\20141231
Zip folder deleted: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperformer\data_collection\downloaded_files\20141231.export.CSV.zip
File downloaded and saved to: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperformer\data_collection\downloaded_files\20141230.export.CSV
Zip folder downloaded and extracted to: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperformer\data_collection\downloaded_files\20141230
Zip folder deleted: d:\Uni\4_Semester\Machine Learning Project\Data Collection\Code\project_submission_haiperform

KeyboardInterrupt: 