# Downloading GhArchive 2023 Dataset

## January 2023 Data

In [2]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month=1):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12. Defaults to 1 (January).

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # The folder depends only on the month, so create it once rather than on
  # every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of January 2023 (days 1 through 31), one day at a time.
for day in range(1, 31 + 1):
  download_day(day)

print("Download complete!")

Downloaded: gharchive-2023-01-01-00.json.gz
Downloaded: gharchive-2023-01-01-01.json.gz
Downloaded: gharchive-2023-01-01-02.json.gz
Downloaded: gharchive-2023-01-01-03.json.gz
Downloaded: gharchive-2023-01-01-04.json.gz
Downloaded: gharchive-2023-01-01-05.json.gz
Downloaded: gharchive-2023-01-01-06.json.gz
Downloaded: gharchive-2023-01-01-07.json.gz
Downloaded: gharchive-2023-01-01-08.json.gz
Downloaded: gharchive-2023-01-01-09.json.gz
Downloaded: gharchive-2023-01-01-10.json.gz
Downloaded: gharchive-2023-01-01-11.json.gz
Downloaded: gharchive-2023-01-01-12.json.gz
Downloaded: gharchive-2023-01-01-13.json.gz
Downloaded: gharchive-2023-01-01-14.json.gz
Downloaded: gharchive-2023-01-01-15.json.gz
Downloaded: gharchive-2023-01-01-16.json.gz
Downloaded: gharchive-2023-01-01-17.json.gz
Downloaded: gharchive-2023-01-01-18.json.gz
Downloaded: gharchive-2023-01-01-19.json.gz
Downloaded: gharchive-2023-01-01-20.json.gz
Downloaded: gharchive-2023-01-01-21.json.gz
Downloaded: gharchive-2023-01-01

## February 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # The folder depends only on the month, so create it once rather than on
  # every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of February 2023 (days 1 through 28; 2023 is not a leap year).
for day in range(1, 28 + 1):
  download_day(day, month=2)

print("Download complete!")

Downloaded: gharchive-2023-02-01-00.json.gz
Downloaded: gharchive-2023-02-01-01.json.gz
Downloaded: gharchive-2023-02-01-02.json.gz
Downloaded: gharchive-2023-02-01-03.json.gz
Downloaded: gharchive-2023-02-01-04.json.gz
Downloaded: gharchive-2023-02-01-05.json.gz
Downloaded: gharchive-2023-02-01-06.json.gz
Downloaded: gharchive-2023-02-01-07.json.gz
Downloaded: gharchive-2023-02-01-08.json.gz
Downloaded: gharchive-2023-02-01-09.json.gz
Downloaded: gharchive-2023-02-01-10.json.gz
Downloaded: gharchive-2023-02-01-11.json.gz
Downloaded: gharchive-2023-02-01-12.json.gz
Downloaded: gharchive-2023-02-01-13.json.gz
Downloaded: gharchive-2023-02-01-14.json.gz
Downloaded: gharchive-2023-02-01-15.json.gz
Downloaded: gharchive-2023-02-01-16.json.gz
Downloaded: gharchive-2023-02-01-17.json.gz
Downloaded: gharchive-2023-02-01-18.json.gz
Downloaded: gharchive-2023-02-01-19.json.gz
Downloaded: gharchive-2023-02-01-20.json.gz
Downloaded: gharchive-2023-02-01-21.json.gz
Downloaded: gharchive-2023-02-01

## March 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # The folder depends only on the month, so create it once rather than on
  # every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of March 2023 (days 1 through 31), one day at a time.
for day in range(1, 31 + 1):
  download_day(day, month=3)

print("Download complete!")


## April 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # The folder depends only on the month, so create it once rather than on
  # every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of April 2023 (days 1 through 30), one day at a time.
for day in range(1, 30 + 1):
  download_day(day, month=4)

print("Download complete!")


## May 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of May 2023 (days 1 through 31), one day at a time.
for day in range(1, 31 + 1):
  download_day(day, month=5)

print("Download complete!")


## June 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of June 2023 (days 1 through 30), one day at a time.
for day in range(1, 30 + 1):
  download_day(day, month=6)

print("Download complete!")


## July 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of July 2023 (days 1 through 31), one day at a time.
for day in range(1, 31 + 1):
  download_day(day, month=7)

print("Download complete!")


## August 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of August 2023 (days 1 through 31), one day at a time.
for day in range(1, 31 + 1):
  download_day(day, month=8)

print("Download complete!")


## September 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of September 2023 (days 1 through 30), one day at a time.
for day in range(1, 30 + 1):
  download_day(day, month=9)

print("Download complete!")


## October 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Download data for October 2023
for day in range(1, 32):  # Loop through days (1 to 31) to capture all days in October
  download_day(day, month=10)  # Specify month as 10 for October

print("Download complete!")


## November 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Fetch every day of November 2023 (days 1 through 30), one day at a time.
for day in range(1, 30 + 1):
  download_day(day, month=11)

print("Download complete!")


## December 2023 Data

In [None]:
import requests
import os

# Root of the GH Archive hourly dumps; file names are appended directly.
base_url = "https://data.gharchive.org/"

def download_day(day, month):
  """Download the 24 hourly GH Archive dumps for one day of 2023.

  Each hour is requested from ``{base_url}2023-MM-DD-H.json.gz`` (the hour in
  the URL is not zero-padded) and stored as
  ``D:/GhArchive/<Month>_2023/files/gharchive-2023-MM-DD-HH.json.gz``.
  A non-200 response or a network error is printed and the remaining hours
  are still attempted.

  Args:
    day: Day of the month. Not checked against the month's length.
    month: Month number, 1-12; also selects the output folder.

  Raises:
    ValueError: If ``month`` is outside 1-12.
  """
  month_names = (
      "January", "February", "March", "April", "May", "June",
      "July", "August", "September", "October", "November", "December",
  )
  if not 1 <= month <= 12:
    raise ValueError(f"month must be in 1..12, got {month!r}")

  date = f"2023-{month:02d}-{day:02d}"

  # Derive the folder from the month (this cell previously wrote into the
  # April folder) and create it once, not on every hourly iteration.
  target_dir = f"D:/GhArchive/{month_names[month - 1]}_2023/files"
  os.makedirs(target_dir, exist_ok=True)

  for hour in range(24):
    url = f"{base_url}{date}-{hour}.json.gz"
    filename = f"gharchive-{date}-{hour:02d}.json.gz"
    filepath = os.path.join(target_dir, filename)
    # Stream into a side file first so an interrupted transfer never leaves
    # a truncated archive under the final name.
    partial_path = filepath + ".part"

    try:
      # The context manager releases the connection on every path; the
      # timeout keeps a stalled server from hanging the whole run.
      with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
          print(f"Error downloading {url}: {response.status_code}")
          continue
        with open(partial_path, 'wb') as f:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    except requests.exceptions.RequestException as exc:
      # One failed hour should not abort the rest of the day.
      print(f"Error downloading {url}: {exc}")
      continue

    os.replace(partial_path, filepath)
    print(f"Downloaded: {filename}")

# Download data for December 2023
for day in range(1, 32):  # Loop through days (1 to 31) to capture all days in December
  download_day(day, month=12)  # Specify month as 12 for December

print("Download complete!")
