SCRIPT TO DOWNLOAD DATA FROM THE OFFICIAL CFTC WEBSITE AND INSERT IT INTO MY SQL SERVER DATABASE

Import libraries

In [1]:
import requests
from zipfile import ZipFile
import os
import pandas as pd
import pyodbc

Define base URL

In [2]:
# Root address of the CFTC website; download URLs are built by appending a path to it.
base_url = 'https://www.cftc.gov'

Define the list of years I want to download

In [3]:
# Years whose data should be downloaded (range's end is exclusive: 2022 and 2023).
years_to_download = list(range(2022, 2024))

Use pyodbc to initialize the SQL database connection

In [4]:
# Connection settings are read from environment variables so that real
# credentials never have to be saved inside the notebook. The original
# placeholder values are kept as fallbacks, so behaviour is unchanged when
# the variables are not set.
server = os.environ.get('CFTC_SQL_SERVER', 'server name')
database = os.environ.get('CFTC_SQL_DATABASE', 'database name')
username = os.environ.get('CFTC_SQL_USERNAME', 'username')
password = os.environ.get('CFTC_SQL_PASSWORD', 'password')

# ODBC connection string; the doubled braces render as a literal {SQL Server}.
conn_str = f'DRIVER={{SQL Server}};SERVER={server};DATABASE={database};UID={username};PWD={password}'

# Open the connection and a cursor that the following cells reuse.
conn = pyodbc.connect(conn_str)
cursor = conn.cursor()

Download and unzip each year's file, then insert its contents into the SQL database

In [5]:
# Seconds to wait on the CFTC server before a download attempt is abandoned.
DOWNLOAD_TIMEOUT_SECONDS = 60


def _quote_identifier(name):
    """Wrap `name` in SQL Server brackets, doubling any `]` inside it.

    Column names come from the downloaded file, so a stray `]` must not be
    able to terminate the bracketed identifier early.
    """
    return "[" + str(name).replace("]", "]]") + "]"


try:
    for year in years_to_download:
        # Construct URL for the ZIP file (the repository path was found by
        # searching inside the website).
        zip_url = f"{base_url}/files/dea/history/fut_fin_txt_{year}.zip"

        # Download the ZIP file. A timeout keeps the cell from hanging forever,
        # and network errors are reported like any other failed download.
        try:
            response = requests.get(zip_url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        except requests.RequestException as error:
            print(f"Failed to download data for {year}: {error}")
            continue

        if response.status_code != 200:
            print(f"Failed to download data for {year}")
            continue

        # Save the ZIP file to the current directory.
        zip_path = os.path.join(os.getcwd(), f"fut_fin_txt_{year}.zip")
        with open(zip_path, 'wb') as zip_file:
            zip_file.write(response.content)

        # Extract the archive and list its members while it is still open.
        with ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(os.getcwd())
            extracted_files = zip_ref.namelist()

        if not extracted_files:
            print(f"No files found in the extracted ZIP for {year}")
            continue

        # Only the first member is loaded; this assumes the archive holds a single file.
        extracted_file_path = os.path.join(os.getcwd(), extracted_files[0])

        # Load the data from the extracted file into a pandas dataframe.
        df = pd.read_csv(extracted_file_path, delimiter=',', encoding='latin1')

        # One table per year.
        table_name = f'COTData{year}'
        quoted_columns = [_quote_identifier(column) for column in df.columns]

        # Drop the table if it already exists so the year is reloaded from scratch.
        cursor.execute(f"IF OBJECT_ID('{table_name}', 'U') IS NOT NULL DROP TABLE {table_name}")

        # Create the table with one NVARCHAR(MAX) column per dataframe column.
        column_definitions = ', '.join(f'{column} NVARCHAR(MAX)' for column in quoted_columns)
        cursor.execute(f"CREATE TABLE {table_name} ( {column_definitions} )")

        # The INSERT statement is identical for every row, so build it once.
        placeholders = ', '.join('?' for _ in quoted_columns)
        insert_sql = f"INSERT INTO {table_name} ({', '.join(quoted_columns)}) VALUES ({placeholders})"

        # Convert to plain Python objects and turn missing values into None so
        # they are stored as NULL.
        # NOTE(review): a NaN float parameter is presumably rejected by SQL
        # Server - confirm against the driver in use.
        rows = list(
            df.astype(object).where(df.notna(), None).itertuples(index=False, name=None)
        )

        # Skip the call for an empty file: there are no rows to send.
        if rows:
            cursor.executemany(insert_sql, rows)

        conn.commit()

        print(f"Downloaded and saved data for {year} in the table {table_name}")
finally:
    # Close the database connection even when a step above raised.
    conn.close()

print("Data download and insertion into SQL Server completed.")

Downloaded and saved data for 2022 in the table COTData2022
Downloaded and saved data for 2023 in the table COTData2023
Data download and insertion into SQL Server completed.
