In [2]:
import os
import pandas as pd
import numpy as np
import pandas_gbq
import sqlite3
import zipfile
import csv
import io
import glob

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
from zipfile import ZipFile
from google.cloud.exceptions import NotFound

## 1. Extracting Zip Files

In [None]:
# Set the directory path where your ZIP files are located locally
# Use a raw string for the path
#directory_path = "/Users/biancabostrom/Documents/ADA/Wedge Project/WedgeZipOfZips_Big"
directory_path = r'C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big'
output_folder = 'extracted_zips_big'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate over all the files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.zip'):
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)

        # Create a folder for each ZIP file
        folder_name = os.path.splitext(filename)[0]
        extract_path = os.path.join(output_folder)

        # Print the file path for debugging
        print(f"Attempting to extract: {file_path}")

        try:
            # Open the ZIP file
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                # Extract all the contents into the directory
                zip_ref.extractall(extract_path)
                print(f"Extracted {filename} to {extract_path}")
        except Exception as e:
            print(f"Error extracting {filename}: {e}")

print("All files extracted.")


## 2. Cleaning files: headers, delimeters, nulls and quotes 

In [4]:
correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]
# loop though all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        if file.endswith('.csv'):
            with open(full_path,'r') as f:
                first_line = f.readline().strip()

            with open(full_path,'r') as f:
                content = f.read()
            # check if the file likly has headers based on the first line
            if not first_line.startswith('"datetime"') and not first_line.startswith('datetime'):
                content = ','.join(correct_headers) + '\n' + content

            content = content.replace('\"','inch')

            with open(full_path,'w') as f:
                f.write(content)

In [5]:
# Bianca's info
#service_path = "/Users/biancabostrom/Documents/ADA/Wedge\ Project/wedge-404400-cb3a632effa5.json"
#service_file = 'wedge-404400-cb3a632effa5.json' 
#gbq_proj_id = "wedge-404400" 
#gbq_dataset_id = "wedge_data"
#credentials = service_account.Credentials.from_service_account_file("/Users/biancabostrom/Documents/ADA/Wedge Project/wedge-404400-cb3a632effa5.json")

# Spencer's info
service_path = r"C:\Users\hills\Documents\Fall2023\ADA\wedge-project\leafy-sunrise-403222-f51fcd80b921.json"
service_file = 'leafy-sunrise-403222-f51fcd80b921.json' # change this to your authentication information  
gbq_proj_id = "leafy-sunrise-403222" # change this to your project. 
gbq_dataset_id = "wedge_data"
credentials = service_account.Credentials.from_service_account_file(r"C:\Users\hills\Documents\Fall2023\ADA\wedge-project\leafy-sunrise-403222-f51fcd80b921.json")

private_key =service_path + service_file


In [21]:
chunk_size = 50000

def drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id):
    client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

    try:
        client.delete_table(table_id)
        print(f"deleted table '{table_id}'")
    except NotFound:
        print(f"table '{table_id}' not found, skipping deletion.")

def detect_delimiter(filename):
    with open(filename,'r') as file:
        first_line = file.readline()
        return ";" if ";" in first_line else ","
    
def clean_dataframe(df):
    float_columns = [
        'register_no', 'emp_no', 'trans_no', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice'
        , 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype'
        , 'voided', 'percentDiscount', 'itemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'numflag'
        , 'itemstatus', 'tenderstatus', 'varflag', 'local', 'organic', 'reciept', 'card_no', 'store', 'branch', 'match_id'
        ,'trans_id'
    ]

    boolean_columns = [ 'memType', 'staff', 'batchHeaderID', 'display']

    string_columns = ['upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'charflag']

    for col in string_columns:
        if col in df.columns:
            df[col]  = df[col].astype(str)
            df [col] = df [col].str.replace('"', '', regex=False)

    for col in float_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df[float_columns] = df[float_columns].fillna(0)

    for col in boolean_columns:
        if col in df.columns:
            df[col] = df[col].astype(bool)

    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
    
    replace_strings = ["\\n", "\\\\", "nan", "NULL"]
    df.replace(replace_strings, "", inplace=True)

    for col in df.columns:
        if df[col].dype == 'object':
            df[col] = df[col].str.strip()
            #df[col] = df[col].str.replace('\\\"', '', regex=False)

    for col in df.columns: # added these three lines trying to problem solve next chuck for gbq file path
        if col == 'charflag' and col in df.columns:
            df[col] = df[col].str.strip()

    df = df.applymap(lambda x: None if x == '' else x)

In [22]:
#duplicate code to play with

# move through all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        
        if file.endswith('.csv'):
            print(f"Found CSV file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"detected delimiter: {delimiter}")

            # reading csv with correct handling of quoted fields
            chunk_iter = pd.read_csv(full_path, delimiter=delimiter, quotechar='"', chunksize=chunk_size, dtype=str, low_memory=False)

            table_name = file.replace('.csv', '')

            # drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    
            # Construct the fully-qualified table_id without ".csv" extension
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"deleted table '{table_id}'")
            except NotFound:
                print(f"table '{table_id}' not found, skipping deletion.")

            print(f"reading csv file in chunks: {file}...")
            for idx, chunk_df in enumerate(chunk_iter):
                # clean the DF
                chunk_df = clean_dataframe(chunk_df)

                # modify the field names to comply with the gbq rules
                chunk_df.columns = [col.lower().replace(';', '') for col in chunk_df.columns]

                print(f"uploading chunk {idx + 1} to {table_name}...")
                if idx == 0:
                    # for the first chunk create the table with the defined schema
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
                else:
                    # for subsequent chunks, append to the table 
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='append', credentials=credentials)
                del chunk_df  # clean the chunk from memory
        break
    break


Found CSV file: transArchive_201001_201003.csv
detected delimiter: ,
table 'leafy-sunrise-403222.wedge_data.transArchive_201001_201003' not found, skipping deletion.
table 'leafy-sunrise-403222.wedge_data.transArchive_201001_201003' not found, skipping deletion.
reading csv file in chunks: transArchive_201001_201003.csv...


KeyError: "None of [Index(['register_no', 'emp_no', 'trans_no', 'department', 'quantity', 'Scale',\n       'cost', 'unitPrice', 'total', 'regPrice', 'altPrice', 'tax',\n       'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount',\n       'discountable', 'discounttype', 'voided', 'percentDiscount', 'itemQtty',\n       'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'numflag',\n       'itemstatus', 'tenderstatus', 'varflag', 'local', 'organic', 'reciept',\n       'card_no', 'store', 'branch', 'match_id', 'trans_id'],\n      dtype='object')] are in the [columns]"

In [18]:
# move through all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        
        if file.endswith('.csv'):
            print(f"Found CSV file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"detected delimiter: {delimiter}")

            #reading csv with correct handeling of quoted fields
            chunk_iter = pd.read_csv(full_path, delimiter=delimiter,quotechar='"', chunksize=chunk_size, dtype=str, low_memory=False)

            table_name = file.replace('data.csv', '')

            #drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    
                # Construct the fully-qualified table_id
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"deleted table '{table_id}'")
            except NotFound:
                print(f"table '{table_id}' not found, skipping deletion.")

            print(f"reading csv file in chucks: {file}...")
            for idx, chunk_df in enumerate(chunk_iter):
                #clean the DF
                chunk_df = clean_dataframe(chunk_df)

                #modify the field names to comply with the gbq rules
                chunk_df.columns = [col.lower().replace(';','') for col in chunk_df.columns]

                print(f"uploading chunk {idx + 1} to {table_name}...")
                if idx == 0:
                    # for the first chuck create the table with the defined schema
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
                else:
                    # for subsuquent chunks, append to the table 
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='append', credentials=credentials)
                del chunk_df # clean the chunk from memory
        break
    break

Found CSV file: transArchive_201001_201003.csv
detected delimiter: ,


ValueError: table_id must be a fully-qualified ID in standard SQL format, e.g., "project.dataset.table_id", got leafy-sunrise-403222.wedge_data.transArchive_201001_201003.csv

In [None]:
# Look for delimiters first
delimiters = dict()

# Start by reading in all the files again.
for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Check if the entry is a directory
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.csv'):  # Check if the file is a CSV file
                input_file_path = os.path.join(folder_path, file_name)

                # Read the content of the file
                with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file:
                    # Read only the first line to determine the delimiter
                    first_line = input_file.readline()

                    dialect = csv.Sniffer().sniff(sample=first_line, delimiters=[",", ";", "\t"])
                    delimiters[file_name] = dialect.delimiter

                    print(" ".join(["It looks like",
                                    file_name,
                                    "in folder",
                                    this_folder,
                                    "has delimiter",
                                    dialect.delimiter,
                                    "."]))

                # You can add the rest of your processing here


In [None]:
#loop through each file and replace the delimters to "," in the files that don't

# Clean the files
for this_zf in zip_files:
    with ZipFile(os.path.join(zip_folder, this_zf), 'a') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            input_file_path = os.path.join(zip_folder, this_zf, file_name)
            
            # Read the content of the file
            with zf.open(file_name, 'r') as input_file:
                content = input_file.read().decode("utf-8")
            
            # Check if the delimiter is not a comma
            if delimiters[file_name] != ",": # might be useful
                # Replace the delimiter with a comma # might be useful
                content = content.replace(delimiters[file_name], ",")

                # Write the modified content back to the file
                zf.writestr(file_name, content.encode("utf-8"))

                print(f"File {file_name} has been cleaned.")

print("Done cleaning.")

In [None]:
#replace empty values with "null" #

for this_zf in zip_files:
    with ZipFile(os.path.join(zip_folder, this_zf), 'a') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            input_file_path = os.path.join(zip_folder, this_zf, file_name)

            # Read the content of the file
            with zf.open(file_name, 'r') as input_file:
                content = input_file.read().decode("utf-8")

            # Identify and replace null values (assuming nulls are represented as an empty string "")
            content = content.replace('""', 'null')

            # Write the modified content back to the file
            zf.writestr(file_name, content.encode("utf-8"))

            print(f"Null values handled in file {file_name}.")

print("Done checking and handling null values.")

## 3. Upload to GBQ