In [4]:
import os
import pandas as pd
import numpy as np
import pandas_gbq
import sqlite3
import zipfile
import csv
import io
import glob

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
from zipfile import ZipFile
from google.cloud.exceptions import NotFound

## 1. Extracting Zip Files

In [None]:
# Unpack every ZIP archive found in `directory_path` into its own sub-folder of
# `output_folder` (one sub-folder per archive, named after the archive).
#
# NOTE: `directory_path` is machine-specific; point it at the local copy of the
# WedgeZipOfZips_Big folder before running this cell.
# A raw string is used so the Windows backslashes are taken literally.
#directory_path = "/Users/biancabostrom/Documents/ADA/Wedge Project/WedgeZipOfZips_Big"
directory_path = r'C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big'
output_folder = 'extracted_zips_big'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Names of archives that could not be extracted; reported once the loop is done
failed_archives = []

# Sorted so the processing order (and the log) is the same on every run
for filename in sorted(os.listdir(directory_path)):
    # Compare the extension case-insensitively so "FILE.ZIP" is not skipped
    if not filename.lower().endswith('.zip'):
        continue

    # Full path of the archive and the folder it is unpacked into
    file_path = os.path.join(directory_path, filename)
    folder_name = os.path.splitext(filename)[0]
    extract_path = os.path.join(output_folder, folder_name)

    # Print the file path for debugging
    print(f"Attempting to extract: {file_path}")

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"Extracted {filename} to {extract_path}")
    except Exception as e:
        # Best effort: one bad archive must not stop the others, but it is
        # remembered so the final message does not claim full success.
        failed_archives.append(filename)
        print(f"Error extracting {filename}: {e}")

if failed_archives:
    print(f"Finished with {len(failed_archives)} archive(s) not extracted: {failed_archives}")
else:
    print("All files extracted.")


## 2. Cleaning files: headers, delimiters, nulls and quotes

In [None]:
# Make sure every extracted file starts with the expected header row.
#
# For each file inside each sub-folder of `output_folder`, the first line is
# checked for all expected column names; when any is missing, a header row is
# written in front of the existing content.
correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]

# Separators a file may use. An inserted header row is joined with whichever of
# these occurs most often in the file's first line, so the header matches the
# data rows below it instead of always being comma-separated.
candidate_delimiters = [",", ";", "\t"]

for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Only directories hold extracted data files
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        input_file_path = os.path.join(folder_path, file_name)

        # Skip nested directories and other non-files; open() would fail on them
        if not os.path.isfile(input_file_path):
            continue

        # Read the content of the file (needed in full to rewrite it with a header)
        with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file:
            content = input_file.read()

        # Only the first line can be a header row. Searching the whole file
        # would let column names that merely appear in data rows count as a header.
        first_line = content.split("\n", 1)[0]
        has_headers = all(header in first_line for header in correct_headers)

        if not has_headers:
            # Ties (e.g. an empty first line) resolve to the first candidate, a comma
            file_delimiter = max(candidate_delimiters, key=first_line.count)
            content = file_delimiter.join(correct_headers) + "\n" + content

            # Write the modified content back to the file
            with open(input_file_path, 'w', newline='', encoding='utf-8') as output_file:
                output_file.write(content)

            print(f"Headers added to file {file_name} in folder {this_folder}.")

        print(f"File {file_name} in folder {this_folder} is good.")

print("Done checking and adding headers.")

In [None]:
# Work out which field separator each extracted CSV uses.
# `delimiters` maps a CSV file name to the delimiter sniffed from its first line.
delimiters = {}

for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Entries that are not directories are ignored
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        # Only files with a .csv extension are inspected
        if not file_name.endswith('.csv'):
            continue

        input_file_path = os.path.join(folder_path, file_name)

        # The first line alone serves as the sample handed to the sniffer
        with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file:
            first_line = input_file.readline()

        dialect = csv.Sniffer().sniff(sample=first_line, delimiters=[",", ";", "\t"])
        delimiters[file_name] = dialect.delimiter

        print(f"It looks like {file_name} in folder {this_folder} has delimiter {dialect.delimiter} .")


In [None]:
# Rewrite every extracted CSV that is not comma-delimited so that it is.
#
# Works on the files under `output_folder` and uses `delimiters`
# (file name -> sniffed delimiter) built by the delimiter-detection cell.
for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Only directories hold extracted data files
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        # Files that were never sniffed, or that already use commas, are left alone
        current_delimiter = delimiters.get(file_name)
        if current_delimiter is None or current_delimiter == ",":
            continue

        input_file_path = os.path.join(folder_path, file_name)
        temp_file_path = input_file_path + ".tmp"

        try:
            # Parse with the csv module instead of a blind str.replace: delimiter
            # characters inside quoted fields stay untouched, and fields that
            # contain a comma get quoted on output. Rows are streamed through a
            # temporary file so the whole file never has to sit in memory.
            with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file, \
                    open(temp_file_path, 'w', newline='', encoding='utf-8') as output_file:
                reader = csv.reader(input_file, delimiter=current_delimiter)
                writer = csv.writer(output_file, delimiter=",", lineterminator="\n")
                writer.writerows(reader)

            # Swap the converted file into place only after it was written completely
            os.replace(temp_file_path, input_file_path)
        finally:
            # After a successful replace the temp path no longer exists; this only
            # cleans up a partially written file when something above failed.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        # Record the new state so re-running this cell does not re-parse the
        # now comma-delimited file with its old delimiter.
        delimiters[file_name] = ","

        print(f"File {file_name} has been cleaned.")

print("Done cleaning.")

In [None]:
# Replace empty values with "null" in every extracted CSV.
#
# Works on the .csv files under `output_folder`. Each file is parsed with the
# delimiter recorded for it in `delimiters` (a comma when none is recorded), and
# every field whose parsed value is the empty string is written back as null.
# Parsing field by field keeps doubled quotes inside a quoted field (an escaped
# quote character) from being mistaken for an empty value.
for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Only directories hold extracted data files
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue

        input_file_path = os.path.join(folder_path, file_name)
        temp_file_path = input_file_path + ".tmp"
        file_delimiter = delimiters.get(file_name, ",")

        try:
            # Rows are streamed through a temporary file so the whole file never
            # has to sit in memory; the delimiter is kept as it was.
            with open(input_file_path, 'r', newline='', encoding='utf-8') as input_file, \
                    open(temp_file_path, 'w', newline='', encoding='utf-8') as output_file:
                reader = csv.reader(input_file, delimiter=file_delimiter)
                writer = csv.writer(output_file, delimiter=file_delimiter, lineterminator="\n")
                for row in reader:
                    writer.writerow(["null" if value == "" else value for value in row])

            # Swap the rewritten file into place only after it was written completely
            os.replace(temp_file_path, input_file_path)
        finally:
            # After a successful replace the temp path no longer exists; this only
            # cleans up a partially written file when something above failed.
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

        print(f"Null values handled in file {file_name}.")

print("Done checking and handling null values.")

## 3. Upload to GBQ

In [None]:
# BigQuery connection settings.
# `service_path` is the folder that holds the service-account key and
# `service_file` is the key's file name. The folder is written without a shell
# style "\ " escape: inside a Python string that would put a literal backslash
# into the path.
service_path = "/Users/biancabostrom/Documents/ADA/Wedge Project"
service_file = 'wedge-404400-cb3a632effa5.json'
gbq_proj_id = "wedge-404400"
gbq_dataset_id = "wedge_data"

# Full path to the key file: folder and file name joined with a path separator
private_key = os.path.join(service_path, service_file)

In [None]:
# Load the service-account credentials from the JSON key file on disk
credentials = service_account.Credentials.from_service_account_file(
    filename="/Users/biancabostrom/Documents/ADA/Wedge Project/wedge-404400-cb3a632effa5.json"
)

# BigQuery client bound to the project in `gbq_proj_id`, authenticated with those credentials
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

In [None]:
# Upload every extracted CSV to BigQuery, one table per file.
#
# Files are read from the sub-folders of `output_folder`. Each file becomes the
# table `<gbq_dataset_id>.<file name without extension>` in project `gbq_proj_id`,
# authenticated with the service-account `credentials` loaded earlier.
for this_folder in os.listdir(output_folder):
    folder_path = os.path.join(output_folder, this_folder)

    # Only directories hold extracted data files
    if not os.path.isdir(folder_path):
        continue

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)

        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Destination in the 'dataset.table' form; the project is passed separately.
        # splitext removes only the trailing extension from the file name.
        table_name = os.path.splitext(file_name)[0]
        table_id = f'{gbq_dataset_id}.{table_name}'

        # pandas_gbq.to_gbq is called directly (DataFrame.to_gbq is deprecated in
        # recent pandas); an existing table of the same name is replaced.
        pandas_gbq.to_gbq(
            df,
            destination_table=table_id,
            project_id=gbq_proj_id,
            if_exists='replace',
            credentials=credentials,
        )

print("Upload complete.")
