In [2]:
import os
import pandas as pd
import numpy as np
import pandas_gbq
import sqlite3
import zipfile
import csv
import io
import glob

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
from zipfile import ZipFile
from google.cloud.exceptions import NotFound

## 1. Extracting Zip Files

In [3]:
# Set the directory path where your ZIP files are located locally
# Use a raw string for the path
#directory_path = "/Users/biancabostrom/Documents/ADA/Wedge Project/WedgeZipOfZips_Big"
directory_path = r'C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big'
output_folder = 'data\\extracted_zips_big'
clean_output_folder = 'data\\clean_csvs'

## Extract CSV's from zips

In [4]:
# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate over all the files in the directory
for idx, filename in enumerate(os.listdir(directory_path)) : # JC: I added the enumerate so I can test on small samples.
    if filename.endswith('.zip'):
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)

        # Print the file path for debugging
        print(f"Attempting to extract: {file_path}")

        try:
            # Open the ZIP file
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                # Extract all the contents into the directory
                zip_ref.extractall(output_folder) # JC: so I changed this
                print(f"Extracted {filename} to {output_folder}")
        except Exception as e:
            print(f"Error extracting {filename}: {e}")

        #if idx > 10:
            #break
print("All files extracted.")

Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201001_201003.zip
Extracted transArchive_201001_201003.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201004_201006.zip
Extracted transArchive_201004_201006.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201007_201009.zip
Extracted transArchive_201007_201009.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201010_201012.zip
Extracted transArchive_201010_201012.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201101_201103.zip
Extracted transArchive_201101_201103.zip to data\extracted_zips_big
Attempting

## Turn messy CSV to clean CSV

In [5]:
os.makedirs(clean_output_folder, exist_ok=True)

In [6]:
correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]
# loop though all files in the directory
# JC: you did this in a bizarre way. Compare to this.

extracted_files = os.listdir(output_folder)

for file in extracted_files : 
    # Now we pick up with yours. 
    if file.endswith('.csv'):
        with open(output_folder + "/" + file,'r') as f:
            first_line = f.readline().strip()

        print(first_line)

        # Handle different delimiters
        if "datetime" in first_line:
            # Check for comma as delimiter
            if "," in first_line:
                
                messy_data = pd.read_csv(output_folder + "/" + file, sep=",")
        
            elif ";" in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")
            else:
                print(f"Neither , or ; in {file}")
        else:
            # Add headers to the file and then read it
            if "," in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)
        
            elif ";" in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep= ";", header=None, names=correct_headers)
            else:
                print(f"Neither , or ; in {file}")

        messy_data.to_csv(clean_output_folder + "/" + file.replace("csv", "txt"), sep="\t", index = False)

        ### Work on null NULL \\N - replace with "" (this empty string)
        messy_data.replace(["NULL", "\\N"], "", inplace=True)  ## Chat said to put this above the above line.
        ### some files are , delimited and some product desc have , in them. PD might handle. 

        #if '"' in first_line : 
            #with open(output_folder + "/" + file,'r') as f:
                #content = f.read()
                #print(content[:1000])
                #break

"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"
datetime,register_no,emp_no,trans_no,upc,description,trans_type,trans_subtype,trans_status,department,quantity,Scale,cost,unitPrice,total,regPrice,altPrice,tax,taxexempt,foodstamp,wicable,discount,memDiscount,discountable,discounttype,voided,percentDiscount,ItemQtty,volDiscType,volume,VolSpecial,mixMatch,matched,memType,staff,numflag,itemstatus,tenderstatus,charflag,varflag,batchHeaderID,local,organic,display,receipt,card_no,store,branc

  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2015-12-01 07:03:06,51,94,2,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.0000,0.0000,,0,0,0,,0.0000,0.0000,0,0,0,,0,0,0,0.0000,0,0,0,,0,0,0,,,,0,,,0,3,1,3,0,7
2016-01-01 09:12:14,51,94,3,0000000000039,Wedge Scone,I, , ,8,3,0,0.5160,2.4900,7.4700,2.4900,\N,0,0,1,\N,0.0000,0.0000,1,0,0,\N,3,0,0,0.0000,0,0,0,\N,5,0,0,,\N,\N,0,0,,0,3,1,3,0,10
2016-02-01 07:16:56,51,94,3,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.2700,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,3,1,3,0,10
2016-03-01 07:04:38,51,94,2,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,3,1,3,0,10
2016-04-01 07:34:35,51,94,18,0,Cash,T,CA,,0,0,0,0.0000,0.0000,-5.0000,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,49019,1,3,0,9
2016-05-01 11:23:35,51,94,113,0000000004365,BBOWL SuperSoba Chicken/Seitan,I, , ,8,1,0,0.0000,10.0000,10.0000,10.0000,\N,1,0,0,0,0.0000,0.0000,7,0,0,0.00000000,1,0,0,0.0000,0,

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-06-01 08:04:44,51,94,41,0000000000151,Banana Organic,I, , ,2,1.45,1,0.8900,1.1900,1.7300,1.1900,\N,0,0,1,1,0.0000,0.0000,1,0,0,10.00000000,1.45,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,1,,0,12367,1,3,0,2
2016-07-01 07:06:15,51,94,1,0065722700050,Electrolyte Water 1.5L Essenti,I, , ,1,3,0,1.5800,2.6900,8.0700,2.6900,\N,0,0,1,0,0.0000,0.0000,1,0,0,0.00000000,3,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,0,,0,3,1,3,0,1


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-08-01 07:34:16,51,94,7,0000000000151,Banana Organic,I, , ,2,0.52,1,0.8900,1.1900,0.6200,1.1900,\N,0,0,1,1,0.0000,0.0000,1,0,0,0.00000000,0.52,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,1,,0,21998,1,3,0,2


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-09-01 07:13:09,51,94,6,0,Change,T,CA,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,8,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,20074,1,3,0,7


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-10-01 07:04:40,51,94,1,DISCOUNT,Discount,I,,,0,1,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,0,\N,1,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,49355,1,3,0,10


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-11-01 07:18:44,51,94,11,0000000001014,Green Patch Redemption,I, , ,1,1,0,0.0000,-0.1000,-0.1000,-0.1000,\N,0,0,0,0,0.0000,0.0000,0,0,0,10.00000000,1,0,0,0.0000,0,0,0,\N,4,0,0,,\N,\N,0,-1,,0,16646,1,3,0,13
2016-12-01 07:43:01,51,94,23,0000000000049,Wedge Muffin,I, , ,8,1,0,0.6350,2.4900,2.4900,2.4900,\N,0,0,1,0,0.0000,0.0000,7,0,0,10.00000000,1,0,0,0.0000,0,0,0,\N,5,0,0,,\N,\N,0,0,,0,13863,1,3,0,2


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2017-01-01 09:00:31,51,94,12,0,Change,T,CA,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,8,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,24528,1,3,0,12


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


## 2. Cleaning files: headers, delimeters, nulls and quotes 

correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]
# loop though all files in the directory
# JC: you did this in a bizarre way. Compare to this.

extracted_files = os.listdir(output_folder)

for file in extracted_files : 
    # Now we pick up with yours. 
    if file.endswith('.csv'):
        with open(output_folder + "/" + file,'r') as f:
            first_line = f.readline().strip()

        print(first_line)

        if '"' in first_line : 
            with open(output_folder + "/" + file,'r') as f:
                content = f.read()
                print(content[:1000])
                break






#########

correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]
# loop though all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        if file.endswith('.csv'):
            with open(full_path,'r') as f:
                first_line = f.readline().strip()

            with open(full_path,'r') as f:
                content = f.read()
            # check if the file likely has headers based on the first line
            if not first_line.startswith('"datetime"') and not first_line.startswith('datetime'):
                content = ','.join(correct_headers) + '\n' + content

            #content = content.replace('\"','inch')

            #with open(full_path,'w') as f:``
                #f.write(content)

            content = '\n'.join(['inch' + line.strip('\"') + 'inch' for line in content.split('\n')])

            with open(full_path, 'w') as f:
                f.write(content)


In [18]:
# Bianca's info
#service_path = "/Users/biancabostrom/Documents/ADA/Wedge\ Project/wedge-404400-cb3a632effa5.json"
#service_file = 'wedge-404400-cb3a632effa5.json' 
#gbq_proj_id = "wedge-404400" 
#gbq_dataset_id = "wedge_data"
#credentials = service_account.Credentials.from_service_account_file("/Users/biancabostrom/Documents/ADA/Wedge Project/wedge-404400-cb3a632effa5.json")

# Spencer's info
service_path = r"C:\Users\hills\Documents\Fall2023\ADA\wedge-project\leafy-sunrise-403222-f51fcd80b921.json"
service_file = 'leafy-sunrise-403222-f51fcd80b921.json' # change this to your authentication information  
gbq_proj_id = "leafy-sunrise-403222" # change this to your project. 
gbq_dataset_id = "wedge_data"
credentials = service_account.Credentials.from_service_account_file(service_path)

private_key = service_path + service_file


In [19]:
schema = [
    {"name": "datetime", "type": "TIMESTAMP"},     # 1
    {"name": "register_no", "type": "FLOAT"},      # 2
    {"name": "emp_no", "type": "FLOAT"},           # 3
    {"name": "trans_no", "type": "FLOAT"},         # 4
    {"name": "upc", "type": "STRING"},             # 5
    {"name": "description", "type": "STRING"},     # 6
    {"name": "trans_type", "type": "STRING"},      # 7
    {"name": "trans_subtype", "type": "STRING"},   # 8
    {"name": "trans_status", "type": "STRING"},    # 9
    {"name": "department", "type": "FLOAT"},       # 10
    {"name": "quantity", "type": "FLOAT"},         # 11
    {"name": "Scale", "type": "FLOAT"},            # 12
    {"name": "cost", "type": "FLOAT"},             # 13
    {"name": "unitPrice", "type": "FLOAT"},        # 14
    {"name": "total", "type": "FLOAT"},            # 15
    {"name": "regPrice", "type": "FLOAT"},         # 16
    {"name": "altPrice", "type": "FLOAT"},         # 17
    {"name": "tax", "type": "FLOAT"},              # 18
    {"name": "taxexempt", "type": "FLOAT"},        # 19
    {"name": "foodstamp", "type": "FLOAT"},        # 20
    {"name": "wicable", "type": "FLOAT"},          # 21
    {"name": "discount", "type": "FLOAT"},         # 22
    {"name": "memDiscount", "type": "FLOAT"},      # 23
    {"name": "discountable", "type": "FLOAT"},     # 24
    {"name": "discounttype", "type": "FLOAT"},     # 25
    {"name": "voided", "type": "FLOAT"},           # 26
    {"name": "percentDiscount", "type": "FLOAT"},  # 27
    {"name": "ItemQtty", "type": "FLOAT"},         # 28
    {"name": "volDiscType", "type": "FLOAT"},      # 29
    {"name": "volume", "type": "FLOAT"},           # 30
    {"name": "VolSpecial", "type": "FLOAT"},       # 31
    {"name": "mixMatch", "type": "FLOAT"},         # 32
    {"name": "matched", "type": "FLOAT"},          # 33
    {"name": "memType", "type": "BOOLEAN"},        # 34
    {"name": "staff", "type": "BOOLEAN"},          # 35
    {"name": "numflag", "type": "FLOAT"},          # 36
    {"name": "itemstatus", "type": "FLOAT"},       # 37
    {"name": "tenderstatus", "type": "FLOAT"},     # 38
    {"name": "charflag", "type": "STRING"},        # 39
    {"name": "varflag", "type": "FLOAT"},          # 40
    {"name": "batchHeaderID", "type": "BOOLEAN"},  # 41
    {"name": "local", "type": "FLOAT"},            # 42
    {"name": "organic", "type": "FLOAT"},          # 43
    {"name": "display", "type": "BOOLEAN"},        # 44
    {"name": "receipt", "type": "FLOAT"},          # 45
    {"name": "card_no", "type": "FLOAT"},          # 46
    {"name": "store", "type": "FLOAT"},            # 47
    {"name": "branch", "type": "FLOAT"},           # 48
    {"name": "match_id", "type": "FLOAT"},         # 49
    {"name": "trans_id", "type": "FLOAT"}          # 50
]


In [22]:
# Set up BigQuery client
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

# Loop through all files in the clean output folder
for file in os.listdir(clean_output_folder):
    if file.endswith('.txt'):
        # Read the cleaned data from the .txt file
        cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')

        # Create a BigQuery table name using the file name
        table_name = file.replace('.txt', '')

        # Define the BigQuery schema
        schema = [
            {"name": col, "type": cleaned_data[col].dtype.name.lower()}
            for col in cleaned_data.columns
        ]

        # Create the BigQuery table
        table_ref = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"
        job_config = bigquery.LoadJobConfig(schema=schema, write_disposition="WRITE_TRUNCATE")

        try:
            # Upload data to BigQuery
            cleaned_data.to_gbq(destination_table=table_ref, project_id=gbq_proj_id, if_exists="replace")
            print(f"Data uploaded to BigQuery table: {table_name}")
        except Exception as e:
            print(f"Error uploading data to BigQuery table {table_name}: {e}")



100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201001_201003


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201004_201006


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201007_201009


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201010_201012


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201101_201103


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201104


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201105


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201106


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201107_201109


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201110_201112


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201201_201203


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201201_201203_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201204_201206: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201204_201206_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201207_201209


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201207_201209_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201210_201212


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201210_201212_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201301_201303


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201301_201303_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201304_201306


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201304_201306_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201307_201309


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201307_201309_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201310_201312


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201310_201312_inactive


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201401_201403: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201401_201403_inactive: Expected bytes, got a 'float' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201404_201406


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201404_201406_inactive


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201407_201409


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201407_201409_inactive


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201410_201412


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201410_201412_inactive


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201501_201503


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201504_201506


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201507_201509


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201510


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201511: Expected bytes, got a 'int' object


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201512


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201601


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201602


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201603


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201604


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201605: Expected bytes, got a 'int' object


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201606


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201607: Expected bytes, got a 'int' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201608: Expected bytes, got a 'int' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201609: Expected bytes, got a 'int' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201610: Expected bytes, got a 'int' object


100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201611


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201612: Expected bytes, got a 'int' object


  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Error uploading data to BigQuery table transArchive_201701: Expected bytes, got a 'int' object


## 3. Upload to GBQ

In [16]:
# Move through all files in the directory
for root, dirs, files in os.walk(clean_output_folder):
    for file in files:
        full_path = os.path.join(root, file)

        if file.endswith('.txt'):
            print(f"Found TXT file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"Detected delimiter: {delimiter}")

            # Reading TXT with correct handling of quoted fields
            df = pd.read_csv(full_path, delimiter=delimiter, quotechar='"', dtype=str, low_memory=False)

            table_name = file.replace('.txt', '')

            # Drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)

            # Construct the fully-qualified table_id without ".txt" extension
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"Deleted table '{table_id}'")
            except NotFound:
                print(f"Table '{table_id}' not found, skipping deletion.")

            # Explicitly convert columns to the correct data types
            # Adjust these conversions based on your actual column names and data types
            #df['column_name_1'] = pd.to_numeric(df['column_name_1'], errors='coerce')
            #df['column_name_2'] = pd.to_numeric(df['column_name_2'], errors='coerce')
            # Repeat the above line for any columns causing the conversion error

            # Clean the DataFrame
            df = clean_dataframe(df)

            # Modify the field names to comply with the gbq rules
            df.columns = [col.lower().replace(';', '') for col in df.columns]

            print(f"Uploading {table_name} to BigQuery...")
            pandas_gbq.to_gbq(df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
            del df  # Clean the DataFrame from memory


Found TXT file: transArchive_201001_201003.txt
Detected delimiter: None
Table 'leafy-sunrise-403222.wedge_data.transArchive_201001_201003' not found, skipping deletion.
Uploading transArchive_201001_201003 to BigQuery...


GenericGBQException: Reason: 400 POST https://bigquery.googleapis.com/bigquery/v2/projects/leafy-sunrise-403222/datasets/wedge_data/tables?prettyPrint=false: Invalid field name "datetime	register_no	emp_no	trans_no	upc	description	trans_type	trans_subtype	trans_status	department	quantity	scale	cost	unitprice	total	regprice	altprice	tax	taxexempt	foodstamp	wicable	discount	memdiscount	discountable	discounttype	voided	percentdiscount	itemqtty	voldisctype	volume	volspecial	mixmatch	matched	memtype	staff	numflag	itemstatus	tenderstatus	charflag	varflag	batchheaderid	local	organic	display	receipt	card_no	store	branch	match_id	trans_id". Fields must contain the allowed characters, and be at most 300 characters long. For allowed characters, please refer to https://cloud.google.com/bigquery/docs/schemas#column_names

# Hi John - cells below are for reference 

# Testing code chunk

chunk_size = 50000

def drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id):
    client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

    try:
        client.delete_table(table_id)
        print(f"deleted table '{table_id}'")
    except NotFound:
        print(f"table '{table_id}' not found, skipping deletion.")

def detect_delimiter(filename):
    with open(filename,'r') as file:
        first_line = file.readline()
        return ";" if ";" in first_line else ","
    
def clean_dataframe(df):
    float_columns = [
        'register_no', 'emp_no', 'trans_no', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice'
        , 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype'
        , 'voided', 'percentDiscount', 'ItemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'numflag'
        , 'itemstatus', 'tenderstatus', 'varflag', 'local', 'organic', 'receipt', 'card_no', 'store', 'branch', 'match_id'
        ,'trans_id'
    ]

    boolean_columns = [ 'memType', 'staff', 'batchHeaderID', 'display']

    string_columns = ['upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'charflag']

    for col in string_columns:
        if col in df.columns:
            df[col]  = df[col].astype(str)
            df[col] = df[col].str.replace('"', '', regex=False)

    for col in float_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df[float_columns] = df[float_columns].fillna(0)

    for col in boolean_columns:
        if col in df.columns:
            df[col] = df[col].astype(bool)

    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

    replace_strings = ["\\n", "\\\\", "nan", "NULL"]
    df.replace(replace_strings, "", inplace=True)

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip()

    for col in ['itemQtty', 'reciept']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace('"', '', regex=False)

    # Print the first 20 rows for inspection
    print("First 20 rows after cleaning:")
    print(df.head(20))

    df = df.applymap(lambda x: None if x == '' else x)

    return df




# hold on to this while I test another chunk - more than likley delete

chunk_size = 50000

def drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id):
    client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

    try:
        client.delete_table(table_id)
        print(f"deleted table '{table_id}'")
    except NotFound:
        print(f"table '{table_id}' not found, skipping deletion.")

def detect_delimiter(filename):
    with open(filename,'r') as file:
        first_line = file.readline()
        return ";" if ";" in first_line else ","
    
def clean_dataframe(df):
    float_columns = [
        'register_no', 'emp_no', 'trans_no', 'department', 'quantity', 'Scale', 'cost', 'unitPrice', 'total', 'regPrice'
        , 'altPrice', 'tax', 'taxexempt', 'foodstamp', 'wicable', 'discount', 'memDiscount', 'discountable', 'discounttype'
        , 'voided', 'percentDiscount', 'itemQtty', 'volDiscType', 'volume', 'VolSpecial', 'mixMatch', 'matched', 'numflag'
        , 'itemstatus', 'tenderstatus', 'varflag', 'local', 'organic', 'reciept', 'card_no', 'store', 'branch', 'match_id'
        ,'trans_id'
    ]

    boolean_columns = [ 'memType', 'staff', 'batchHeaderID', 'display']

    string_columns = ['upc', 'description', 'trans_type', 'trans_subtype', 'trans_status', 'charflag']

        # Check if columns exist before applying operations

    # List of columns to check and clean
    #columns_to_clean = ['itemQtty', 'reciept']

   # for col in columns_to_clean:
    #    if col in df.columns:
     #       if col == 'charflag':
      #          df[col] = df[col].str.strip()
       #     elif col in ['itemQtty', 'reciept']:
        #        df[col] = df[col].astype(str).str.replace('"', '', regex=False)


    for col in string_columns:
        if col in df.columns:
            df[col]  = df[col].astype(str)
            df [col] = df [col].str.replace('"', '', regex=False)

    for col in float_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    df[float_columns] = df[float_columns].fillna(0)

    for col in boolean_columns:
        if col in df.columns:
            df[col] = df[col].astype(bool)

    df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
    
    replace_strings = ["\\n", "\\\\", "nan", "NULL"]
    df.replace(replace_strings, "", inplace=True)

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip()
            #df[col] = df[col].str.replace('\\\"', '', regex=False)

    for col in df.columns: # added these three lines trying to problem solve next chuck for gbq file path
        if col == 'charflag' and col in df.columns:
            df[col] = df[col].str.strip()

    df = df.applymap(lambda x: None if x == '' else x)

#duplicate code to play with

# move through all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        
        if file.endswith('.csv'):
            print(f"Found CSV file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"detected delimiter: {delimiter}")

            # reading csv with correct handling of quoted fields
            chunk_iter = pd.read_csv(full_path, delimiter=delimiter, quotechar='"', chunksize=chunk_size, dtype=str, low_memory=False)

            table_name = file.replace('.csv', '')

            # drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    
            # Construct the fully-qualified table_id without ".csv" extension
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"deleted table '{table_id}'")
            except NotFound:
                print(f"table '{table_id}' not found, skipping deletion.")

            print(f"reading csv file in chunks: {file}...")
            for idx, chunk_df in enumerate(chunk_iter):
                # clean the DF
                chunk_df = clean_dataframe(chunk_df)

                # modify the field names to comply with the gbq rules
                chunk_df.columns = [col.lower().replace(';', '') for col in chunk_df.columns]

                print(f"uploading chunk {idx + 1} to {table_name}...")
                if idx == 0:
                    # for the first chunk create the table with the defined schema
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
                else:
                    # for subsequent chunks, append to the table 
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='append', credentials=credentials)
                del chunk_df  # clean the chunk from memory
        break
    break


# move through all files in the directory
for root, dirs, files in os.walk(output_folder):
    for file in files:
        full_path = os.path.join(root, file)
        
        if file.endswith('.csv'):
            print(f"Found CSV file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"detected delimiter: {delimiter}")

            #reading csv with correct handeling of quoted fields
            chunk_iter = pd.read_csv(full_path, delimiter=delimiter,quotechar='"', chunksize=chunk_size, dtype=str, low_memory=False)

            table_name = file.replace('data.csv', '')

            #drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)
    
                # Construct the fully-qualified table_id
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"deleted table '{table_id}'")
            except NotFound:
                print(f"table '{table_id}' not found, skipping deletion.")

            print(f"reading csv file in chucks: {file}...")
            for idx, chunk_df in enumerate(chunk_iter):
                #clean the DF
                chunk_df = clean_dataframe(chunk_df)

                #modify the field names to comply with the gbq rules
                chunk_df.columns = [col.lower().replace(';','') for col in chunk_df.columns]

                print(f"uploading chunk {idx + 1} to {table_name}...")
                if idx == 0:
                    # for the first chuck create the table with the defined schema
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
                else:
                    # for subsuquent chunks, append to the table 
                    pandas_gbq.to_gbq(chunk_df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='append', credentials=credentials)
                del chunk_df # clean the chunk from memory
        break
    break