In [1]:
import os
import pandas as pd
import numpy as np
import pandas_gbq
import sqlite3
import zipfile
import csv
import io
import glob

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
from zipfile import ZipFile
from google.cloud.exceptions import NotFound

## 1. Extracting Zip Files

In [2]:
# Set the directory path where your ZIP files are located locally
# Use a raw string for the path
#directory_path = "/Users/biancabostrom/Documents/ADA/Wedge Project/WedgeZipOfZips_Big"
directory_path = r'C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big'
output_folder = 'data\\extracted_zips_big'
clean_output_folder = 'data\\clean_csvs'

## Extract CSV's from zips

In [3]:
# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate over all the files in the directory
for idx, filename in enumerate(os.listdir(directory_path)) : # JC: I added the enumerate so I can test on small samples.
    if filename.endswith('.zip'):
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)

        # Print the file path for debugging
        print(f"Attempting to extract: {file_path}")

        try:
            # Open the ZIP file
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                # Extract all the contents into the directory
                zip_ref.extractall(output_folder) # JC: so I changed this
                print(f"Extracted {filename} to {output_folder}")
        except Exception as e:
            print(f"Error extracting {filename}: {e}")

        #if idx > 10:
            #break
print("All files extracted.")

Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201001_201003.zip
Extracted transArchive_201001_201003.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201004_201006.zip
Extracted transArchive_201004_201006.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201007_201009.zip
Extracted transArchive_201007_201009.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201010_201012.zip
Extracted transArchive_201010_201012.zip to data\extracted_zips_big
Attempting to extract: C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big\transArchive_201101_201103.zip
Extracted transArchive_201101_201103.zip to data\extracted_zips_big
Attempting

## Turn messy CSV to clean CSV

## 2. Cleaning files: headers, delimeters, nulls and quotes 

In [4]:
os.makedirs(clean_output_folder, exist_ok=True)

In [5]:
correct_headers = [
    "datetime", "register_no", "emp_no", "trans_no", "upc", "description", "trans_type", "trans_subtype",
    "trans_status", "department", "quantity", "Scale", "cost", "unitPrice", "total", "regPrice", "altPrice",
    "tax", "taxexempt", "foodstamp", "wicable", "discount", "memDiscount", "discountable", "discounttype",
    "voided", "percentDiscount", "ItemQtty", "volDiscType", "volume", "VolSpecial", "mixMatch", "matched",
    "memType", "staff", "numflag", "itemstatus", "tenderstatus", "charflag", "varflag", "batchHeaderID", 
    "local", "organic", "display", "receipt", "card_no", "store", "branch", "match_id", "trans_id"
]
# loop though all files in the directory
# JC: you did this in a bizarre way. Compare to this.

extracted_files = os.listdir(output_folder)

for file in extracted_files : 
    # Now we pick up with yours. 
    if file.endswith('.csv'):
        with open(output_folder + "/" + file,'r') as f:
            first_line = f.readline().strip()

        print(first_line)

        # Handle different delimiters
        if "datetime" in first_line:
            # Check for comma as delimiter
            if "," in first_line:
                
                messy_data = pd.read_csv(output_folder + "/" + file, sep=",")
        
            elif ";" in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")
            else:
                print(f"Neither , or ; in {file}")
        else:
            # Add headers to the file and then read it
            if "," in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)
        
            elif ";" in first_line:
                messy_data = pd.read_csv(output_folder + "/" + file, sep= ";", header=None, names=correct_headers)
            else:
                print(f"Neither , or ; in {file}")

        messy_data.to_csv(clean_output_folder + "/" + file.replace("csv", "txt"), sep="\t", index = False)

        ### Work on null NULL \\N - replace with "" (this empty string)
        messy_data.replace(["NULL", "\\N"], "", inplace=True)  ## Chat said to put this above the above line.
        ### some files are , delimited and some product desc have , in them. PD might handle. 

        #if '"' in first_line : 
            #with open(output_folder + "/" + file,'r') as f:
                #content = f.read()
                #print(content[:1000])
                #break

"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"
datetime,register_no,emp_no,trans_no,upc,description,trans_type,trans_subtype,trans_status,department,quantity,Scale,cost,unitPrice,total,regPrice,altPrice,tax,taxexempt,foodstamp,wicable,discount,memDiscount,discountable,discounttype,voided,percentDiscount,ItemQtty,volDiscType,volume,VolSpecial,mixMatch,matched,memType,staff,numflag,itemstatus,tenderstatus,charflag,varflag,batchHeaderID,local,organic,display,receipt,card_no,store,branc

  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep= ";")


"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","charflag","varflag","batchHeaderID","local","organic","display","receipt","card_no","store","branch","match_id","trans_id"


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",")


"datetime";"register_no";"emp_no";"trans_no";"upc";"description";"trans_type";"trans_subtype";"trans_status";"department";"quantity";"Scale";"cost";"unitPrice";"total";"regPrice";"altPrice";"tax";"taxexempt";"foodstamp";"wicable";"discount";"memDiscount";"discountable";"discounttype";"voided";"percentDiscount";"ItemQtty";"volDiscType";"volume";"VolSpecial";"mixMatch";"matched";"memType";"staff";"numflag";"itemstatus";"tenderstatus";"charflag";"varflag";"batchHeaderID";"local";"organic";"display";"receipt";"card_no";"store";"branch";"match_id";"trans_id"
"datetime","register_no","emp_no","trans_no","upc","description","trans_type","trans_subtype","trans_status","department","quantity","Scale","cost","unitPrice","total","regPrice","altPrice","tax","taxexempt","foodstamp","wicable","discount","memDiscount","discountable","discounttype","voided","percentDiscount","ItemQtty","volDiscType","volume","VolSpecial","mixMatch","matched","memType","staff","numflag","itemstatus","tenderstatus","cha

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2015-12-01 07:03:06,51,94,2,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.0000,0.0000,,0,0,0,,0.0000,0.0000,0,0,0,,0,0,0,0.0000,0,0,0,,0,0,0,,,,0,,,0,3,1,3,0,7
2016-01-01 09:12:14,51,94,3,0000000000039,Wedge Scone,I, , ,8,3,0,0.5160,2.4900,7.4700,2.4900,\N,0,0,1,\N,0.0000,0.0000,1,0,0,\N,3,0,0,0.0000,0,0,0,\N,5,0,0,,\N,\N,0,0,,0,3,1,3,0,10
2016-02-01 07:16:56,51,94,3,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.2700,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,3,1,3,0,10
2016-03-01 07:04:38,51,94,2,TAX,Tax,A,,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,3,1,3,0,10
2016-04-01 07:34:35,51,94,18,0,Cash,T,CA,,0,0,0,0.0000,0.0000,-5.0000,0.0000,\N,0,0,0,\N,0.0000,0.0000,0,0,0,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,49019,1,3,0,9
2016-05-01 11:23:35,51,94,113,0000000004365,BBOWL SuperSoba Chicken/Seitan,I, , ,8,1,0,0.0000,10.0000,10.0000,10.0000,\N,1,0,0,0,0.0000,0.0000,7,0,0,0.00000000,1,0,0,0.0000,0,

  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-06-01 08:04:44,51,94,41,0000000000151,Banana Organic,I, , ,2,1.45,1,0.8900,1.1900,1.7300,1.1900,\N,0,0,1,1,0.0000,0.0000,1,0,0,10.00000000,1.45,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,1,,0,12367,1,3,0,2
2016-07-01 07:06:15,51,94,1,0065722700050,Electrolyte Water 1.5L Essenti,I, , ,1,3,0,1.5800,2.6900,8.0700,2.6900,\N,0,0,1,0,0.0000,0.0000,1,0,0,0.00000000,3,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,0,,0,3,1,3,0,1


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-08-01 07:34:16,51,94,7,0000000000151,Banana Organic,I, , ,2,0.52,1,0.8900,1.1900,0.6200,1.1900,\N,0,0,1,1,0.0000,0.0000,1,0,0,0.00000000,0.52,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,1,,0,21998,1,3,0,2


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-09-01 07:13:09,51,94,6,0,Change,T,CA,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,8,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,20074,1,3,0,7


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-10-01 07:04:40,51,94,1,DISCOUNT,Discount,I,,,0,1,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,0,\N,1,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,49355,1,3,0,10


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2016-11-01 07:18:44,51,94,11,0000000001014,Green Patch Redemption,I, , ,1,1,0,0.0000,-0.1000,-0.1000,-0.1000,\N,0,0,0,0,0.0000,0.0000,0,0,0,10.00000000,1,0,0,0.0000,0,0,0,\N,4,0,0,,\N,\N,0,-1,,0,16646,1,3,0,13
2016-12-01 07:43:01,51,94,23,0000000000049,Wedge Muffin,I, , ,8,1,0,0.6350,2.4900,2.4900,2.4900,\N,0,0,1,0,0.0000,0.0000,7,0,0,10.00000000,1,0,0,0.0000,0,0,0,\N,5,0,0,,\N,\N,0,0,,0,13863,1,3,0,2


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


2017-01-01 09:00:31,51,94,12,0,Change,T,CA,,0,0,0,0.0000,0.0000,0.0000,0.0000,\N,0,0,0,0,0.0000,0.0000,0,0,8,\N,0,0,0,0.0000,0,0,0,\N,0,0,0,,\N,\N,0,\N,,0,24528,1,3,0,12


  messy_data = pd.read_csv(output_folder + "/" + file, sep=",", header=None, names=correct_headers)


In [8]:
# Bianca's info
#service_path = "/Users/biancabostrom/Documents/ADA/Wedge\ Project/wedge-404400-cb3a632effa5.json"
#service_file = 'wedge-404400-cb3a632effa5.json' 
#gbq_proj_id = "wedge-404400" 
#gbq_dataset_id = "wedge_data"
#credentials = service_account.Credentials.from_service_account_file("/Users/biancabostrom/Documents/ADA/Wedge Project/wedge-404400-cb3a632effa5.json")

# Spencer's info
service_path = r"C:\Users\hills\Documents\Fall2023\ADA\wedge-project\leafy-sunrise-403222-f51fcd80b921.json"
service_file = 'leafy-sunrise-403222-f51fcd80b921.json' # change this to your authentication information  
gbq_proj_id = "leafy-sunrise-403222" # change this to your project. 
gbq_dataset_id = "wedge_data"
credentials = service_account.Credentials.from_service_account_file(service_path)

private_key = service_path + service_file


In [9]:
schema = [
    {"name": "datetime", "type": "TIMESTAMP"},     # 1
    {"name": "register_no", "type": "FLOAT"},      # 2
    {"name": "emp_no", "type": "FLOAT"},           # 3
    {"name": "trans_no", "type": "FLOAT"},         # 4
    {"name": "upc", "type": "STRING"},             # 5
    {"name": "description", "type": "STRING"},     # 6
    {"name": "trans_type", "type": "STRING"},      # 7
    {"name": "trans_subtype", "type": "STRING"},   # 8
    {"name": "trans_status", "type": "STRING"},    # 9
    {"name": "department", "type": "FLOAT"},       # 10
    {"name": "quantity", "type": "FLOAT"},         # 11
    {"name": "Scale", "type": "FLOAT"},            # 12
    {"name": "cost", "type": "FLOAT"},             # 13
    {"name": "unitPrice", "type": "FLOAT"},        # 14
    {"name": "total", "type": "FLOAT"},            # 15
    {"name": "regPrice", "type": "FLOAT"},         # 16
    {"name": "altPrice", "type": "FLOAT"},         # 17
    {"name": "tax", "type": "FLOAT"},              # 18
    {"name": "taxexempt", "type": "FLOAT"},        # 19
    {"name": "foodstamp", "type": "FLOAT"},        # 20
    {"name": "wicable", "type": "FLOAT"},          # 21
    {"name": "discount", "type": "FLOAT"},         # 22
    {"name": "memDiscount", "type": "FLOAT"},      # 23
    {"name": "discountable", "type": "FLOAT"},     # 24
    {"name": "discounttype", "type": "FLOAT"},     # 25
    {"name": "voided", "type": "FLOAT"},           # 26
    {"name": "percentDiscount", "type": "FLOAT"},  # 27
    {"name": "ItemQtty", "type": "FLOAT"},         # 28
    {"name": "volDiscType", "type": "FLOAT"},      # 29
    {"name": "volume", "type": "FLOAT"},           # 30
    {"name": "VolSpecial", "type": "FLOAT"},       # 31
    {"name": "mixMatch", "type": "FLOAT"},         # 32
    {"name": "matched", "type": "FLOAT"},          # 33
    {"name": "memType", "type": "BOOLEAN"},        # 34
    {"name": "staff", "type": "BOOLEAN"},          # 35
    {"name": "numflag", "type": "FLOAT"},          # 36
    {"name": "itemstatus", "type": "FLOAT"},       # 37
    {"name": "tenderstatus", "type": "FLOAT"},     # 38
    {"name": "charflag", "type": "STRING"},        # 39
    {"name": "varflag", "type": "FLOAT"},          # 40
    {"name": "batchHeaderID", "type": "BOOLEAN"},  # 41
    {"name": "local", "type": "FLOAT"},            # 42
    {"name": "organic", "type": "FLOAT"},          # 43
    {"name": "display", "type": "BOOLEAN"},        # 44
    {"name": "receipt", "type": "FLOAT"},          # 45
    {"name": "card_no", "type": "FLOAT"},          # 46
    {"name": "store", "type": "FLOAT"},            # 47
    {"name": "branch", "type": "FLOAT"},           # 48
    {"name": "match_id", "type": "FLOAT"},         # 49
    {"name": "trans_id", "type": "FLOAT"}          # 50
]


In [13]:
print(cleaned_data.info())
print(cleaned_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2998330 entries, 0 to 2998329
Data columns (total 50 columns):
 #   Column           Dtype  
---  ------           -----  
 0   datetime         object 
 1   register_no      int64  
 2   emp_no           int64  
 3   trans_no         int64  
 4   upc              object 
 5   description      object 
 6   trans_type       object 
 7   trans_subtype    object 
 8   trans_status     object 
 9   department       int64  
 10  quantity         float64
 11  Scale            int64  
 12  cost             float64
 13  unitPrice        float64
 14  total            float64
 15  regPrice         float64
 16  altPrice         float64
 17  tax              int64  
 18  taxexempt        int64  
 19  foodstamp        int64  
 20  wicable          int64  
 21  discount         float64
 22  memDiscount      float64
 23  discountable     int64  
 24  discounttype     int64  
 25  voided           int64  
 26  percentDiscount  float64
 27  ItemQtty    

In [14]:
print(cleaned_data.isnull().sum())
#cleaned_data = cleaned_data.fillna("")  # Replace NaN with an appropriate value or an empty string


datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       389327
trans_status        673703
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt                0
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1094857
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2998330
staff                    0
numflag                  0
itemstatus               0
t

In [17]:
# Loop through all files in the clean output folder
for file in os.listdir(clean_output_folder):
    if file.endswith('.txt'):
        # Read the cleaned data from the .txt file
        cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')

        # Check for null values in the DataFrame
        null_values = cleaned_data.isnull().sum()

        # Print information about null values
        print(f"Null values in {file}:\n{null_values}")

Null values in transArchive_201001_201003.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       389327
trans_status        673703
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt                0
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1094857
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2998330
staff                    0
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201010_201012.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype            0
trans_status             0
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             6536
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount     989208
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2957586
staff              2092767
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201201_201203_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt            1520
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount     73888
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                25
memType            245772
staff              245747
numflag                 0
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201204_201206.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype            0
trans_status             0
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             9035
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1056601
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            3083257
staff              3083324
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201204_201206_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt            1273
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount     74456
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                26
memType            237990
staff              237964
numflag                 0
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201207_201209.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype            6
trans_status             5
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8264
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1025624
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2925397
staff              2925397
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201207_201209_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt             699
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  2
percentDiscount     62524
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                17
memType            190858
staff              190841
numflag                 0
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201210_201212.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype           11
trans_status            11
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             7751
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1004997
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2893357
staff              2893353
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201210_201212_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt            1108
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  2
percentDiscount     50615
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                25
memType            162963
staff              162938
numflag                 0
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201301_201303.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype           12
trans_status            12
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             7936
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount     978586
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2902334
staff              2902334
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201301_201303_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt            1015
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  3
percentDiscount     45072
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                19
memType            148613
staff              148594
numflag                 0
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201304_201306.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype            0
trans_status             0
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8767
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1055287
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            3022733
staff              3022733
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201304_201306_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     2
taxexempt            1182
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  1
percentDiscount     43094
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                14
memType            137609
staff              137595
numflag                 2
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201307_201309.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype          606
trans_status           606
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8943
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1110254
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2995212
staff              2995212
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201307_201309_inactive.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype           0
trans_status            0
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     1
taxexempt             655
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount     36264
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                12
memType            104468
staff              104456
numflag                 1
itemstat

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201310_201312.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype         9861
trans_status         17610
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8725
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1027350
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2919084
staff              2918944
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201401_201403.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       408476
trans_status        761447
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8334
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1003032
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            2915435
staff              2911809
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201401_201403_inactive.txt:
datetime               0
register_no            0
emp_no                 0
trans_no               0
upc                    0
description            0
trans_type             0
trans_subtype       7146
trans_status       13072
department             0
quantity               0
Scale                  0
cost                   0
unitPrice              0
total                  0
regPrice               0
altPrice               0
tax                    0
taxexempt            617
foodstamp              0
wicable                0
discount               0
memDiscount            0
discountable           0
discounttype           0
voided                 0
percentDiscount    16815
ItemQtty               0
volDiscType            0
volume                 0
VolSpecial             0
mixMatch               0
matched                4
memType            52614
staff              52604
numflag                0
itemstatus           617
tenderstatus       

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201404_201406.txt:
datetime                 0
register_no              0
emp_no                   0
trans_no                 0
upc                      0
description              0
trans_type               0
trans_subtype       508431
trans_status        957778
department               0
quantity                 0
Scale                    0
cost                     0
unitPrice                0
total                    0
regPrice                 0
altPrice                 0
tax                      0
taxexempt             8980
foodstamp                0
wicable                  0
discount                 0
memDiscount              0
discountable             0
discounttype             0
voided                   0
percentDiscount    1162829
ItemQtty                 0
volDiscType              0
volume                   0
VolSpecial               0
mixMatch                 0
matched                  0
memType            3154260
staff              3151813
numflag 

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201511.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      165493
trans_status       323577
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201605.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      166520
trans_status       322132
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201607.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      156503
trans_status       307368
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201608.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      157265
trans_status       307431
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201609.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      156581
trans_status       305109
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                10
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201610.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      162006
trans_status       316679
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201612.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      150088
trans_status       301049
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

  cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


Null values in transArchive_201701.txt:
datetime                0
register_no             0
emp_no                  0
trans_no                0
upc                     0
description             0
trans_type              0
trans_subtype      156928
trans_status       307339
department              0
quantity                0
Scale                   0
cost                    0
unitPrice               0
total                   0
regPrice                0
altPrice                0
tax                     0
taxexempt               0
foodstamp               0
wicable                 0
discount                0
memDiscount             0
discountable            0
discounttype            0
voided                  0
percentDiscount         0
ItemQtty                0
volDiscType             0
volume                  0
VolSpecial              0
mixMatch                0
matched                 0
memType                 0
staff                   0
numflag                 0
itemstatus              

In [15]:
cleaned_data['charflag'] = cleaned_data['charflag'].astype(str)


In [16]:
# Set up BigQuery client
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

# Loop through all files in the clean output folder
for file in os.listdir(clean_output_folder):
    if file.endswith('.txt'):
        # Read the cleaned data from the .txt file
        cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')


        ### TRYING this to set data type in coluns, below


                # Handle mixed data types in 'charflag' column
        cleaned_data['charflag'] = cleaned_data['charflag'].astype(str)

        # Check and handle data types as needed
        cleaned_data = cleaned_data.astype({
            "datetime": "datetime64",
            "register_no": "float64",
            # ... Specify data types for other columns as needed
        })

        ### TRYING this to set data type in coluns^


        # Create a BigQuery table name using the file name
        table_name = file.replace('.txt', '')

        # Define the BigQuery schema
        schema = [
            {"name": col, "type": cleaned_data[col].dtype.name.lower()}
            for col in cleaned_data.columns
        ]

        # Create the BigQuery table
        table_ref = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"
        job_config = bigquery.LoadJobConfig(schema=schema, write_disposition="WRITE_TRUNCATE")

        try:
            # Upload data to BigQuery
            cleaned_data.to_gbq(destination_table=table_ref, project_id=gbq_proj_id, if_exists="replace")
            print(f"Data uploaded to BigQuery table: {table_name}")
        except Exception as e:
            print(f"Error uploading data to BigQuery table {table_name}: {e}")

TypeError: Casting to unit-less dtype 'datetime64' is not supported. Pass e.g. 'datetime64[ns]' instead.

## 3. Upload to GBQ

# Move through all files in the directory
for root, dirs, files in os.walk(clean_output_folder):
    for file in files:
        full_path = os.path.join(root, file)

        if file.endswith('.txt'):
            print(f"Found TXT file: {file}")

            delimiter = detect_delimiter(full_path)
            print(f"Detected delimiter: {delimiter}")

            # Reading TXT with correct handling of quoted fields
            df = pd.read_csv(full_path, delimiter=delimiter, quotechar='"', dtype=str, low_memory=False)

            table_name = file.replace('.txt', '')

            # Drop the table if it exists
            drop_table_if_exists(gbq_dataset_id, table_name, credentials, gbq_proj_id)
            client = bigquery.Client(credentials=credentials, project=gbq_proj_id)

            # Construct the fully-qualified table_id without ".txt" extension
            table_id = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

            try:
                client.delete_table(table_id)
                print(f"Deleted table '{table_id}'")
            except NotFound:
                print(f"Table '{table_id}' not found, skipping deletion.")

            # Explicitly convert columns to the correct data types
            # Adjust these conversions based on your actual column names and data types
            #df['column_name_1'] = pd.to_numeric(df['column_name_1'], errors='coerce')
            #df['column_name_2'] = pd.to_numeric(df['column_name_2'], errors='coerce')
            # Repeat the above line for any columns causing the conversion error

            # Clean the DataFrame
            df = clean_dataframe(df)

            # Modify the field names to comply with the gbq rules
            df.columns = [col.lower().replace(';', '') for col in df.columns]

            print(f"Uploading {table_name} to BigQuery...")
            pandas_gbq.to_gbq(df, f"{gbq_dataset_id}.{table_name}", project_id=gbq_proj_id, if_exists='replace', credentials=credentials, table_schema=schema)
            del df  # Clean the DataFrame from memory


# Hi John - cells below are for reference 