In [1]:
import os
import pandas as pd
import numpy as np
import pandas_gbq
import sqlite3
import zipfile
import csv
import io
import glob

# Do our imports for the code
from google.cloud import bigquery
from google.oauth2 import service_account
from zipfile import ZipFile
from google.cloud.exceptions import NotFound

## 1. Extracting Zip Files

In [2]:
# Set the directory path where your ZIP files are located locally
# Use a raw string for the path
#directory_path = "/Users/biancabostrom/Documents/ADA/Wedge Project/WedgeZipOfZips_Big"
directory_path = r'C:\Users\hills\Documents\Fall2023\ADA\wedge-project\data\WedgeZipOfZips_Big'

# Working folders, relative to the notebook's current directory.
# Built with os.path.join so the separator matches the host OS: a hard-coded
# backslash yields a single folder literally named "data\extracted_zips_big"
# on macOS/Linux. On Windows the resulting strings are unchanged.
output_folder = os.path.join('data', 'extracted_zips_big')
clean_output_folder = os.path.join('data', 'clean_csvs')

## Extract CSVs from zips

In [None]:
# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Unpack every ZIP archive that sits directly inside directory_path into
# output_folder. Archives that cannot be read are recorded rather than
# silently forgotten, so the closing message reflects what actually happened.
failed_archives = []

for filename in sorted(os.listdir(directory_path)):
    # Compare case-insensitively so archives named *.ZIP are not skipped.
    if not filename.lower().endswith('.zip'):
        continue

    # Construct the full file path
    file_path = os.path.join(directory_path, filename)

    # Print the file path for debugging
    print(f"Attempting to extract: {file_path}")

    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(output_folder)
        print(f"Extracted {filename} to {output_folder}")
    except Exception as e:
        # Best effort: one bad archive must not stop the remaining ones.
        failed_archives.append(filename)
        print(f"Error extracting {filename}: {e}")

if failed_archives:
    print(f"Extraction finished with {len(failed_archives)} failed archive(s): {failed_archives}")
else:
    print("All files extracted.")

## Turn messy CSV to clean CSV

## 2. Cleaning files: headers, delimiters, nulls and quotes

In [3]:
# Make sure the folder that receives the cleaned, tab-delimited files exists;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(clean_output_folder, exist_ok=True)

In [None]:
# Column names, in file order, applied to extracted files whose first line is
# data rather than a header row. Written as whitespace-separated text so the
# 50 names are easy to scan; split() turns the text back into a list.
correct_headers = """
    datetime register_no emp_no trans_no upc description trans_type trans_subtype
    trans_status department quantity Scale cost unitPrice total regPrice altPrice
    tax taxexempt foodstamp wicable discount memDiscount discountable discounttype
    voided percentDiscount ItemQtty volDiscType volume VolSpecial mixMatch matched
    memType staff numflag itemstatus tenderstatus charflag varflag batchHeaderID
    local organic display receipt card_no store branch match_id trans_id
""".split()
# Loop through every extracted CSV and write a normalised, tab-delimited copy
# (same base name, .txt extension) into clean_output_folder:
#   * files without a header row get the names in correct_headers,
#   * comma- and semicolon-delimited files are both accepted,
#   * the null markers "NULL" and "\N" are blanked out BEFORE the file is
#     written, so the cleaned copy never contains them.
null_markers = ["NULL", "\\N"]

for file in os.listdir(output_folder):
    if not file.endswith('.csv'):
        continue

    source_path = os.path.join(output_folder, file)

    # Peek at the first line to learn the delimiter and whether a header exists.
    with open(source_path, 'r') as f:
        first_line = f.readline().strip()

    print(first_line)

    # Use whichever candidate delimiter occurs most often on the first line.
    # A plain "is there a comma?" test misreads a semicolon-delimited file
    # whose first row has a comma inside a product description. Ties go to
    # the comma, matching the previous precedence.
    comma_count = first_line.count(",")
    semicolon_count = first_line.count(";")
    if comma_count == 0 and semicolon_count == 0:
        # Nothing to parse: skip the file instead of re-writing the previous
        # file's data (or failing on an undefined frame).
        print(f"Neither , or ; in {file}")
        continue
    delimiter = "," if comma_count >= semicolon_count else ";"

    if "datetime" in first_line:
        # The file carries its own header row.
        messy_data = pd.read_csv(source_path, sep=delimiter)
    else:
        # No header row: supply the column names ourselves.
        messy_data = pd.read_csv(source_path, sep=delimiter, header=None, names=correct_headers)

    # Blank out null markers before writing, otherwise they survive in the output.
    messy_data = messy_data.replace(null_markers, "")

    # Swap only the extension; str.replace("csv", "txt") would also rewrite
    # any "csv" appearing elsewhere in the file name.
    clean_name = os.path.splitext(file)[0] + ".txt"
    messy_data.to_csv(os.path.join(clean_output_folder, clean_name), sep="\t", index=False)

In [4]:
# Bianca's info
#service_path = "/Users/biancabostrom/Documents/ADA/Wedge\ Project/wedge-404400-cb3a632effa5.json"
#service_file = 'wedge-404400-cb3a632effa5.json' 
#gbq_proj_id = "wedge-404400" 
#gbq_dataset_id = "wedge_data"
#credentials = service_account.Credentials.from_service_account_file("/Users/biancabostrom/Documents/ADA/Wedge Project/wedge-404400-cb3a632effa5.json")

# Spencer's info
# service_path is the FULL path to the service-account key file (it already
# ends in the file name); service_file is just that file name.
service_path = r"C:\Users\hills\Documents\Fall2023\ADA\wedge-project\leafy-sunrise-403222-f51fcd80b921.json"
service_file = 'leafy-sunrise-403222-f51fcd80b921.json' # change this to your authentication information  
gbq_proj_id = "leafy-sunrise-403222" # change this to your project. 
gbq_dataset_id = "wedge_data"

# Path of the key file actually used for authentication. Because service_path
# already includes the file name, appending service_file to it (as before)
# produced a path ending in "...json" + "...json" that does not exist.
private_key = service_path

# Service-account credentials used by the BigQuery client and uploads.
credentials = service_account.Credentials.from_service_account_file(private_key)


In [5]:
# John - should I use something like this to replace some of the schema definitions?
                #df = df.clean_names()
                #df['datetime'] = pd.to_datetime(df.datetime, format='%Y-%m-%d %H:%M:%S')
                #df['department'] = df['department'].astype("str")
                #df.department = df.department.fillna('')


# (column name, BigQuery type) pairs in file column order, ten per group.
# Most columns are FLOAT; the exceptions are the TIMESTAMP `datetime`, the
# STRING text columns, and the four BOOLEAN flags (memType, staff,
# batchHeaderID, display).
_SCHEMA_FIELDS = [
    # columns 1-10
    ("datetime", "TIMESTAMP"),
    ("register_no", "FLOAT"),
    ("emp_no", "FLOAT"),
    ("trans_no", "FLOAT"),
    ("upc", "STRING"),
    ("description", "STRING"),
    ("trans_type", "STRING"),
    ("trans_subtype", "STRING"),
    ("trans_status", "STRING"),
    ("department", "FLOAT"),
    # columns 11-20
    ("quantity", "FLOAT"),
    ("Scale", "FLOAT"),
    ("cost", "FLOAT"),
    ("unitPrice", "FLOAT"),
    ("total", "FLOAT"),
    ("regPrice", "FLOAT"),
    ("altPrice", "FLOAT"),
    ("tax", "FLOAT"),
    ("taxexempt", "FLOAT"),
    ("foodstamp", "FLOAT"),
    # columns 21-30
    ("wicable", "FLOAT"),
    ("discount", "FLOAT"),
    ("memDiscount", "FLOAT"),
    ("discountable", "FLOAT"),
    ("discounttype", "FLOAT"),
    ("voided", "FLOAT"),
    ("percentDiscount", "FLOAT"),
    ("ItemQtty", "FLOAT"),
    ("volDiscType", "FLOAT"),
    ("volume", "FLOAT"),
    # columns 31-40
    ("VolSpecial", "FLOAT"),
    ("mixMatch", "FLOAT"),
    ("matched", "FLOAT"),
    ("memType", "BOOLEAN"),
    ("staff", "BOOLEAN"),
    ("numflag", "FLOAT"),
    ("itemstatus", "FLOAT"),
    ("tenderstatus", "FLOAT"),
    ("charflag", "STRING"),
    ("varflag", "FLOAT"),
    # columns 41-50
    ("batchHeaderID", "BOOLEAN"),
    ("local", "FLOAT"),
    ("organic", "FLOAT"),
    ("display", "BOOLEAN"),
    ("receipt", "FLOAT"),
    ("card_no", "FLOAT"),
    ("store", "FLOAT"),
    ("branch", "FLOAT"),
    ("match_id", "FLOAT"),
    ("trans_id", "FLOAT"),
]

# Table schema as a list of {"name": ..., "type": ...} dicts, one per column.
schema = [dict(name=column, type=bq_type) for column, bq_type in _SCHEMA_FIELDS]


# Report the null count per column for every cleaned file, then show a
# structural summary of the last file read.
#
# The summary used to sit ABOVE this loop and read `cleaned_data` before any
# cell had defined it, so it only worked with leftover kernel state. It now
# runs after the loop and is skipped when no cleaned file was found.
files_checked = 0

for file in os.listdir(clean_output_folder):
    if file.endswith('.txt'):
        # Read the cleaned data from the .txt file
        cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')
        files_checked += 1

        # Count nulls per column and report them for this file
        null_values = cleaned_data.isnull().sum()
        print(f"Null values in {file}:\n{null_values}")

if files_checked:
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; wrapping it in print() adds a stray "None" line.
    cleaned_data.info()
    print(cleaned_data.head())
else:
    print(f"No cleaned .txt files found in {clean_output_folder}")

## 3. Upload to GBQ

In [16]:
# Set up BigQuery client
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

# Upload every cleaned, tab-delimited file to its own BigQuery table, named
# after the file. Each upload replaces the table if it already exists.
# A failed upload is reported and the loop moves on to the next file.
#
# NOTE: the hand-written `schema` list is NOT applied here; column types are
# inferred from the DataFrame dtypes. The LoadJobConfig that used to be built
# in this loop was never passed to the upload, so it has been removed. To
# enforce the schema, pass table_schema=schema below once the data really
# matches those types (e.g. the BOOLEAN columns).
for file in os.listdir(clean_output_folder):
    if not file.endswith('.txt'):
        continue

    # Read the cleaned data from the .txt file
    cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t', low_memory=False)

    # Force 'local' to FLOAT. errors='coerce' turns anything non-numeric
    # (including a leftover "\N" marker) into NaN, so no separate replace step
    # is needed; astype(float) covers the case of an all-integer column.
    cleaned_data['local'] = pd.to_numeric(cleaned_data['local'], errors='coerce').astype(float)

    # Table name = file name without its extension
    table_name = os.path.splitext(file)[0]
    table_ref = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

    try:
        # Upload with the service-account credentials loaded earlier; without
        # them pandas-gbq falls back to whatever default/user auth it finds.
        pandas_gbq.to_gbq(
            cleaned_data,
            destination_table=table_ref,
            project_id=gbq_proj_id,
            if_exists="replace",
            credentials=credentials,
        )
        print(f"Data uploaded to BigQuery table: {table_name}")
    except Exception as e:
        print(f"Error uploading data to BigQuery table {table_name}: {e}")

100%|██████████| 1/1 [00:00<?, ?it/s]


Data uploaded to BigQuery table: transArchive_201001_201003


# Scratch inspection of the frame left in `cleaned_data` by an earlier cell
# (these lines do not load any data themselves).

# Cast the two columns to Python strings. Note that astype(str) also turns
# missing values into the literal text 'nan'.
cleaned_data['charflag'] = cleaned_data['charflag'].astype(str)
cleaned_data['memType'] = cleaned_data['memType'].astype(str)

# NOTE(review): this only evaluates the literal "memType"; it does not touch
# the column. Presumably a leftover from experimenting - confirm and remove.
str("memType")

# Show the resulting dtype and the distinct values of memType.
print(cleaned_data["memType"].dtype)
print(cleaned_data["memType"].unique())


# Look up the column at zero-based position 32 (the 33rd column).
# NOTE(review): if the frame follows the order of correct_headers, position 32
# is "matched" and "memType" is position 33 - confirm which one was intended.
problematic_column_name = cleaned_data.columns[32]
print(f"Problematic column name: {problematic_column_name}")


# (rows, columns) of the frame
cleaned_data.shape

# Set up BigQuery client
client = bigquery.Client(project=gbq_proj_id, credentials=credentials)

# Debug upload of a single cleaned file with explicit dtype fixes applied
# first. Swap the list for os.listdir(clean_output_folder) to process all files.
#
# Two experimental schema definitions that used to live in this loop were
# removed:
#   * one was mis-indented (IndentationError) and referenced an undefined
#     `datetime_columns`;
#   * both used pandas dtype names ("float64", "object"), which are not
#     BigQuery types;
#   * neither was ever passed to the upload, yet each overwrote the
#     notebook-level `schema`.
# Column types are inferred from the DataFrame dtypes set below. To enforce
# explicit types instead, pass table_schema=<list of name/type dicts> to the
# upload call.
for file in ['transArchive_201204_201206.txt']:
    if not file.endswith('.txt'):
        continue

    # Read the cleaned data from the .txt file
    cleaned_data = pd.read_csv(os.path.join(clean_output_folder, file), sep='\t')

    # Handle mixed data types in 'charflag': make every present value a string
    # but keep missing values missing. A plain astype(str) would upload the
    # literal text "nan" instead of NULL.
    charflag = cleaned_data['charflag']
    cleaned_data['charflag'] = charflag.where(charflag.isna(), charflag.astype(str))

    # Set data types in columns explicitly where inference is unreliable.
    cleaned_data = cleaned_data.astype({
        "datetime": "datetime64[ns]",
        "register_no": "float64",
        # ... Specify data types for other columns as needed
    })

    # Table name = file name without its extension
    table_name = os.path.splitext(file)[0]
    table_ref = f"{gbq_proj_id}.{gbq_dataset_id}.{table_name}"

    try:
        # Upload with the service-account credentials loaded earlier,
        # replacing the table if it already exists.
        pandas_gbq.to_gbq(
            cleaned_data,
            destination_table=table_ref,
            project_id=gbq_proj_id,
            if_exists="replace",
            credentials=credentials,
        )
        print(f"Data uploaded to BigQuery table: {table_name}")
    except Exception as e:
        print(f"Error uploading data to BigQuery table {table_name}: {e}")