In [2]:
import pandas as pd
from database_utils import DatabaseConnector
import re 
from sqlalchemy import create_engine, text, insert 
from sqlalchemy.inspection import inspect
from sqlalchemy.exc import SQLAlchemyError

In [3]:
# START HERE FROM 22nd JULY



In [35]:
# CURRENT CODE  


# HELPER FUNCTIONS 

# Create function to fetch data from the table in my local SQL server so that I can get the data to perform checks on it 
def fetch_data(connection, table_name, limit=5):
    """Return the first ``limit`` rows of ``table_name``.

    Args:
        connection: open SQLAlchemy connection used to run the query.
        table_name: name of the table to read (interpolated into the SQL as-is).
        limit: maximum number of rows to request (default 5).

    Returns:
        Whatever ``fetchall()`` yields for the executed SELECT.
    """
    statement = text(f"SELECT * FROM {table_name} LIMIT {limit};")
    return connection.execute(statement).fetchall()

# Function to check the column data type to check if the column type has been correctly converted 
def check_column_type(connection, table_name, column_name):
    """Look up the declared data type of one column in ``information_schema``.

    The table and column names are compared as *values* here (not used as
    identifiers), so they are sent as bound parameters instead of being pasted
    into the SQL text. This keeps names containing quotes from breaking the
    statement.

    Args:
        connection: open SQLAlchemy connection used to run the query.
        table_name: table whose column should be inspected.
        column_name: column to inspect.

    Returns:
        The first ``(column_name, data_type)`` row, or whatever ``fetchone()``
        returns when nothing matches.
    """
    check_column_type_query = text("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = :table_name AND column_name = :column_name;
    """)
    result = connection.execute(
        check_column_type_query,
        {"table_name": table_name, "column_name": column_name},
    )
    return result.fetchone()

# Function to determine the maximum length of values in the column
def get_max_length(connection, table_name, column_name):
    """Return the length of the longest value in a column, measured as text.

    Args:
        connection: open SQLAlchemy connection used to run the query.
        table_name: table to scan.
        column_name: column whose values are cast to TEXT and measured.

    Returns:
        The first field of the single result row (the ``MAX(LENGTH(...))`` value).
    """
    longest_value_sql = (
        "\n"
        f"    SELECT MAX(LENGTH(CAST({column_name} AS TEXT))) \n"
        f"    FROM {table_name};\n"
        "    "
    )
    row = connection.execute(text(longest_value_sql)).fetchone()
    return row[0]

def remove_pound_symbol(connection, table_name, column_name):
    """Strip the pound sign from values of a column that start with it.

    Only rows whose value begins with the pound sign are touched; in those rows
    every occurrence of the sign is replaced with an empty string.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column holding the price-like text.
    """
    strip_currency_sql = (
        "\n"
        f"    UPDATE {table_name}\n"
        f"    SET {column_name} = REPLACE({column_name}, '£', '')\n"
        f"    WHERE {column_name} LIKE '£%';\n"
        "    "
    )
    statement = text(strip_currency_sql)
    connection.execute(statement)

# TODO: unfinished helper -- the bare `def add` header that was left here is a
# SyntaxError that prevents this entire cell from being parsed, so it is parked
# as a comment until the function is given a name, parameters and a body.
# def add

# CLEANING FUNCTIONS 

# Create function to clean uuid with regex #working regex:   WHERE TRIM(CAST({column_name} AS TEXT)) !~* '^[a-f0-9\\-]+$';
def clean_uuid(connection, table_name, column_name):
    """NULL out every value in a column that is not shaped like a UUID.

    A value is kept only when, cast to text and trimmed, it matches the
    8-4-4-4-12 hexadecimal layout (case-insensitive match).

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column expected to contain UUID strings.
    """
    # Kept as a plain string so the {n} quantifiers need no f-string escaping.
    uuid_regex = '^[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}$'
    null_invalid_sql = (
        "\n"
        f"    UPDATE {table_name}\n"
        f"    SET {column_name} = NULL\n"
        f"    WHERE TRIM(CAST({column_name} AS TEXT)) !~* '{uuid_regex}';\n"
        "    "
    )
    # Wrap the SQL string in a TextClause and run it
    connection.execute(text(null_invalid_sql))

# 9476f17e-5d6a-4117-874d-9cdb38ca1fa6

# Create function to clean numeric data (both integer and floats) data with regex by ensuring they are numbers 
def clean_numbers(connection, table_name, column_name):
    """NULL out every value in a column that is not a plain integer or decimal.

    Accepted shape: optional leading minus, digits, optional decimal point,
    at least one trailing digit.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column expected to contain numeric text.
    """
    numeric_regex = r'^[-]?[0-9]*\.?[0-9]+$'
    null_non_numeric_sql = (
        "\n"
        f"    UPDATE {table_name}\n"
        f"    SET {column_name} = NULL\n"
        f"    WHERE CAST({column_name} AS TEXT) !~ '{numeric_regex}';\n"
        "    "
    )
    statement = text(null_non_numeric_sql)
    connection.execute(statement)

# Create function to clean store_code and product_code data with regex (e.g. store_code 'BL-8387506C', product_code 'R7-3126933h')
def clean_store_or_product_codes(connection, table_name, column_name):
    """NULL out codes that are not ``<alphanumerics>-<alphanumerics>``.

    Matches the layout of values such as 'BL-8387506C' or 'R7-3126933h':
    exactly one hyphen separating two non-empty alphanumeric parts.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column holding store or product codes.
    """
    code_regex = '^[A-Za-z0-9]+-[A-Za-z0-9]+$'
    null_bad_codes_sql = (
        "\n"
        f"    UPDATE {table_name}\n"
        f"    SET {column_name} = NULL\n"
        f"    WHERE CAST({column_name} AS TEXT) !~ '{code_regex}';\n"
        "    "
    )
    statement = text(null_bad_codes_sql)
    connection.execute(statement)

# cleaning text data  
def clean_text_data(connection, table_name, column_name,
                    pattern='^[A-Za-z]+([ -][A-Za-z]+)*$'):
    """NULL out values of a text column that do not look like plain words.

    The previous pattern (``^[A-Za-z]+$``) rejected every value containing a
    space or hyphen, so legitimate multi-word values such as 'Super Store' or
    'High Wycombe' were wiped. The default now accepts one or more ASCII words
    separated by single spaces or hyphens. Values that are already NULL are
    left untouched (a NULL never satisfies the ``!~`` test).

    NOTE(review): non-ASCII letters are still rejected by the default pattern;
    pass a different ``pattern`` if accented names must survive.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column holding the text values.
        pattern: regular expression a value must match to be kept. It is sent
            as a bound parameter, so it needs no SQL quoting.
    """
    clean_text_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ :pattern;
    """
    connection.execute(text(clean_text_sql), {"pattern": pattern})

def clean_date_data(connection, table_name, column_name):
    """Rewrite tuple-style date values in a column as real dates.

    Rows whose text form contains ``(a, b, c, d, e)`` (five comma-separated
    integers) are updated: the first three integers are joined as ``a-b-c``
    and parsed with ``TO_DATE(..., 'YYYY-MM-DD')``. Other rows are not touched.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column holding the date values.
    """
    # Regex pieces are raw strings so the backslashes reach PostgreSQL unchanged.
    capture_regex = r"'\((\d+), (\d+), (\d+), \d+, \d+\)'"
    detect_regex = r"'\(\d+, \d+, \d+, \d+, \d+\)'"
    iso_template = r"'\1-\2-\3'"
    rewrite_dates_sql = (
        "\n"
        f"    UPDATE {table_name}\n"
        f"    SET {column_name} = \n"
        "    TO_DATE(\n"
        "        REGEXP_REPLACE(\n"
        f"            CAST({column_name} AS TEXT), \n"
        f"            {capture_regex}, \n"
        f"            {iso_template}\n"
        "        ), \n"
        "        'YYYY-MM-DD'\n"
        "    )\n"
        f"    WHERE CAST({column_name} AS TEXT) ~ {detect_regex};\n"
        "    "
    )
    statement = text(rewrite_dates_sql)
    connection.execute(statement)

# CONVERTING FUNCTIONS  

# Create function to convert a specified column to UUID 
def convert_to_uuid(connection, table_name, column_name):
    """Change a column's type to UUID, casting the existing values.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
    """
    alter_to_uuid_sql = (
        "\n"
        f"        ALTER TABLE {table_name}\n"
        f"        ALTER COLUMN {column_name} TYPE UUID\n"
        f"        USING {column_name}::UUID;\n"
        "        "
    )
    # Wrap the SQL string in a TextClause and run it
    statement = text(alter_to_uuid_sql)
    connection.execute(statement)

# Function to convert the column to VARCHAR with the determined maximum length // goes after USING CAST: ALTER COLUMN {column_name} DROP NOT NULL;
def convert_to_varchar(connection, table_name, column_name, length):
    """Change a column's type to ``VARCHAR(length)`` and drop its NOT NULL constraint.

    Both changes are issued in a single ALTER TABLE statement; existing values
    are cast to the new type.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
        length: value placed inside ``VARCHAR(...)``.
    """
    varchar_type = f"VARCHAR({length})"
    alter_to_varchar_sql = (
        "\n"
        f"    ALTER TABLE {table_name}\n"
        f"    ALTER COLUMN {column_name} TYPE {varchar_type} USING CAST({column_name} AS {varchar_type}),\n"
        f"    ALTER COLUMN {column_name} DROP NOT NULL;\n"
        "    "
    )
    statement = text(alter_to_varchar_sql)
    connection.execute(statement)

# Create function to convert a column (e.g. BIGINT) to SMALLINT
def convert_to_smallint(connection, table_name, column_name):
    """Change a column's type to SMALLINT, casting the existing values.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
    """
    alter_to_smallint_sql = (
        "\n"
        f"        ALTER TABLE {table_name}\n"
        f"        ALTER COLUMN {column_name} TYPE SMALLINT\n"
        f"        USING CAST({column_name} AS SMALLINT);\n"
        "        "
    )
    # Wrap the SQL string in a TextClause and run it
    statement = text(alter_to_smallint_sql)
    connection.execute(statement)

# Create function to convert to date 
def convert_to_date(connection, table_name, column_name):
    """Change a column's type to DATE, casting the existing values.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
    """
    alter_to_date_sql = (
        "\n"
        f"    ALTER TABLE {table_name}\n"
        f"    ALTER COLUMN {column_name} TYPE DATE\n"
        f"    USING {column_name}::DATE;\n"
        "    "
    )
    statement = text(alter_to_date_sql)
    connection.execute(statement)

# Create function to convert a column to FLOAT
def convert_to_float(connection, table_name, column_name):
    """Change a column's type to FLOAT, casting the existing values.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
    """
    alter_to_float_sql = (
        "\n"
        f"    ALTER TABLE {table_name}\n"
        f"    ALTER COLUMN {column_name} TYPE FLOAT\n"
        f"    USING {column_name}::FLOAT;\n"
        "    "
    )
    statement = text(alter_to_float_sql)
    connection.execute(statement)

# Function to run all operations
def run_all_operations():
    """Clean and re-type every configured column, then report one column's type.

    The work is described by ``pipeline`` below: for each (table, column) a
    cleaning function is run first and a conversion function second, in the
    listed order. The pound-sign cleanup on ``dim_products.product_price`` runs
    last. All of it happens inside ONE transaction, so either every change is
    committed or none is.

    Error handling: the ``try`` wraps the whole transaction block rather than
    sitting inside it. When a statement fails, the exception leaves the
    ``connection.begin()`` block (which rolls the transaction back) and is then
    reported. Catching it inside the block would leave a failed transaction
    open and let the context manager try to commit it.

    The before/after type checks on ``dim_store_details.store_type`` each run
    in their own short transaction so they work whether or not the pipeline
    succeeded.
    """
    # Marker meaning "size the VARCHAR from the longest value currently stored".
    fit = 'fit'

    # (table, column, cleaner, converter, varchar_length)
    #   varchar_length is None -> converter takes no length argument
    #   varchar_length == fit  -> length comes from get_max_length() after cleaning
    #   varchar_length is int  -> fixed VARCHAR length
    pipeline = [
        # orders_table
        ('orders_table', 'date_uuid', clean_uuid, convert_to_uuid, None),
        ('orders_table', 'user_uuid', clean_uuid, convert_to_uuid, None),
        ('orders_table', 'card_number', clean_numbers, convert_to_varchar, fit),
        ('orders_table', 'store_code', clean_store_or_product_codes, convert_to_varchar, fit),
        ('orders_table', 'product_code', clean_store_or_product_codes, convert_to_varchar, fit),
        ('orders_table', 'product_quantity', clean_numbers, convert_to_smallint, None),
        # dim_users
        ('dim_users', 'first_name', clean_text_data, convert_to_varchar, 255),
        ('dim_users', 'last_name', clean_text_data, convert_to_varchar, 255),
        ('dim_users', 'date_of_birth', clean_date_data, convert_to_date, None),
        ('dim_users', 'country_code', clean_text_data, convert_to_varchar, fit),
        ('dim_users', 'user_uuid', clean_uuid, convert_to_uuid, None),
        ('dim_users', 'join_date', clean_date_data, convert_to_date, None),
        # dim_store_details
        ('dim_store_details', 'longitude', clean_numbers, convert_to_float, None),
        ('dim_store_details', 'locality', clean_text_data, convert_to_varchar, 255),
        ('dim_store_details', 'store_code', clean_store_or_product_codes, convert_to_varchar, fit),
        ('dim_store_details', 'staff_numbers', clean_numbers, convert_to_smallint, None),
        ('dim_store_details', 'opening_date', clean_date_data, convert_to_date, None),
        ('dim_store_details', 'store_type', clean_text_data, convert_to_varchar, 255),
        ('dim_store_details', 'latitude', clean_numbers, convert_to_float, None),
        ('dim_store_details', 'country_code', clean_text_data, convert_to_varchar, fit),
        ('dim_store_details', 'continent', clean_text_data, convert_to_varchar, 255),
    ]

    # Create instance of a DatabaseConnector
    instance = DatabaseConnector()
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:

        # Check and print column type before conversion
        with connection.begin():
            column_type_before = check_column_type(connection, 'dim_store_details', 'store_type')
        print(f"Column type before conversion: {column_type_before}")

        try:
            with connection.begin():  # commits on success, rolls back on error
                for table, column, cleaner, converter, varchar_length in pipeline:
                    cleaner(connection, table, column)
                    if varchar_length is None:
                        converter(connection, table, column)
                        continue
                    if varchar_length == fit:
                        # Measured after cleaning, so rejected values do not inflate the size
                        varchar_length = get_max_length(connection, table, column)
                    converter(connection, table, column, varchar_length)

                # Removing pound from price column
                remove_pound_symbol(connection, 'dim_products', 'product_price')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Check and print column type after conversion
        with connection.begin():
            column_type_after = check_column_type(connection, 'dim_store_details', 'store_type')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')

# Execute the pipeline defined above when this cell runs
run_all_operations()


init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
Column type before conversion: ('store_type', 'character varying')
Column type after conversion: ('store_type', 'character varying')
End of call


In [6]:
#THIS IS CODE SO THAT I CAN SEE WHAT THE TABLE LOOKS LIKE 
def view_data():
    """Display the column names and first ten rows of ``dim_store_details``.

    Builds its own engine through ``DatabaseConnector`` and shows the result
    keys followed by each row via ``display``.
    """
    engine = DatabaseConnector().init_my_db_engine()

    # Column headers plus the first few rows of dim_store_details
    preview_sql = text("SELECT * FROM dim_store_details LIMIT 10;")

    # The context manager closes the connection when the block ends
    with engine.connect() as connection:
        rows = connection.execute(preview_sql)
        # Column headers first ...
        display(rows.keys())
        # ... then one output per row
        for record in rows:
            display(record)

# Show the table preview when this cell runs
view_data()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!


RMKeyView(['index', 'address', 'longitude', 'locality', 'store_code', 'staff_numbers', 'opening_date', 'store_type', 'latitude', 'country_code', 'continent'])

(2, 'Heckerstraße 4/5\n50491 Säckingen, Landshut', 48.52961, 'Landshut', 'LA-0772C7B9', 92, datetime.date(2013, 4, 12), 'Super Store', 12.16179, 'DE', 'Europe')

(3, '5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury', 51.26, 'Westbury', 'WE-1DE82CEE', 69, datetime.date(2014, 1, 2), 'Super Store', -2.1875, 'GB', 'Europe')

(4, 'Studio 6\nStephen landing\nSouth Simon\nB77 2WA, Belper', 53.0233, 'Belper', 'BE-18074576', 35, datetime.date(2019, 9, 9), 'Local', -1.48119, 'GB', 'Europe')

(5, 'Flat 92u\nChristian harbors\nPort Charlotte\nN57 8FJ, Gainsborough', 53.38333, 'Gainsborough', 'GA-CAD01AC2', 36, datetime.date(1995, 5, 15), 'Local', -0.76667, 'GB', 'Europe')

(6, '7 Gillian rue\nWest Robertside\nPH4 8NY, Rutherglen', 55.82885, 'Rutherglen', 'RU-C603E990', 92, datetime.date(2001, 1, 4), 'Super Store', -4.21376, 'GB', 'Europe')

(7, 'Lilija-Heß-Allee 660\n34566 Regensburg, Stuttgart', 48.78232, 'Stuttgart', 'ST-229D997E', 34, datetime.date(2000, 6, 1), 'Local', 9.17702, 'DE', 'Europe')

(8, '510 Jill Mill\nSouth Laura, FL 38723, Kaukauna', 44.27804, 'Kaukauna', 'KA-FA7ED3B8', 31, datetime.date(2022, 9, 5), 'Local', -88.27205, 'US', 'America')

(9, '3 Lee valleys\nWest Janetview\nDY4M 2RL, Hartley', 51.38673, 'Hartley', 'HA-974352FE', 20, datetime.date(2004, 9, 11), 'Local', 0.30367, 'GB', 'Europe')

(12, 'Flat 37\nBennett expressway\nNew Charlotte\nSY8R 5WE, Devizes', 51.35084, 'Devizes', 'DE-585399CF', 36, datetime.date(2014, 10, 11), 'Local', -1.99421, 'GB', 'Europe')

(14, 'Herma-Rädel-Gasse 29\n74557 Fulda, Halstenbek', 53.63333, 'Halstenbek', 'HA-39A446E2', 38, datetime.date(1994, 3, 8), 'Local', 9.85, 'DE', 'Europe')

In [66]:
#CHECK THAT ALL NAMES IN FIRST NAME COLUMN ARE ALPHABETIC 

def view_data(limit=5):
    """Display the ``dim_users.first_name`` values that are not purely alphabetic.

    NOTE: this reuses the name ``view_data`` from an earlier cell and replaces
    that definition once this cell has run.

    Args:
        limit: how many rows to inspect. Defaults to 5 (the original
            behaviour); pass ``None`` to check every row in the table.

    Side effects:
        Displays the result column names, then the list of "odd" names.
        NULL names are reported as odd (shown as ``None``) instead of raising
        ``TypeError`` inside the regex match.
    """
    # Create engine
    engine = DatabaseConnector().init_my_db_engine()

    # create the regex pattern
    pattern = re.compile('^[A-Za-z]+$')

    # Use a context manager to handle the connection
    with engine.connect() as connection:
        if limit is None:
            result = connection.execute(text("SELECT first_name FROM dim_users;"))
        else:
            result = connection.execute(
                text("SELECT first_name FROM dim_users LIMIT :limit;"),
                {"limit": limit},
            )
        # Print the column headers
        display(result.keys())
        # row[0] is the name; None (SQL NULL) cannot be regex-matched, so test it first
        odd_names = [
            row[0] for row in result
            if row[0] is None or not pattern.match(row[0])
        ]

        display(odd_names)
# Run the first-name check when this cell runs
view_data()



init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!


RMKeyView(['first_name'])

[]

In [39]:
# Step 1: Initial import
from data_cleaning import DataCleaning
from data_extraction import DataExtractor

# Create the extractor and cleaner objects used by the cells below
extractor_instance = DataExtractor()
cleaning_instance = DataCleaning()

# Fetch the original (uncleaned) stores DataFrame; later cells read it as `raw_df`
raw_df = extractor_instance.retrieve_stores_data() 

# Cleaning via the DataCleaning class is currently disabled; the steps are
# prototyped inline in a later cell instead
#clean_store_df = cleaning_instance.cleaning_store_details()


retrieve_stores_data is working
read_api_key is working


In [40]:
# Show the raw stores frame exactly as retrieved, before any cleaning
display(raw_df)

Unnamed: 0,index,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
0,0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
1,1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...
446,446,"Täschestraße 25\n39039 Nördlingen, Kirchlengern",52.2,,Kirchlengern,KI-78096E8C,61,2005-05-12,Super Store,8.63333,DE,Europe
447,447,K0ODETRLS3,K8CXLZDP07,UXMWDMX1LC,3VHFDNP8ET,9D4LK7X4LZ,D23PCWSM6S,36IIMAQD58,NN04B3F6UQ,JZP8MIJTPZ,B3EH2ZGQAV,1WZB1TE1HL
448,448,"Studio 8\nMoss mall\nWest Linda\nM0E 6XR, High...",51.62907,,High Wycombe,HI-EEA7AE62,33,1998-05-14,Local,-0.74934,GB,Europe
449,449,"Baumplatz 6\n80114 Kötzting, Bretten",49.03685,,Bretten,BR-662EC74C,35,2020-10-17,Local,8.70745,DE,Europe


In [59]:
import numpy as np
from database_utils import DatabaseConnector

# LOGGING 
print('Started cleaning_store_details')

# Work on a copy of the data retrieved from the stores API: assigning columns on
# `raw_df` itself would change the frame displayed by the earlier cell and make
# this cell behave differently when re-run.
df = raw_df.copy()

display(df.head(4)) 

# Clean the latitude column: values that are not numeric become NaN and those rows are dropped
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df = df.dropna(subset=['latitude'])

# Dropping the 'lat' column as it's empty 
df = df.drop(columns=['lat'])

# filtering out items in locality that aren't real place names or NULL 
pattern = r'^[a-zA-Z\s-]+$'
# na=False treats a missing locality as "no match" instead of producing a NaN
# in the boolean mask; .copy() gives an independent frame for the assignments below
df = df[df['locality'].str.match(pattern, na=False)].copy()
df['locality'] = df['locality'].replace('NULL', np.nan)
df = df.dropna(subset=['locality'])

display(df.head(4))  

# replacing incorrect spellings of continents 
continent_replacements = {
    'eeEurope': 'Europe',
    'eeAmerica': 'America'
}

df['continent'] = df['continent'].replace(continent_replacements)

# LOGGING: checking everything has worked 
#print(df['continent'].unique())
#print(df['store_type'].unique()) 
#print(df['country_code'].unique()) 
#print(df['continent'].unique())

# LOGGING: checking of datetime before is datetime64 datetype  
#is_datetime_before = pd.api.types.is_datetime64_any_dtype(df['opening_date'])

# converting opening date to datetime object; values not in YYYY-MM-DD form become NaT
df['opening_date'] = pd.to_datetime(df['opening_date'], format='%Y-%m-%d', errors='coerce')

# Diagnostic: rows whose opening_date could NOT be parsed. This has to run before
# the dropna below, otherwise those rows are already gone and the list is always empty.
# (The variable names are historical: they hold the rows with a missing date.)
non_null_rows = df[df['opening_date'].isna()]  

# Convert the filtered DataFrame to a list of rows
non_null_list = non_null_rows.values.tolist()

display(len(non_null_list))
display(non_null_list)

# dropping any rows which contain missing values  
df = df.dropna(axis=0)

# LOGGING: checking the conversion worked 
#is_datetime_after = pd.api.types.is_datetime64_any_dtype(df['opening_date'])
#print(f"Is 'dates' column datetime64 dtype? {is_datetime_before}")
#print(f"Is 'dates' column datetime64 dtype? {is_datetime_after}")
display('last', df.head(4)) 
# summary of the cleaned dataframe 
df.info()  

databaseconnector_instance = DatabaseConnector() 

# upload the cleaned frame as the dim_store_details table
databaseconnector_instance.upload_to_db(df, 'dim_store_details')

Started cleaning_store_details


Unnamed: 0,index,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
0,0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
1,1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe


Unnamed: 0,index,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
1,1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe


0

[]

'last'

Unnamed: 0,index,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
1,1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe


<class 'pandas.core.frame.DataFrame'>
Index: 428 entries, 1 to 450
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   index          428 non-null    int64         
 1   address        428 non-null    object        
 2   longitude      428 non-null    object        
 3   locality       428 non-null    object        
 4   store_code     428 non-null    object        
 5   staff_numbers  428 non-null    object        
 6   opening_date   428 non-null    datetime64[ns]
 7   store_type     428 non-null    object        
 8   latitude       428 non-null    float64       
 9   country_code   428 non-null    object        
 10  continent      428 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(8)
memory usage: 40.1+ KB
upload_to_db is working
init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
Table 'dim_store_det

#Ignore below 


In [None]:
#BACKUP OF CHATGPT UUID CODE WITH CHECKS 

# Function to run all operations
def run_all_operations():
    """Backup variant: clean and convert ``orders_table.date_uuid`` with before/after checks.

    NOTE: this reuses the name ``run_all_operations`` and replaces any earlier
    definition once the cell is run.

    Prints five sample rows and the column type, runs the UUID clean + convert
    in a single transaction, then prints the rows and type again.

    Changes from the original backup:
      * every group of statements runs inside an explicit ``connection.begin()``
        block so the conversion is actually committed (or rolled back on error);
      * the ``try`` wraps the transaction, so a failure rolls back before it is
        reported and the "after" queries run on a clean connection;
      * the conversion calls ``convert_to_uuid`` (the helper defined with the
        other converters) rather than ``convert_data_uuid``, which only exists
        once a later cell has been run.
    """
    # Create instance of a DatabaseConnector
    instance = DatabaseConnector()
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:

        with connection.begin():
            # Fetch and print data before conversion
            print("Data before conversion:")
            data_before = fetch_data(connection, 'orders_table')
            for row in data_before:
                print(row)

            # Check and print column type before conversion
            column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
            print(f"Column type before conversion: {column_type_before}")

        try:
            with connection.begin():  # commits on success, rolls back on error
                clean_uuid(connection, 'orders_table', 'date_uuid')
                print('Clean UUID worked')
                convert_to_uuid(connection, 'orders_table', 'date_uuid')
                print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        with connection.begin():
            # Fetch and print data after conversion
            print("Data after conversion:")
            data_after = fetch_data(connection, 'orders_table')
            for row in data_after:
                print(row)

            # Check and print column type after conversion
            column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
            print(f"Column type after conversion: {column_type_after}")

    print('End of call')


In [None]:
#THINK THIS CAN BE GOTTEN RID OF 
#def cast_data(): 
  
#UUID 
# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    """NULL out every value in a column that is not shaped like a UUID.

    NOTE: this reuses the name ``clean_uuid`` and replaces any earlier
    definition once the cell is run.

    Fix: the regex quantifiers are written ``{{8}}``, ``{{4}}``, ``{{12}}``.
    Inside an f-string a bare ``{8}`` is evaluated as the expression ``8``, so
    the pattern previously reached the database as ``[0-9a-fA-F]8-...`` and
    rejected every genuine UUID.

    Args:
        connection: open SQLAlchemy connection used to run the UPDATE.
        table_name: table to update.
        column_name: column expected to contain UUID strings.
    """
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{{8}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{12}}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# Create function to update specified column to UUID 
def convert_data_uuid(connection, table_name, column_name):
    """Change a column's type to UUID, casting the existing values.

    Args:
        connection: open SQLAlchemy connection used to run the ALTER TABLE.
        table_name: table that owns the column.
        column_name: column to convert.
    """
    alter_to_uuid_sql = (
        "\n"
        f"        ALTER TABLE {table_name}\n"
        f"        ALTER COLUMN {column_name} TYPE UUID\n"
        f"        USING {column_name}::UUID;\n"
        "        "
    )
    # Wrap the SQL string in a TextClause and run it
    statement = text(alter_to_uuid_sql)
    connection.execute(statement)

# Function to run all operations
def run_all_operations():
    """Old variant: only run the UUID clean-up on ``orders_table.date_uuid``.

    NOTE: this reuses the name ``run_all_operations`` and replaces any earlier
    definition once the cell is run.

    The UPDATE now runs inside an explicit ``connection.begin()`` block so it
    is committed on success and rolled back on failure; previously no
    transaction was opened or committed by this function. The ``try`` wraps the
    transaction so the rollback happens before the error is reported.
    """
    # Create instance of a DatabaseConnector
    instance = DatabaseConnector()
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:

        try:
            with connection.begin():  # commits on success, rolls back on error
                clean_uuid(connection, 'orders_table', 'date_uuid')
                print('clean uuid worked')
                #max_length = get_max_length(connection, table_name, column_name)
                #convert_numbers(connection, table_name, column_name, max_length)
                #print("Data cleaning and conversion completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

    print('end of call')
# Runs whichever run_all_operations was defined most recently (the one in this cell)
run_all_operations()



In [None]:
# OLD CODE GET RID OF 
# Function to run all operations
def run_all_operations():
    """Old variant: clean and convert ``orders_table.date_uuid`` with before/after checks.

    NOTE: this reuses the name ``run_all_operations`` and replaces any earlier
    definition once the cell is run.

    Prints five sample rows and the column type, runs the UUID clean + convert
    in a single transaction, then prints the rows and type again.

    Each group of statements runs inside an explicit ``connection.begin()``
    block so the conversion is committed on success and rolled back on error.
    The ``try`` wraps the transaction, so a failure is rolled back before it is
    reported and the "after" queries run on a clean connection.
    """
    # Create instance of a DatabaseConnector
    instance = DatabaseConnector()
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:

        with connection.begin():
            # Fetch and print data before conversion
            print("Data before conversion:")
            # fetch_data's result is iterated row by row here
            data_before = fetch_data(connection, 'orders_table')
            for row in data_before:
                print(row)

            # Check and print column type before conversion
            column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
            print(f"Column type before conversion: {column_type_before}")

        try:
            with connection.begin():  # commits on success, rolls back on error
                clean_uuid(connection, 'orders_table', 'date_uuid')
                print('Clean UUID worked')
                convert_data_uuid(connection, 'orders_table', 'date_uuid')
                print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        with connection.begin():
            # Fetch and print data after conversion
            print("Data after conversion:")
            data_after = fetch_data(connection, 'orders_table')
            for row in data_after:
                print(row)

            # Check and print column type after conversion
            column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
            print(f"Column type after conversion: {column_type_after}")

    print('End of call')

# Runs whichever run_all_operations was defined most recently (the one in this cell)
run_all_operations()

In [24]:
# OLD CODE

#THIS WILL BE THE FUNCTION THAT RUNS ALL THE OTHER SUBFUNCTIONS. BUT NEED TO DECIDE IF IT"S EASIER TO PUT IT IN CLASS ('CASTING') WITH EACH SUB FUNCTION AS A METHOD

class DataCasting:
    """Placeholder for a class that would group the casting helpers as methods.

    Nothing is implemented yet. ``__init__`` previously contained only a
    comment, which is not a valid function body, so the class could not even
    be defined.
    """

    def __init__(self):
        # TODO: decide how the database connection should be created/stored here
        #connection = XXXX 
        pass
    
def cast_data():
    """Clean ``orders_table.date_uuid`` and convert the column to the UUID type.

    Builds an engine through ``DatabaseConnector.init_my_db_engine()``, sets every
    ``date_uuid`` value that is not a well-formed UUID to NULL, then alters the
    column type to UUID. Both statements run inside one explicit transaction, so
    they are committed together or rolled back together. The outcome is reported
    with ``print``; a ``SQLAlchemyError`` raised while executing the statements is
    caught and printed. Errors raised while opening the connection are not caught.

    The nested helpers are reusable building blocks, one per kind of cast. Only the
    UUID pair is used so far; the others are kept for columns still to be converted.

    Returns:
        None
    """
    # Create instance of a DatabaseConnector
    instance = DatabaseConnector()

    # Create an engine by using the init_my_db_engine() method of DatabaseConnector
    engine = instance.init_my_db_engine()

    # THIS IS WHERE I'M TRYING TO BUILD REUSABLE FUNCTIONS FOR EACH OF THE CAST TYPES AND APPLYING THEM

    # UUID
    def clean_uuid(connection, table_name, column_name):
        """Set every value in the column that is not a well-formed UUID to NULL."""
        # Braces are doubled so the f-string emits the literal regex quantifiers
        # ({8}, {4}, {12}); single braces would be evaluated and render as "8".
        # The CAST keeps the statement valid when the column is already UUID
        # (e.g. when the cell is re-run after a successful conversion).
        clean_uuid_sql = f"""
            UPDATE {table_name}
            SET {column_name} = NULL
            WHERE CAST({column_name} AS TEXT) !~ '^[0-9a-fA-F]{{8}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{4}}-[0-9a-fA-F]{{12}}$';
            """
        connection.execute(text(clean_uuid_sql))

    def convert_data_uuid(connection, table_name, column_name):
        """Alter the column type to UUID, casting the existing values."""
        convert_uuid_sql = f"""
            ALTER TABLE {table_name}
            ALTER COLUMN {column_name} TYPE UUID
            USING {column_name}::UUID;
            """
        # Convert the SQL string to a TextClause object and execute the query
        connection.execute(text(convert_uuid_sql))

    # VARCHAR

    def clean_numbers(connection, table_name, column_name):
        """Set every value in the column that is not purely digits to NULL."""
        clean_numbers_sql = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9]+$';
        """
        connection.execute(text(clean_numbers_sql))

    def get_max_length(connection, table_name, column_name):
        """Return the longest text length found in the column (None if no values)."""
        # CAST to TEXT so LENGTH also works on non-text column types.
        max_length_sql = f"""
        SELECT MAX(LENGTH(CAST({column_name} AS TEXT)))
        FROM {table_name};
        """
        result = connection.execute(text(max_length_sql)).fetchone()
        return result[0]

    def convert_numbers(connection, table_name, column_name, max_length):
        """Alter the column type to VARCHAR(max_length).

        When ``max_length`` is None (no non-NULL values to measure) an
        unbounded VARCHAR is used instead of emitting the invalid
        ``VARCHAR(None)``.
        """
        varchar_type = "VARCHAR" if max_length is None else f"VARCHAR({max_length})"
        convert_numbers_sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE {varchar_type}
        USING {column_name}::{varchar_type};
        """
        connection.execute(text(convert_numbers_sql))

    def run_all_operations(conn_string, table_name, column_name):
        """Clean a numeric-string column and convert it to a fitted VARCHAR."""
        engine = create_engine(conn_string)
        with engine.connect() as connection:
            try:
                # One transaction: commit on success, roll back on any error.
                with connection.begin():
                    clean_numbers(connection, table_name, column_name)
                    max_length = get_max_length(connection, table_name, column_name)
                    convert_numbers(connection, table_name, column_name, max_length)
                print("Data cleaning and conversion completed successfully.")
            except SQLAlchemyError as e:
                print(f"An error occurred: {e}")

    # SMALL INT

    # THESE ARE SAMPLES FROM THE CHAT GPT CODE
    def cast_column_to_integer(connection, table_name, column_name):
        """Alter the column type to INTEGER, casting the existing values."""
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE INTEGER
        USING {column_name}::INTEGER;
        """
        connection.execute(text(sql))

    def cast_column_to_date(connection, table_name, column_name):
        """Alter the column type to DATE, casting the existing values."""
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE DATE
        USING {column_name}::DATE;
        """
        connection.execute(text(sql))

    # Apply the UUID helpers to orders_table.date_uuid.
    # Use a context manager to handle the connection
    with engine.connect() as connection:
        try:
            # Explicit transaction: the clean-up and the type change are committed
            # together, and rolled back together if either statement fails.
            with connection.begin():
                clean_uuid(connection, 'orders_table', 'date_uuid')
                convert_data_uuid(connection, 'orders_table', 'date_uuid')

            print("Data type casting completed successfully.")

        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

In [None]:
# OLD CODE 
# THIS IS THE CHATGPT CODE THAT I AM TAKING BITS FROM - IT'S GOT IDEAS FOR CREATING REUSABLE FUNCTIONS FOR EACH OF THE TYPES OF CASTING I NEED TO DO 

from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

def cast_column_to_integer(connection, table_name, column_name):
    """Change a column's type to INTEGER, casting its existing values.

    Args:
        connection: Object whose ``execute`` method runs the statement.
        table_name: Table containing the column; interpolated into the SQL as-is.
        column_name: Column to convert; interpolated into the SQL as-is.
    """
    alter_statement = text(f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE INTEGER
    USING {column_name}::INTEGER;
    """)
    connection.execute(alter_statement)

def cast_column_to_date(connection, table_name, column_name):
    """Change a column's type to DATE, casting its existing values.

    Args:
        connection: Object whose ``execute`` method runs the statement.
        table_name: Table containing the column; interpolated into the SQL as-is.
        column_name: Column to convert; interpolated into the SQL as-is.
    """
    alter_statement = text(f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE DATE
    USING {column_name}::DATE;
    """)
    connection.execute(alter_statement)

def cast_column_to_boolean(connection, table_name, column_name):
    """Change a column's type to BOOLEAN, casting its existing values.

    Args:
        connection: Object whose ``execute`` method runs the statement.
        table_name: Table containing the column; interpolated into the SQL as-is.
        column_name: Column to convert; interpolated into the SQL as-is.
    """
    alter_statement = text(f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE BOOLEAN
    USING {column_name}::BOOLEAN;
    """)
    connection.execute(alter_statement)

# Add more functions as needed for different types

def run_all_casting_operations(conn_string):
    """Cast the example ``users`` columns to their target types in one transaction.

    Runs the integer, date and boolean casts on ``users.age``,
    ``users.birthdate`` and ``users.is_active``. The casts are committed together
    on success and rolled back together if any of them raises, and the outcome is
    reported with ``print``.

    Args:
        conn_string (str): Connection URL passed to ``create_engine``.

    Returns:
        None. A ``SQLAlchemyError`` raised by a cast is caught and printed; errors
        raised while opening the connection propagate to the caller.
    """
    engine = create_engine(conn_string)
    try:
        with engine.connect() as connection:
            try:
                # Explicit transaction: without it the changes are only persisted
                # when the driver/SQLAlchemy version happens to autocommit.
                with connection.begin():
                    cast_column_to_integer(connection, 'users', 'age')
                    cast_column_to_date(connection, 'users', 'birthdate')
                    cast_column_to_boolean(connection, 'users', 'is_active')
                    # Add more casting operations as needed
                # Printed only after the transaction has committed.
                print("All casting operations completed successfully.")
            except SQLAlchemyError as e:
                print(f"An error occurred: {e}")
    finally:
        # The engine is local to this call, so release its pooled connections.
        engine.dispose()

if __name__ == '__main__':
    # Local import keeps this cell runnable on its own.
    import os

    # Take the connection URL from the DATABASE_URL environment variable when it
    # is set, so real credentials never need to be written into the notebook;
    # otherwise fall back to the original placeholder URL.
    conn_string = os.environ.get(
        'DATABASE_URL',
        'postgresql://username:password@localhost:5432/mydatabase',
    )
    run_all_casting_operations(conn_string)


In [None]:
# Run cast_data (defined in an earlier cell); it takes no arguments and reports its outcome via print.
cast_data()

In [None]:
#THIS IS OLD CODE THAT I PROBABLY WON'T USE 
def run_data_casting(conn_string):
    """
    Connects to the PostgreSQL database and performs data type casting
    on the 'age' column in the 'users' table.

    Values of ``age`` that are not purely digits are set to NULL, then the
    column is altered to INTEGER. Both statements run in one transaction:
    they are committed together on success and rolled back together on error.
    The outcome is reported with ``print``.

    Args:
    conn_string (str): The connection string for the PostgreSQL database.

    Returns:
    None. A ``SQLAlchemyError`` raised while executing the statements is caught
    and printed; errors raised while opening the connection propagate.
    """
    # Define the SQL query to clean data.
    # CAST to TEXT keeps the regex match valid when the column is already
    # INTEGER (e.g. when this is re-run after a successful conversion).
    clean_data_sql = """
            UPDATE users
            SET age = NULL
            WHERE CAST(age AS TEXT) !~ '^[0-9]+$';
            """

    # Define the SQL query to alter the column data type
    alter_column_sql = """
            ALTER TABLE users
            ALTER COLUMN age TYPE INTEGER
            USING age::INTEGER;
            """

    # Create an engine to connect to the database
    engine = create_engine(conn_string)
    try:
        # Use a context manager to handle the connection
        with engine.connect() as connection:
            try:
                # Explicit transaction: without it the changes are only persisted
                # when the driver/SQLAlchemy version happens to autocommit.
                with connection.begin():
                    # Convert each SQL string to a TextClause object and execute it
                    connection.execute(text(clean_data_sql))
                    connection.execute(text(alter_column_sql))

                # Printed only after the transaction has committed.
                print("Data type casting completed successfully.")
            except SQLAlchemyError as e:
                print(f"An error occurred: {e}")
    finally:
        # The engine is local to this call, so release its pooled connections.
        engine.dispose()

In [None]:

# #VARCHAR 

#     # Create function to clean card number data with regex
#     def clean_numbers(connection, table_name, column_name):
#         clean_numbers_sql = f"""
#         UPDATE {table_name}
#         SET {column_name} = NULL
#         WHERE {column_name} !~ '^[0-9]+$';
#         """  
#         connection.execute(text(clean_numbers_sql))

#     # Function to determine the maximum length of values in the column
#     def get_max_length(connection, table_name, column_name):
#         max_length_sql = f"""
#         SELECT MAX(LENGTH({column_name})) 
#         FROM {table_name};
#         """
#         result = connection.execute(text(max_length_sql)).fetchone()
#         return result[0]

#     # Function to convert the column to VARCHAR with the determined maximum length
#     def convert_numbers(connection, table_name, column_name, max_length): 
#         convert_numbers_sql = f"""
#         ALTER TABLE {table_name}
#         ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
#         USING {column_name}::VARCHAR({max_length});
#         """
#         connection.execute(text(convert_numbers_sql))

  
