In [1]:
import pandas as pd
from database_utils import DatabaseConnector
from sqlalchemy import create_engine, text, insert 
from sqlalchemy.inspection import inspect
from sqlalchemy.exc import SQLAlchemyError

In [None]:
#START HERE FROM 11th JULY 
#need to work through the example cell below to continue to build up the functons for each of the different data types,
# maybe test as you go rather than wait until you've done them all 

In [17]:
# HELPER FUNCTIONS 

# Function to fetch data from the table
def fetch_data(connection, table_name, limit=5):
    fetch_data_query = f"SELECT * FROM {table_name} LIMIT {limit};"
    result = connection.execute(text(fetch_data_query))
    return result.fetchall()

# Function to check the column data type
def check_column_type(connection, table_name, column_name):
    check_column_type_query = f"""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = '{table_name}' AND column_name = '{column_name}';
    """
    result = connection.execute(text(check_column_type_query))
    return result.fetchone()

# UUID 

# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# Create function to update specified column to UUID 
def convert_data_uuid(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE UUID
        USING {column_name}::UUID;
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))

# SECTION TO CLEAN CONVERT THESE TYPES TO VARCHAR: card_number, store_code and product_code  

# CLEANING FUNCTIONS FOR: card_number, store_code and product_code  

# Create function to clean card number or product_quantity data with regex by ensuring they are numbers 
def clean_numbers(connection, table_name, column_name):
    clean_numbers_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ '^[0-9]+$';
    """
    connection.execute(text(clean_numbers_sql))

# Create function to clean store_code and product_code data with regex (e.g. store_code 'BL-8387506C', product_code 'R7-3126933h')
def clean_store_or_product_codes(connection, table_name, column_name):
    clean_numbers_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ '^[A-Za-z0-9]+-[A-Za-z0-9]+$';
    """
    connection.execute(text(clean_numbers_sql))

#CONVERTING FUNCTIONS FOR: card_number, store_code and product_code. (getting the max length of a value and converting it to VARCHAR(max_length)

# Function to determine the maximum length of values in the column
def get_max_length(connection, table_name, column_name):
    max_length_sql = f"""
    SELECT MAX(LENGTH(CAST({column_name} AS TEXT))) 
    FROM {table_name};
    """
    result = connection.execute(text(max_length_sql)).fetchone()
    return result[0]

# Function to convert the column to VARCHAR with the determined maximum length
def convert_to_varchar(connection, table_name, column_name, max_length): 
    convert_numbers_sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
    USING CAST({column_name} AS VARCHAR({max_length}));
    """
    connection.execute(text(convert_numbers_sql))

# PRODUCT QUANTITY 

# Create function to convert bigint to smalint 
def convert_bigint_to_smallint(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE SMALLINT
        USING CAST({column_name} AS SMALLINT);
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))


# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'orders_table', 'product_quantity')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            # Cleaning then converting the date_uuid
            clean_uuid(connection, 'orders_table', 'date_uuid')
            #print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            #print('Convert to UUID worked')
            
            # Cleaning then converting the user_uuid
            clean_uuid(connection, 'orders_table', 'user_uuid')
            #print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'user_uuid')
            #print('Convert to UUID worked')
            
            # Cleaning then converting the card_number
            clean_numbers(connection, 'orders_table', 'card_number')
            #print('Clean card numbers worked')
            max_length = get_max_length(connection, 'orders_table', 'card_number')
            convert_to_varchar(connection, 'orders_table', 'card_number', max_length)
            #print('Convert card numbers worked')

            # Cleaning then converting store_code
            clean_store_or_product_codes(connection, 'orders_table', 'store_code')
            max_length = get_max_length(connection, 'orders_table', 'store_code')
            convert_to_varchar(connection, 'orders_table', 'store_code', max_length)
            print('Convert store_code worked')

            # Cleaning then converting product_code
            clean_store_or_product_codes(connection, 'orders_table', 'product_code')
            max_length = get_max_length(connection, 'orders_table', 'product_code')
            convert_to_varchar(connection, 'orders_table', 'product_code', max_length)
            print('Convert product_code worked')

            # Cleaning then converting the product_quantity
            clean_numbers(connection, 'orders_table', 'product_quantity')
            print('Clean product quantity has worked')
            convert_bigint_to_smallint(connection, 'orders_table', 'product_quantity')
            print('Convert product quantity has worked')

        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'orders_table', 'product_quantity')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')

run_all_operations()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
Column type before conversion: ('product_quantity', 'bigint')
Convert store_code worked
Convert product_code worked
Clean product quantity has worked
Convert product quantity has worked
Column type after conversion: ('product_quantity', 'smallint')
End of call


#Ignore below 


In [None]:
#BACKUP OF CHATGPT UUID CODE WITH CHECKS 

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Fetch and print data before conversion
        print("Data before conversion:")
        data_before = fetch_data(connection, 'orders_table')
        for row in data_before:
            print(row)
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Fetch and print data after conversion
        print("Data after conversion:")
        data_after = fetch_data(connection, 'orders_table')
        for row in data_after:
            print(row)
        
        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')


In [4]:
#def cast_data(): 
  
#UUID 
# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# Create function to update specified column to UUID 
def convert_data_uuid(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE UUID
        USING {column_name}::UUID;
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('clean uuid worked')
            #max_length = get_max_length(connection, table_name, column_name)
            #convert_numbers(connection, table_name, column_name, max_length)
            #print("Data cleaning and conversion completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

    print('end of call')
run_all_operations() 



init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
clean uuid worked
end of call


In [None]:

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Fetch and print data before conversion
        print("Data before conversion:")
        data_before = fetch_data(connection, 'orders_table')
        #data_before returns a list of tuples, each tuple is a row in the database, the list is all the rows 
        for row in data_before:
            print(row)
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Fetch and print data after conversion
        print("Data after conversion:")
        data_after = fetch_data(connection, 'orders_table')
        for row in data_after:
            print(row)
        
        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')

run_all_operations()

In [24]:
# THIS IS THE CODE THAT I EVENTUALLY WANT TO USE 

#THIS WILL BE THE FUNCTION THAT RUNS ALL THE OTHER SUBFUNCTIONS. BUT NEED TO DECIDE IF IT"S EASIER TO PUT IT IN CLASS ('CASTING') WITH EACH SUB FUNCTION AS A METHOD

class DataCasting:

    def __init__(self):
        #connection = XXXX 
    
def cast_data(): 
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 

    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:    
#THIS IS WHERE I'M TRYING TO BUILD REUSABLE FUNCTIONS FOR EACH OF THE CAST TYPES AND APPLYING THEM 


#UUID 
    # Create function to clean uuid with regex 
    def clean_uuid(connection, table_name, column_name):
        clean_uuid = f"""
            UPDATE {table_name}
            SET {column_name} = NULL
            WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
            """  
        connection.execute(text(clean_uuid))

    def convert_data_uuid(connection, table_name, column_name): 
        convert_date_uuid = f"""
            ALTER TABLE {table_name}
            ALTER COLUMN {column_name} TYPE UUID
            USING {column_name}::UUID;
            """
        # Convert the SQL string to a TextClause object and execute the query
        connection.execute(text(convert_date_uuid))

#VARCHAR 

    # Create function to clean card number data with regex
    def clean_numbers(connection, table_name, column_name):
        clean_numbers_sql = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9]+$';
        """  
        connection.execute(text(clean_numbers_sql))

    # Function to determine the maximum length of values in the column
    def get_max_length(connection, table_name, column_name):
        max_length_sql = f"""
        SELECT MAX(LENGTH({column_name})) 
        FROM {table_name};
        """
        result = connection.execute(text(max_length_sql)).fetchone()
        return result[0]

    # Function to convert the column to VARCHAR with the determined maximum length
    def convert_numbers(connection, table_name, column_name, max_length): 
        convert_numbers_sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
        USING {column_name}::VARCHAR({max_length});
        """
        connection.execute(text(convert_numbers_sql))

    # Function to run all operations
    def run_all_operations(conn_string, table_name, column_name):
        engine = create_engine(conn_string)
        with engine.connect() as connection:
            try:
                clean_numbers(connection, table_name, column_name)
                max_length = get_max_length(connection, table_name, column_name)
                convert_numbers(connection, table_name, column_name, max_length)
                print("Data cleaning and conversion completed successfully.")
            except SQLAlchemyError as e:
                print(f"An error occurred: {e}")


#SMALL INT 



#THESE ARE SAMPLES FROM THE CHAT GPT CODE 
    def cast_column_to_integer(connection, table_name, column_name):
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE INTEGER
        USING {column_name}::INTEGER;
        """
        connection.execute(text(sql))

    def cast_column_to_date(connection, table_name, column_name):
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE DATE
        USING {column_name}::DATE;
        """
        connection.execute(text(sql))

#THIS IS MY ORIGINAL CODE THAT WORKED, BUT I REALISED IT WILL GET VERY LONG IF I DO ALL THE CASTING AND APPLYING FOR EVERY COLUMN 

    # Use a context manager to handle the connection
    with engine.connect() as connection:  
        try:
            # Define the SQL query to clean data
            clean_date_uuid = """
            UPDATE orders_table
            SET date_uuid = NULL
            WHERE date_uuid !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
            """  
            
            # Convert the SQL string to a TextClause object
            clean_date_query = text(clean_date_uuid)
            
            # Execute the query
            connection.execute(clean_date_query)

            # Define the SQL query to alter the column data type
            convert_date_uuid = """
            ALTER TABLE orders_table
            ALTER COLUMN date_uuid TYPE UUID
            USING date_uuid::UUID;
            """
            # Convert the SQL string to a TextClause object
            convert_date_query = text(convert_date_uuid)
            
            # Execute the query
            connection.execute(convert_date_query)

            print("Data type casting completed successfully.")
       
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

In [None]:
# THIS IS THE CHATGPT CODE THAT I AM TAKING BITS FROM - IT'S GOT IDEAS FOR CREATING REUSABLE FUNCTIONS FOR EACH OF THE TYPES OF CASTING I NEED TO DO 

from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

def cast_column_to_integer(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE INTEGER
    USING {column_name}::INTEGER;
    """
    connection.execute(text(sql))

def cast_column_to_date(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE DATE
    USING {column_name}::DATE;
    """
    connection.execute(text(sql))

def cast_column_to_boolean(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE BOOLEAN
    USING {column_name}::BOOLEAN;
    """
    connection.execute(text(sql))

# Add more functions as needed for different types

def run_all_casting_operations(conn_string):
    engine = create_engine(conn_string)
    with engine.connect() as connection:
        try:
            cast_column_to_integer(connection, 'users', 'age')
            cast_column_to_date(connection, 'users', 'birthdate')
            cast_column_to_boolean(connection, 'users', 'is_active')
            # Add more casting operations as needed
            print("All casting operations completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

if __name__ == '__main__':
    conn_string = 'postgresql://username:password@localhost:5432/mydatabase'
    run_all_casting_operations(conn_string)


In [25]:
cast_data()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
Data type casting completed successfully.


In [28]:
#THIS IS CODE SO THAT I CAN SEE WHAT THE TABLE LOOKS LIKE 
def view_data(): 
    # Create engine 
    instance = DatabaseConnector() 

    engine = instance.init_my_db_engine()


    # Define the query to display column headers and first few rows
    query = "SELECT * FROM orders_table LIMIT 5;"

    # Use a context manager to handle the connection
    with engine.connect() as connection:  
        result = connection.execute(text(query))
        # Print the column headers
        display(result.keys())
        # Print each row
        for row in result:
            display(row)

In [29]:
view_data()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!


RMKeyView(['level_0', 'index', 'date_uuid', 'user_uuid', 'card_number', 'store_code', 'product_code', 'product_quantity'])

(0, 0, '9476f17e-5d6a-4117-874d-9cdb38ca1fa6', '93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8', 30060773296197, 'BL-8387506C', 'R7-3126933h', 3)

(1, 1, '0423a395-a04d-4e4a-bd0f-d237cbd5a295', '8fe96c3a-d62d-4eb5-b313-cf12d9126a49', 349624180933183, 'WEB-1388012W', 'C2-7287916l', 2)

(2, 2, '65187294-bb16-4519-adc0-787bbe423970', 'fc461df4-b919-48b2-909e-55c95a03fe6b', 3529023891650490, 'CH-01D85C8D', 'S7-1175877v', 2)

(3, 3, '579e21f7-13cb-436b-83ad-33687a4eb337', '6104719f-ef14-4b09-bf04-fb0c4620acb0', 213142929492281, 'CL-C183BE4B', 'D8-8421505n', 2)

(4, 4, '00ab86c3-2039-4674-b9c1-adbcbbf525bd', '9523a6d3-b2dd-4670-a51a-36aebc89f579', 502067329974, 'SO-B5B9CB3B', 'B6-2596063a', 2)

In [None]:
#THIS IS OLD CODE THAT I PROBABLY WON'T USE 
def run_data_casting(conn_string):
    """
    Connects to the PostgreSQL database and performs data type casting
    on the 'age' column in the 'users' table.
    
    Args:
    conn_string (str): The connection string for the PostgreSQL database.
    """
    # Create an engine to connect to the database
    engine = create_engine(conn_string)
    
    # Use a context manager to handle the connection
    with engine.connect() as connection:
        try:
            # Define the SQL query to clean data
            clean_data_sql = """
            UPDATE users
            SET age = NULL
            WHERE age !~ '^[0-9]+$';
            """
            # Convert the SQL string to a TextClause object
            clean_data_query = text(clean_data_sql)
            
            # Execute the query
            connection.execute(clean_data_query)

            # Define the SQL query to alter the column data type
            alter_column_sql = """
            ALTER TABLE users
            ALTER COLUMN age TYPE INTEGER
            USING age::INTEGER;
            """
            # Convert the SQL string to a TextClause object
            alter_column_query = text(alter_column_sql)
            
            # Execute the query
            connection.execute(alter_column_query)

            print("Data type casting completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

In [None]:

# #VARCHAR 

#     # Create function to clean card number data with regex
#     def clean_numbers(connection, table_name, column_name):
#         clean_numbers_sql = f"""
#         UPDATE {table_name}
#         SET {column_name} = NULL
#         WHERE {column_name} !~ '^[0-9]+$';
#         """  
#         connection.execute(text(clean_numbers_sql))

#     # Function to determine the maximum length of values in the column
#     def get_max_length(connection, table_name, column_name):
#         max_length_sql = f"""
#         SELECT MAX(LENGTH({column_name})) 
#         FROM {table_name};
#         """
#         result = connection.execute(text(max_length_sql)).fetchone()
#         return result[0]

#     # Function to convert the column to VARCHAR with the determined maximum length
#     def convert_numbers(connection, table_name, column_name, max_length): 
#         convert_numbers_sql = f"""
#         ALTER TABLE {table_name}
#         ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
#         USING {column_name}::VARCHAR({max_length});
#         """
#         connection.execute(text(convert_numbers_sql))

  
