In [10]:
import pandas as pd
from database_utils import DatabaseConnector
import re 
from sqlalchemy import create_engine, text, insert 
from sqlalchemy.inspection import inspect
from sqlalchemy.exc import SQLAlchemyError

In [None]:
#START HERE FROM 17th JULY 
#move onto to task 2. You might be able to reuse some of the functions for converting the data 
# Might have to do some changing to VARCHAR to have it as 255, e.g. make that the max_length? 
# Will have to write a new one for date

In [21]:
# CURRENT CODE  

# Create function to fetch data from the table in my local SQL server so that I can get the data to perform checks on it 
def fetch_data(connection, table_name, limit=5):
    fetch_data_query = f"SELECT * FROM {table_name} LIMIT {limit};"
    result = connection.execute(text(fetch_data_query))
    return result.fetchall()

# Function to check the column data type to check if the column type has been correctly converted 
def check_column_type(connection, table_name, column_name):
    check_column_type_query = f"""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = '{table_name}' AND column_name = '{column_name}';
    """
    result = connection.execute(text(check_column_type_query))
    return result.fetchone()

# UUID 

# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# Create function to update specified column to UUID 
def convert_data_uuid(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE UUID
        USING {column_name}::UUID;
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))

# SECTION TO CLEAN AND CONVERT THE FOLLOWING TYPES TO VARCHAR: card_number, store_code and product_code  

# CLEANING FUNCTIONS FOR: card_number, store_code and product_code  

# Create function to clean numeric data (card number or product_quantity) data with regex by ensuring they are numbers 
def clean_numbers(connection, table_name, column_name):
    clean_numbers_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ '^[0-9]+$';
    """
    connection.execute(text(clean_numbers_sql))

# Create function to clean store_code and product_code data with regex (e.g. store_code 'BL-8387506C', product_code 'R7-3126933h')
def clean_store_or_product_codes(connection, table_name, column_name):
    clean_numbers_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ '^[A-Za-z0-9]+-[A-Za-z0-9]+$';
    """
    connection.execute(text(clean_numbers_sql))

#CONVERTING FUNCTIONS FOR: card_number, store_code and product_code. (getting the max length of a value and converting it to VARCHAR(max_length)

# Function to determine the maximum length of values in the column
def get_max_length(connection, table_name, column_name):
    max_length_sql = f"""
    SELECT MAX(LENGTH(CAST({column_name} AS TEXT))) 
    FROM {table_name};
    """
    result = connection.execute(text(max_length_sql)).fetchone()
    return result[0]

# Function to convert the column to VARCHAR with the determined maximum length
def convert_to_varchar(connection, table_name, column_name, length): 
    convert_numbers_sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE VARCHAR({length})
    USING CAST({column_name} AS VARCHAR({length}));
    """
    connection.execute(text(convert_numbers_sql))

# PRODUCT QUANTITY 

# Create function to convert bigint to smalint 
def convert_bigint_to_smallint(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE SMALLINT
        USING CAST({column_name} AS SMALLINT);
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))

# TEXT DATA 

# cleaning text data 
def clean_text_data(connection, table_name, column_name):
    clean_numbers_sql = f"""
    UPDATE {table_name}
    SET {column_name} = NULL
    WHERE CAST({column_name} AS TEXT) !~ '^[0-9]+$';
    """
    connection.execute(text(clean_numbers_sql))

# Cleaning date of birth 

# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# converting date of birth 

def clean_date_data(connection, table_name, column_name):
    clean_date_sql = f"""
    UPDATE {table_name}
    SET {column_name} = 
    TO_DATE(
        REGEXP_REPLACE(
            {column_name}, 
            '\\((\\d+), (\\d+), (\\d+), \\d+, \\d+\\)', 
            '\\1-\\2-\\3'
        ), 
        'YYYY-MM-DD'
    )
    WHERE {column_name} ~ '\\(\\d+, \\d+, \\d+, \\d+, \\d+\\)';
    """
    connection.execute(text(clean_date_sql))

def convert_to_date(connection, table_name, column_name): 
    convert_date_sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE DATE
    USING {column_name}::DATE;
    """
    connection.execute(text(convert_date_sql))

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'dim_users', 'last_name')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            # Cleaning then converting the date_uuid
            clean_uuid(connection, 'orders_table', 'date_uuid')
            #print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            #print('Convert to UUID worked')
            
            # Cleaning then converting the user_uuid
            clean_uuid(connection, 'orders_table', 'user_uuid')
            #print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'user_uuid')
            #print('Convert to UUID worked')
            
            # Cleaning then converting the card_number
            clean_numbers(connection, 'orders_table', 'card_number')
            #print('Clean card numbers worked')
            max_length = get_max_length(connection, 'orders_table', 'card_number')
            convert_to_varchar(connection, 'orders_table', 'card_number', max_length)
            #print('Convert card numbers worked')

            # Cleaning then converting store_code
            clean_store_or_product_codes(connection, 'orders_table', 'store_code')
            max_length = get_max_length(connection, 'orders_table', 'store_code')
            convert_to_varchar(connection, 'orders_table', 'store_code', max_length)
            print('Convert store_code worked')

            # Cleaning then converting product_code
            clean_store_or_product_codes(connection, 'orders_table', 'product_code')
            max_length = get_max_length(connection, 'orders_table', 'product_code')
            convert_to_varchar(connection, 'orders_table', 'product_code', max_length)
            print('Convert product_code worked')

            # Cleaning then converting the product_quantity
            clean_numbers(connection, 'orders_table', 'product_quantity')
            print('Clean product quantity has worked')
            convert_bigint_to_smallint(connection, 'orders_table', 'product_quantity')
            print('Convert product quantity has worked')

            # Cleaning then converting first_name
            clean_text_data(connection, 'dim_users', 'first_name')
            print('Clean first_name has worked')
            convert_to_varchar(connection, 'dim_users', 'first_name', 255)
            print('Convert first_name worked')

            # Cleaning then converting first_name
            clean_text_data(connection, 'dim_users', 'last_name')
            print('Clean last_name has worked')
            convert_to_varchar(connection, 'dim_users', 'last_name', 255)
            print('Convert last_name worked')

            clean_date_data(connection, 'dim_users', 'date_of_birth')
            print('Date data cleaning worked')
            convert_to_date(connection, 'dim_users', 'date_of_birth')
            print('Conversion to DATE worked')

        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'dim_users', 'last_name')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')

run_all_operations()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!
Column type before conversion: ('last_name', 'text')
Convert store_code worked
Convert product_code worked
Clean product quantity has worked
Convert product quantity has worked
Clean first_name has worked
Convert first_name worked
Clean last_name has worked
Convert last_name worked
An error occurred: (psycopg2.errors.UndefinedFunction) operator does not exist: timestamp without time zone ~ unknown
LINE 12:     WHERE date_of_birth ~ '\(\d+, \d+, \d+, \d+, \d+\)';
                                 ^
HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

[SQL: 
    UPDATE dim_users
    SET date_of_birth = 
    TO_DATE(
        REGEXP_REPLACE(
            date_of_birth, 
            '\((\d+), (\d+), (\d+), \d+, \d+\)', 
            '\1-\2-\3'
        ), 
        'YYYY-MM-DD'
    )
    WHERE date_of_birth ~ '\(\d+, \d+, \d+, \d+, \d

InternalError: (psycopg2.errors.InFailedSqlTransaction) current transaction is aborted, commands ignored until end of transaction block

[SQL: 
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'dim_users' AND column_name = 'last_name';
    ]
(Background on this error at: https://sqlalche.me/e/20/2j85)

In [18]:
#CHECK THAT ALL NAMES IN FIRST NAME COLUMN ARE ALPHABETIC 

def view_data(): 
    # Create engine 
    instance = DatabaseConnector() 

    engine = instance.init_my_db_engine()


    # Define the query to display column headers and first few rows
    query = "SELECT first_name FROM dim_users LIMIT 5;"

    # Use a context manager to handle the connection
    with engine.connect() as connection:  
        result = connection.execute(text(query))
        # Print the column headers
        display(result.keys())
        # Initialise the list to store the odd names 
        odd_names = []
        # create the regex pattern
        pattern = re.compile('^[A-Za-z]+$')
        # iterate through each row in the result 
        for row in result:
            # Access the first element of the tuple to get the name
            name = row[0]
            if not pattern.match(name):
                odd_names.append(name)
                 
        #display(row)
        display(odd_names)
view_data()



init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!


RMKeyView(['date_of_birth'])

TypeError: expected string or bytes-like object, got 'datetime.datetime'

In [20]:
#THIS IS CODE SO THAT I CAN SEE WHAT THE TABLE LOOKS LIKE 
def view_data(): 
    # Create engine 
    instance = DatabaseConnector() 

    engine = instance.init_my_db_engine()


    # Define the query to display column headers and first few rows
    query = "SELECT * FROM dim_users LIMIT 10;"

    # Use a context manager to handle the connection
    with engine.connect() as connection:  
        result = connection.execute(text(query))
        # Print the column headers
        display(result.keys())
        # Print each row
        for row in result:
            display(row)

view_data()

init_my_db_engine is working
read_my_db_creds is working
Connection to the PostgreSQL database was successful!


RMKeyView(['index', 'first_name', 'last_name', 'date_of_birth', 'company', 'email_address', 'address', 'country', 'country_code', 'phone_number', 'join_date', 'user_uuid'])

(0, 'sigfried', 'noack', datetime.datetime(1990, 9, 30, 0, 0), 'Heydrich Junitz KG', 'rudi79@winkler.de', 'Zimmerstr. 1/0\n59015 Gießen', 'Germany', 'DE', '+49(0) 047905356', datetime.datetime(2018, 10, 10, 0, 0), '93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8')

(1, 'guy', 'allen', datetime.datetime(1940, 12, 1, 0, 0), 'Fox Ltd', 'rhodesclifford@henderson.com', 'Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH', 'United Kingdom', 'GB', '(0161) 496 0674', datetime.datetime(2001, 12, 20, 0, 0), '8fe96c3a-d62d-4eb5-b313-cf12d9126a49')

(2, 'harry', 'lawrence', datetime.datetime(1995, 8, 2, 0, 0), 'Johnson, Jones and Harris', 'glen98@bryant-marshall.co.uk', '92 Ann drive\nJoanborough\nSK0 6LR', 'United Kingdom', 'GB', '+44(0)121 4960340', datetime.datetime(2016, 12, 16, 0, 0), 'fc461df4-b919-48b2-909e-55c95a03fe6b')

(3, 'darren', 'hussain', datetime.datetime(1972, 9, 23, 0, 0), 'Wheeler LLC', 'daniellebryan@thompson.org', '19 Robinson meadow\nNew Tracy\nW22 2QG', 'United Kingdom', 'GB', '(0306) 999 0871', datetime.datetime(2004, 2, 23, 0, 0), '6104719f-ef14-4b09-bf04-fb0c4620acb0')

(4, 'garry', 'stone', datetime.datetime(1952, 12, 20, 0, 0), 'Warner Inc', 'billy14@long-warren.com', '3 White pass\nHunterborough\nNN96 4UE', 'United Kingdom', 'GB', '0121 496 0225', datetime.datetime(2006, 9, 1, 0, 0), '9523a6d3-b2dd-4670-a51a-36aebc89f579')

(5, 'david', 'torres', datetime.datetime(1949, 8, 12, 0, 0), 'Yang-Stewart', 'mwilliams@nichols.org', '49226 Edwards Mountains\nNorth Sarah, DE 69608', 'United States', 'US', '277-664-6389x8405', datetime.datetime(2002, 1, 21, 0, 0), '53d21f46-1fa4-452f-a023-26aee2aae4d6')

(6, 'anne', 'morris', datetime.datetime(1952, 11, 10, 0, 0), 'Hutchinson Inc', 'nhudson@taylor-horton.com', '33 Shaun locks\nMorganland\nG8 9YP', 'United Kingdom', 'GB', '028 9018749', datetime.datetime(2004, 6, 23, 0, 0), 'e2066a2c-8cd3-46ad-b2ea-e2445d5d9335')

(7, 'louis', 'roberts', datetime.datetime(2006, 8, 5, 0, 0), 'Hamilton, Walters and Clayton', 'joanne04@jennings-watson.com', 'Flat 68\nHamilton meadows\nNeilbury\nS88 8AP', 'United Kingdom', 'GB', '+44(0)1414960221', datetime.datetime(2008, 4, 19, 0, 0), 'bd690c60-c952-40c0-82df-0f8b6797b562')

(8, 'kathleen', 'barlow', datetime.datetime(1959, 11, 13, 0, 0), 'Ferguson Ltd', 'garymorton@clarke.com', '116 Smith junctions\nRichardsonmouth\nCO67 2LJ', 'United Kingdom', 'GB', '028 9018 0338', datetime.datetime(1993, 7, 10, 0, 0), '02de2416-4baf-42ad-bae6-d716eca0fc3f')

(9, 'emily', 'jones', datetime.datetime(1992, 10, 9, 0, 0), 'Serrano-Leblanc', 'toni24@carlson.com', '012 Andrea Circle Suite 129\nJosephmouth, AS 22237', 'United States', 'US', '6554215915', datetime.datetime(1998, 5, 15, 0, 0), 'caffe463-4918-4f45-a37d-856dc0f15884')

#Ignore below 


In [None]:
#BACKUP OF CHATGPT UUID CODE WITH CHECKS 

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Fetch and print data before conversion
        print("Data before conversion:")
        data_before = fetch_data(connection, 'orders_table')
        for row in data_before:
            print(row)
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Fetch and print data after conversion
        print("Data after conversion:")
        data_after = fetch_data(connection, 'orders_table')
        for row in data_after:
            print(row)
        
        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')


In [None]:
#THINK THIS CAN BE GOTTEN RID OF 
#def cast_data(): 
  
#UUID 
# Create function to clean uuid with regex 
def clean_uuid(connection, table_name, column_name):
    clean_uuid = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
        """  
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(clean_uuid))

# Create function to update specified column to UUID 
def convert_data_uuid(connection, table_name, column_name): 
    convert_date_uuid = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE UUID
        USING {column_name}::UUID;
        """
    # Convert the SQL string to a TextClause object and execute the query
    connection.execute(text(convert_date_uuid))

# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('clean uuid worked')
            #max_length = get_max_length(connection, table_name, column_name)
            #convert_numbers(connection, table_name, column_name, max_length)
            #print("Data cleaning and conversion completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

    print('end of call')
run_all_operations() 



In [None]:
# OLD CODE GET RID OF 
# Function to run all operations
def run_all_operations():
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 
    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()
    
    #try to do engine.connect() 
    with engine.connect() as connection:
        
        # Fetch and print data before conversion
        print("Data before conversion:")
        data_before = fetch_data(connection, 'orders_table')
        #data_before returns a list of tuples, each tuple is a row in the database, the list is all the rows 
        for row in data_before:
            print(row)
        
        # Check and print column type before conversion
        column_type_before = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type before conversion: {column_type_before}")
        
        #put the attempt to run the functions in a try block 
        try:
            clean_uuid(connection, 'orders_table', 'date_uuid')
            print('Clean UUID worked')
            convert_data_uuid(connection, 'orders_table', 'date_uuid')
            print('Convert to UUID worked')
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

        # Fetch and print data after conversion
        print("Data after conversion:")
        data_after = fetch_data(connection, 'orders_table')
        for row in data_after:
            print(row)
        
        # Check and print column type after conversion
        column_type_after = check_column_type(connection, 'orders_table', 'date_uuid')
        print(f"Column type after conversion: {column_type_after}")

    print('End of call')

run_all_operations()

In [24]:
# OLD CODE

#THIS WILL BE THE FUNCTION THAT RUNS ALL THE OTHER SUBFUNCTIONS. BUT NEED TO DECIDE IF IT"S EASIER TO PUT IT IN CLASS ('CASTING') WITH EACH SUB FUNCTION AS A METHOD

class DataCasting:

    def __init__(self):
        #connection = XXXX 
    
def cast_data(): 
    
    # Create instance of a DatabaseConnector  
    instance = DatabaseConnector() 

    # Create an engine by using the init_my_db_engine() method of DatabaseConnector 
    engine = instance.init_my_db_engine()

    with engine.connect() as connection:    
#THIS IS WHERE I'M TRYING TO BUILD REUSABLE FUNCTIONS FOR EACH OF THE CAST TYPES AND APPLYING THEM 


#UUID 
    # Create function to clean uuid with regex 
    def clean_uuid(connection, table_name, column_name):
        clean_uuid = f"""
            UPDATE {table_name}
            SET {column_name} = NULL
            WHERE {column_name} !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
            """  
        connection.execute(text(clean_uuid))

    def convert_data_uuid(connection, table_name, column_name): 
        convert_date_uuid = f"""
            ALTER TABLE {table_name}
            ALTER COLUMN {column_name} TYPE UUID
            USING {column_name}::UUID;
            """
        # Convert the SQL string to a TextClause object and execute the query
        connection.execute(text(convert_date_uuid))

#VARCHAR 

    # Create function to clean card number data with regex
    def clean_numbers(connection, table_name, column_name):
        clean_numbers_sql = f"""
        UPDATE {table_name}
        SET {column_name} = NULL
        WHERE {column_name} !~ '^[0-9]+$';
        """  
        connection.execute(text(clean_numbers_sql))

    # Function to determine the maximum length of values in the column
    def get_max_length(connection, table_name, column_name):
        max_length_sql = f"""
        SELECT MAX(LENGTH({column_name})) 
        FROM {table_name};
        """
        result = connection.execute(text(max_length_sql)).fetchone()
        return result[0]

    # Function to convert the column to VARCHAR with the determined maximum length
    def convert_numbers(connection, table_name, column_name, max_length): 
        convert_numbers_sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
        USING {column_name}::VARCHAR({max_length});
        """
        connection.execute(text(convert_numbers_sql))

    # Function to run all operations
    def run_all_operations(conn_string, table_name, column_name):
        engine = create_engine(conn_string)
        with engine.connect() as connection:
            try:
                clean_numbers(connection, table_name, column_name)
                max_length = get_max_length(connection, table_name, column_name)
                convert_numbers(connection, table_name, column_name, max_length)
                print("Data cleaning and conversion completed successfully.")
            except SQLAlchemyError as e:
                print(f"An error occurred: {e}")


#SMALL INT 



#THESE ARE SAMPLES FROM THE CHAT GPT CODE 
    def cast_column_to_integer(connection, table_name, column_name):
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE INTEGER
        USING {column_name}::INTEGER;
        """
        connection.execute(text(sql))

    def cast_column_to_date(connection, table_name, column_name):
        sql = f"""
        ALTER TABLE {table_name}
        ALTER COLUMN {column_name} TYPE DATE
        USING {column_name}::DATE;
        """
        connection.execute(text(sql))

#THIS IS MY ORIGINAL CODE THAT WORKED, BUT I REALISED IT WILL GET VERY LONG IF I DO ALL THE CASTING AND APPLYING FOR EVERY COLUMN 

    # Use a context manager to handle the connection
    with engine.connect() as connection:  
        try:
            # Define the SQL query to clean data
            clean_date_uuid = """
            UPDATE orders_table
            SET date_uuid = NULL
            WHERE date_uuid !~ '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$';
            """  
            
            # Convert the SQL string to a TextClause object
            clean_date_query = text(clean_date_uuid)
            
            # Execute the query
            connection.execute(clean_date_query)

            # Define the SQL query to alter the column data type
            convert_date_uuid = """
            ALTER TABLE orders_table
            ALTER COLUMN date_uuid TYPE UUID
            USING date_uuid::UUID;
            """
            # Convert the SQL string to a TextClause object
            convert_date_query = text(convert_date_uuid)
            
            # Execute the query
            connection.execute(convert_date_query)

            print("Data type casting completed successfully.")
       
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

In [None]:
# OLD CODE 
# THIS IS THE CHATGPT CODE THAT I AM TAKING BITS FROM - IT'S GOT IDEAS FOR CREATING REUSABLE FUNCTIONS FOR EACH OF THE TYPES OF CASTING I NEED TO DO 

from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

def cast_column_to_integer(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE INTEGER
    USING {column_name}::INTEGER;
    """
    connection.execute(text(sql))

def cast_column_to_date(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE DATE
    USING {column_name}::DATE;
    """
    connection.execute(text(sql))

def cast_column_to_boolean(connection, table_name, column_name):
    sql = f"""
    ALTER TABLE {table_name}
    ALTER COLUMN {column_name} TYPE BOOLEAN
    USING {column_name}::BOOLEAN;
    """
    connection.execute(text(sql))

# Add more functions as needed for different types

def run_all_casting_operations(conn_string):
    engine = create_engine(conn_string)
    with engine.connect() as connection:
        try:
            cast_column_to_integer(connection, 'users', 'age')
            cast_column_to_date(connection, 'users', 'birthdate')
            cast_column_to_boolean(connection, 'users', 'is_active')
            # Add more casting operations as needed
            print("All casting operations completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

if __name__ == '__main__':
    conn_string = 'postgresql://username:password@localhost:5432/mydatabase'
    run_all_casting_operations(conn_string)


In [None]:
cast_data()

In [None]:
#THIS IS OLD CODE THAT I PROBABLY WON'T USE 
def run_data_casting(conn_string):
    """
    Connects to the PostgreSQL database and performs data type casting
    on the 'age' column in the 'users' table.
    
    Args:
    conn_string (str): The connection string for the PostgreSQL database.
    """
    # Create an engine to connect to the database
    engine = create_engine(conn_string)
    
    # Use a context manager to handle the connection
    with engine.connect() as connection:
        try:
            # Define the SQL query to clean data
            clean_data_sql = """
            UPDATE users
            SET age = NULL
            WHERE age !~ '^[0-9]+$';
            """
            # Convert the SQL string to a TextClause object
            clean_data_query = text(clean_data_sql)
            
            # Execute the query
            connection.execute(clean_data_query)

            # Define the SQL query to alter the column data type
            alter_column_sql = """
            ALTER TABLE users
            ALTER COLUMN age TYPE INTEGER
            USING age::INTEGER;
            """
            # Convert the SQL string to a TextClause object
            alter_column_query = text(alter_column_sql)
            
            # Execute the query
            connection.execute(alter_column_query)

            print("Data type casting completed successfully.")
        except SQLAlchemyError as e:
            print(f"An error occurred: {e}")

In [None]:

# #VARCHAR 

#     # Create function to clean card number data with regex
#     def clean_numbers(connection, table_name, column_name):
#         clean_numbers_sql = f"""
#         UPDATE {table_name}
#         SET {column_name} = NULL
#         WHERE {column_name} !~ '^[0-9]+$';
#         """  
#         connection.execute(text(clean_numbers_sql))

#     # Function to determine the maximum length of values in the column
#     def get_max_length(connection, table_name, column_name):
#         max_length_sql = f"""
#         SELECT MAX(LENGTH({column_name})) 
#         FROM {table_name};
#         """
#         result = connection.execute(text(max_length_sql)).fetchone()
#         return result[0]

#     # Function to convert the column to VARCHAR with the determined maximum length
#     def convert_numbers(connection, table_name, column_name, max_length): 
#         convert_numbers_sql = f"""
#         ALTER TABLE {table_name}
#         ALTER COLUMN {column_name} TYPE VARCHAR({max_length})
#         USING {column_name}::VARCHAR({max_length});
#         """
#         connection.execute(text(convert_numbers_sql))

  
