In [2]:
import pandas as pd
import logging 
import numpy as np
import re 
import uuid
from IPython.display import display
from sqlalchemy import MetaData, Table
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from data_cleaning import DataCleaning 

In [7]:

def clean_orders_data():

    """
    This method retrieves the 'orders_table' via the 'read_data_from_table' method from the DataExtractor 
    It then drops column '1', 'first_name' and 'last_name'

    Args: 
        None 

    Returns: 
        dataframe: A dataframe with column '1', 'first_name' and 'last_name' dropped  

    """

    #LOGGING, the start of the method 
    print('started clean_orders_data')

    #creating instance of dataextractor
    instance = DataExtractor()

    # get the 'orders_table' data via the 'read_data_from_table' method, and assign it to df 
    df = instance.read_data_from_table('orders_table')

    #display(df.info())
    #display(df.dtypes)

    # Check for non-string values
    non_string_values = df['card_number'].apply(lambda x: not isinstance(x, str))
    print("Non-string values:\n", df[non_string_values])

    # Convert all values to strings
    #df['card_number'] = df['card_number'].astype(str)

    # Cast 'date_uuid' column to UUID
    df['date_uuid'] = df['date_uuid'].apply(lambda value: uuid.UUID(value))

    # Cast 'user_uuid' column to UUID
    df['user_uuid'] = df['user_uuid'].apply(lambda value: uuid.UUID(value))

    # Get the maximum length for 'card_number' column
    max_length_card_number = df['card_number'].str.len().max()

    # Get the maximum length for 'store_code' column
    max_length_store_code = df['store_code'].str.len().max() 

    # Get the maximum length for 'product_code' column
    max_length_product_code= df['product_code'].str.len().max() 

    # Create VARCHAR column for 'card_number' with the right length 
    df['card_number'] = df['card_number'].astype(f'STR{max_length_card_number}')

    # Create VARCHAR column for 'store_code' with the right length 
    df['store_code'] = df['store_code'].astype(f'STR{max_length_store_code}')

    # Create VARCHAR column for 'product_code' with the right lengt
    df['product_code'] = df['product_code'].astype(f'STR{max_length_product_code}')

    # Cast 'bigint' column to SMALLINT, 'int16' is equivalent to SMALLINT
    df['product_quantity'] = df['product_quantity'].astype('int16')   

    # drop unwanted columns 
    df = df.drop('1', axis=1)
    df = df.drop('first_name', axis=1)
    df = df.drop('last_name', axis=1)

    # return the cleaned df 
    return df 


In [8]:
clean_orders_data()

started clean_orders_data
read_data_from_table is working
init_db_engine is working
read_db_creds is working
Non-string values:
         level_0   index                             date_uuid first_name  \
0             0       0  9476f17e-5d6a-4117-874d-9cdb38ca1fa6       None   
1             1       1  0423a395-a04d-4e4a-bd0f-d237cbd5a295       None   
2             2       2  65187294-bb16-4519-adc0-787bbe423970       None   
3             3       3  579e21f7-13cb-436b-83ad-33687a4eb337       None   
4             4       4  00ab86c3-2039-4674-b9c1-adbcbbf525bd       None   
...         ...     ...                                   ...        ...   
120118   110549  110548  f0e8fff6-9998-4661-954b-0e258e09d33c       None   
120119    82164   82164  1c80940a-d186-4ba9-9daa-8abd1aceae32       None   
120120    97599   97599  58598aca-049c-418e-8e39-46327634a7f1     Sharon   
120121   106591  106591  3a76f661-0707-4fbc-9862-f21d3249f581       None   
120122   118806  118804  98fdc8b3-f

TypeError: data type 'STR19' not understood