In [44]:
from sqlalchemy import MetaData, Table
import pandas as pd
from IPython.display import display
from database_utils import DatabaseConnector

class DataExtractor:
    """Pull data out of the project database into pandas DataFrames.

    Every method builds its own ``DatabaseConnector``; the class itself holds
    no state.
    """

    def __init__(self):
        """Nothing to initialise: the extractor is stateless."""
        pass

    def read_data_from_table(self, table_name):
        """Load every row of ``table_name`` into a DataFrame.

        Args:
            table_name: Name of the database table to reflect and read.

        Returns:
            A DataFrame with one column per reflected table column, or
            ``None`` when reflecting the table or running the query fails
            (the error is printed rather than raised).
        """
        db_connector = DatabaseConnector()
        # Presumably a SQLAlchemy Engine: it is used below via connect() and
        # as the reflection target (autoload_with).
        engine = db_connector.init_db_engine()

        try:
            # MetaData is the catalogue that the reflected Table registers itself in.
            metadata = MetaData()

            # autoload_with=engine makes SQLAlchemy read the table's schema
            # from the database instead of requiring the columns to be declared here.
            try:
                table = Table(table_name, metadata, autoload_with=engine)
            except Exception as e:
                print(f"Error reflecting table {table_name}: {e}")
                return None

            try:
                with engine.connect() as connection:
                    # SELECT * equivalent built from the reflected table.
                    select_query = table.select()
                    result_of_query = connection.execute(select_query)
                    data = result_of_query.fetchall()
                    df = pd.DataFrame(data, columns=[column.name for column in table.columns])
            except Exception as e:
                print(f"Error making database connection / retrieving data {table_name}: {e}")
                return None

            return df
        finally:
            # A new engine is requested on every call, so release its
            # connection pool here; otherwise pooled connections pile up
            # each time a table is read.
            engine.dispose()

    def read_rds_table(self):
        """Return the database's table names as a DataFrame.

        Despite the name, no table contents are read: the result is simply
        whatever ``DatabaseConnector.list_db_tables()`` returns, wrapped in a
        DataFrame.
        """
        db_connector = DatabaseConnector()
        name_of_table = db_connector.list_db_tables()
        dataframe = pd.DataFrame(name_of_table)
        return dataframe

# Create an instance of DataExtractor
instance = DataExtractor()

# Read data from the 'legacy_users' table
df = instance.read_data_from_table('legacy_users')

# read_data_from_table reports failures by printing and returning None.
# Stop here with a clear message instead of letting df.head() fail with an
# unrelated-looking AttributeError.
if df is None:
    raise RuntimeError("Could not load 'legacy_users'; see the error printed above.")

# Last expression of the cell: rendered as the cell's rich output.
df.head()

read_db_creds is working
init_db_engine is working


Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579


In [45]:
# Quick look at the distinct values in the two country-related columns.
# sep='\n' puts the heading and the array on separate lines.
print('this is the unique country values:', df['country'].unique(), sep='\n')

print('this is unique country_codes:', df['country_code'].unique(), sep='\n')

this is the unique country values:
['Germany' 'United Kingdom' 'United States' 'I7G4DMDZOZ' 'NULL'
 'AJ1ENKS3QL' 'XGI7FM0VBJ' 'S0E37H52ON' 'XN9NGL5C0B' '50KUU3PQUF'
 'EWE3U0DZIV' 'GMRBOMI0O1' 'YOTSVPRBQ7' '5EFAFD0JLI' 'PNRMPSYR1J'
 'RQRB7RMTAD' '3518UD5CE8' '7ZNO5EBALT' 'T4WBZSW0XI']
this is unique country_codes:
['DE' 'GB' 'US' 'VSM4IZ4EL3' 'NULL' 'QVUW9JSKY3' 'GGB' '0CU6LW3NKB'
 'PG8MOC0UZI' 'NTCGYW8LVC' 'FB13AKRI21' 'OS2P9CMHR6' '5D74J6FPFJ'
 'LZGTB0T5Z7' 'IM8MN1L9MJ' 'RVRFD92E48' 'XKI9UXSCZ1' 'QREF9WLI2A'
 'XPVCZE2L8B' '44YAIDY048']


In [46]:
# Distinct values currently present in the 'country' column.
list_of_country_values = df['country'].unique()

print('this is the country values:')
print(list_of_country_values, '\n')

print('this is my list of country values:\n', list_of_country_values, '\n')

# Values that must NOT be dropped. 'NULL' is kept here on purpose so those
# rows can be inspected separately.
exclude_list = ['Germany', 'United Kingdom', 'United States', 'NULL']

# Everything else found in the column is treated as a value to drop.
list_of_values_to_drop = list(
    filter(lambda value: value not in exclude_list, list_of_country_values)
)

print('this is my list of values to drop:', list_of_values_to_drop, '\n')

# Query expression; '@' lets DataFrame.query read the Python list above.
query_str = 'country in @list_of_values_to_drop'

# Index labels of every row whose country is one of the values to drop.
list_of_indices = df.query(query_str).index.tolist()

print('this is my list of indices', list_of_indices, '\n')

# drop() returns a new frame; df itself is left unchanged.
cleaned_country_values_df = df.drop(index=list_of_indices)

# Before/after comparison of the distinct country values.
print('unique values in original dataframe:')
print(df['country'].unique(), '\n')

print('unique values in new dataframe:')
print(cleaned_country_values_df['country'].unique(), '\n')

this is the country values:
['Germany' 'United Kingdom' 'United States' 'I7G4DMDZOZ' 'NULL'
 'AJ1ENKS3QL' 'XGI7FM0VBJ' 'S0E37H52ON' 'XN9NGL5C0B' '50KUU3PQUF'
 'EWE3U0DZIV' 'GMRBOMI0O1' 'YOTSVPRBQ7' '5EFAFD0JLI' 'PNRMPSYR1J'
 'RQRB7RMTAD' '3518UD5CE8' '7ZNO5EBALT' 'T4WBZSW0XI'] 

this is my list of country values:
 ['Germany' 'United Kingdom' 'United States' 'I7G4DMDZOZ' 'NULL'
 'AJ1ENKS3QL' 'XGI7FM0VBJ' 'S0E37H52ON' 'XN9NGL5C0B' '50KUU3PQUF'
 'EWE3U0DZIV' 'GMRBOMI0O1' 'YOTSVPRBQ7' '5EFAFD0JLI' 'PNRMPSYR1J'
 'RQRB7RMTAD' '3518UD5CE8' '7ZNO5EBALT' 'T4WBZSW0XI'] 

this is my list of values to drop: ['I7G4DMDZOZ', 'AJ1ENKS3QL', 'XGI7FM0VBJ', 'S0E37H52ON', 'XN9NGL5C0B', '50KUU3PQUF', 'EWE3U0DZIV', 'GMRBOMI0O1', 'YOTSVPRBQ7', '5EFAFD0JLI', 'PNRMPSYR1J', 'RQRB7RMTAD', '3518UD5CE8', '7ZNO5EBALT', 'T4WBZSW0XI'] 

this is my list of indices [752, 1046, 2995, 3536, 5306, 6420, 8386, 9013, 10211, 10360, 11366, 12177, 13111, 14101, 14499] 

unique values in original dataframe:
['Germany' 'United Ki

In [47]:
# Sanity check: distinct country / country_code values after the row drop.
print('this is the new_df country values:', cleaned_country_values_df['country'].unique(), sep='\n')

print('this is new_df country_code:', cleaned_country_values_df['country_code'].unique(), sep='\n')

this is the new_df country values:
['Germany' 'United Kingdom' 'United States' 'NULL']
this is new_df country_code:
['DE' 'GB' 'US' 'NULL' 'GGB']


In [48]:
# Correct the mistyped country code 'GGB' to 'GB'.

print('this is the country_codes values before cleaning:', cleaned_country_values_df['country_code'].unique(), '\n')

# assign() builds a new frame with the corrected column, so
# cleaned_country_values_df keeps its original codes for comparison.
cleaned_country_values_and_codes_df = cleaned_country_values_df.assign(
    country_code=cleaned_country_values_df['country_code'].replace('GGB', 'GB')
)

# The source frame should still show 'GGB'; the new frame should not.
print('ths is cleaned_country_values_df:', cleaned_country_values_df['country_code'].unique(), '\n')

print('ths is cleaned_country_values__and_codes_df:', cleaned_country_values_and_codes_df['country_code'].unique(), '\n')




this is the country_codes values before cleaning: ['DE' 'GB' 'US' 'NULL' 'GGB'] 

ths is cleaned_country_values_df: ['DE' 'GB' 'US' 'NULL' 'GGB'] 

ths is cleaned_country_values__and_codes_df: ['DE' 'GB' 'US' 'NULL'] 



In [49]:
# Find the rows whose country or country_code holds the literal string
# "NULL", keep a copy of them for inspection, then drop them.

is_null_row = (
    (cleaned_country_values_and_codes_df['country'] == "NULL")
    | (cleaned_country_values_and_codes_df['country_code'] == "NULL")
)
list_of_null_rows = cleaned_country_values_and_codes_df.index[is_null_row].tolist()

print(list_of_null_rows)
print('length of null rows:', len(list_of_null_rows))
print('before dropping rows:', len(cleaned_country_values_and_codes_df))

# The flagged rows on their own, available for manual inspection.
null_rows_df = cleaned_country_values_and_codes_df.loc[list_of_null_rows]

display(cleaned_country_values_and_codes_df)

# NOTE: the name is rebound to the reduced frame, so re-running this cell
# finds no "NULL" rows the second time.
cleaned_country_values_and_codes_df = cleaned_country_values_and_codes_df.drop(index=list_of_null_rows)

print('now after cleaning')
display(cleaned_country_values_and_codes_df)
print('after dropping rows:', len(cleaned_country_values_and_codes_df))



[866, 1022, 1805, 2103, 2437, 2739, 2764, 4984, 5307, 6920, 7737, 10013, 10224, 10988, 11443, 11598, 11761, 11864, 12092, 12584, 13855]
length of null rows: 21
before dropping rows: 15305


Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579
...,...,...,...,...,...,...,...,...,...,...,...,...
15315,14913,Stephen,Jenkins,1943-08-09,"Thornton, Carroll and Newman",s.jenkins@smith.com,Studio 41I\nJones lodge\nOliviaborough\nE8 3DU,United Kingdom,GB,+44(0)292018946,2016-04-15,2bd3a12f-a92d-4cdd-b99c-fc70572db302
15316,14994,Stephen,Smith,1948-08-20,Robinson-Harris,s.smith@smith.com,530 Young parkway\nMillsfurt\nL4G 7NX,United Kingdom,GB,+44(0)1144960977,2020-07-20,d234c04b-c07c-46a5-a902-526f91478ecc
15317,15012,Stephen,Losekann,1940-10-09,Rosenow,s.losekann@smith.com,Viviane-Fritsch-Straße 3/5\n15064 Bad Liebenwerda,Germany,DE,02984 08192,2021-03-07,1a0a8b7b-7c17-42d8-a946-8a85d5495651
15318,15269,Stephen,Rivera,1952-06-04,"Taylor, Fry and Jones",s.rivera@smith.com,"660 Ross Falls Suite 357\nAnthonymouth, MA 09610",United States,US,239.711.3836,2011-01-03,187fe06e-bd5f-4381-af2f-d7ac37ca7572


now after cleaning


Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid
0,0,Sigfried,Noack,1990-09-30,Heydrich Junitz KG,rudi79@winkler.de,Zimmerstr. 1/0\n59015 Gießen,Germany,DE,+49(0) 047905356,2018-10-10,93caf182-e4e9-4c6e-bebb-60a1a9dcf9b8
1,1,Guy,Allen,1940-12-01,Fox Ltd,rhodesclifford@henderson.com,Studio 22a\nLynne terrace\nMcCarthymouth\nTF0 9GH,United Kingdom,GB,(0161) 496 0674,2001-12-20,8fe96c3a-d62d-4eb5-b313-cf12d9126a49
2,2,Harry,Lawrence,1995-08-02,"Johnson, Jones and Harris",glen98@bryant-marshall.co.uk,92 Ann drive\nJoanborough\nSK0 6LR,United Kingdom,GB,+44(0)121 4960340,2016-12-16,fc461df4-b919-48b2-909e-55c95a03fe6b
3,3,Darren,Hussain,1972-09-23,Wheeler LLC,daniellebryan@thompson.org,19 Robinson meadow\nNew Tracy\nW22 2QG,United Kingdom,GB,(0306) 999 0871,2004-02-23,6104719f-ef14-4b09-bf04-fb0c4620acb0
4,4,Garry,Stone,1952-12-20,Warner Inc,billy14@long-warren.com,3 White pass\nHunterborough\nNN96 4UE,United Kingdom,GB,0121 496 0225,2006-09-01,9523a6d3-b2dd-4670-a51a-36aebc89f579
...,...,...,...,...,...,...,...,...,...,...,...,...
15315,14913,Stephen,Jenkins,1943-08-09,"Thornton, Carroll and Newman",s.jenkins@smith.com,Studio 41I\nJones lodge\nOliviaborough\nE8 3DU,United Kingdom,GB,+44(0)292018946,2016-04-15,2bd3a12f-a92d-4cdd-b99c-fc70572db302
15316,14994,Stephen,Smith,1948-08-20,Robinson-Harris,s.smith@smith.com,530 Young parkway\nMillsfurt\nL4G 7NX,United Kingdom,GB,+44(0)1144960977,2020-07-20,d234c04b-c07c-46a5-a902-526f91478ecc
15317,15012,Stephen,Losekann,1940-10-09,Rosenow,s.losekann@smith.com,Viviane-Fritsch-Straße 3/5\n15064 Bad Liebenwerda,Germany,DE,02984 08192,2021-03-07,1a0a8b7b-7c17-42d8-a946-8a85d5495651
15318,15269,Stephen,Rivera,1952-06-04,"Taylor, Fry and Jones",s.rivera@smith.com,"660 Ross Falls Suite 357\nAnthonymouth, MA 09610",United States,US,239.711.3836,2011-01-03,187fe06e-bd5f-4381-af2f-d7ac37ca7572


after dropping rows: 15284


In [50]:
# Sanity check: distinct country / country_code values once the "NULL" rows are gone.
print('this is the new_df country values:', cleaned_country_values_and_codes_df['country'].unique(), sep='\n')

print('this is new_df country_code:', cleaned_country_values_and_codes_df['country_code'].unique(), sep='\n')

this is the new_df country values:
['Germany' 'United Kingdom' 'United States']
this is new_df country_code:
['DE' 'GB' 'US']


In [51]:
# Parse the date_of_birth column into datetimes.
# Work on a copy so the country-cleaned frame is left as it was.
cleaned_date_df = cleaned_country_values_and_codes_df.copy()

# Remember one raw value (positional row 360) to compare against after parsing.
original_date_of_birth_example = cleaned_date_df['date_of_birth'].iloc[360]
print('original date_of_birth before pd.to_datetime():\n', original_date_of_birth_example)

# format='mixed' infers the format per element; errors='coerce' turns
# anything unparseable into NaT instead of raising.
cleaned_date_df['date_of_birth'] = pd.to_datetime(
    cleaned_date_df['date_of_birth'],
    format='mixed',
    errors='coerce',
)

# Rows that ended up as NaT, i.e. values that could not be parsed.
invalid_dates = cleaned_date_df.loc[cleaned_date_df['date_of_birth'].isna()]
print('number of rows that convert to NaT', len(invalid_dates))
display('these are the invalid dates:', invalid_dates)

# Show the same row again, now as a parsed timestamp.
date_of_birth_converted_example = cleaned_date_df['date_of_birth'].iloc[360]
print('again, the original date_of_birth before pd.to_datetime():\n', original_date_of_birth_example)
print('date_of_birth after pd.to_datetime:\n', date_of_birth_converted_example)


original date_of_birth before pd.to_datetime():
 1968 October 16
number of rows that convert to NaT 0


'these are the invalid dates:'

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid


again, the original date_of_birth before pd.to_datetime():
 1968 October 16
date_of_birth after pd.to_datetime:
 1968-10-16 00:00:00


In [52]:
# Parse the join_date column into datetimes.
# Start from a copy of the frame whose date_of_birth is already parsed.
cleaned_join_date_df = cleaned_date_df.copy()

# Remember one raw value (positional row 202) to compare against after parsing.
original_join_date_example = cleaned_join_date_df['join_date'].iloc[202]
print('original join_date:\n', original_join_date_example)

# format='mixed' infers the format per element; errors='coerce' turns
# anything unparseable into NaT instead of raising.
cleaned_join_date_df['join_date'] = pd.to_datetime(
    cleaned_join_date_df['join_date'],
    format='mixed',
    errors='coerce',
)

# Rows that ended up as NaT, i.e. values that could not be parsed.
invalid_dates = cleaned_join_date_df.loc[cleaned_join_date_df['join_date'].isna()]
print('number of rows that convert to NaT', len(invalid_dates))
display('these are the invalid dates:', invalid_dates)

# Show the same row again, now as a parsed timestamp.
join_date_converted_example = cleaned_join_date_df['join_date'].iloc[202]
print('original join_date:\n', original_join_date_example)
print('join_date after pd.to_datetime:\n', join_date_converted_example)

original join_date:
 2006 September 03
number of rows that convert to NaT 0


'these are the invalid dates:'

Unnamed: 0,index,first_name,last_name,date_of_birth,company,email_address,address,country,country_code,phone_number,join_date,user_uuid


original join_date:
 2006 September 03
join_date after pd.to_datetime:
 2006-09-03 00:00:00


In [53]:
# Normalise first_name / last_name (lower-case, trimmed) and drop exact
# duplicate rows. Starts from a copy of the date-cleaned frame.
lower_case_df = cleaned_join_date_df.copy()

# Keep the first row's name as a before/after reference.
before_lower_case_example = lower_case_df['first_name'].iloc[0]
print('this is the name before transformation:', before_lower_case_example)

# Lower-case first, then strip surrounding whitespace.
lower_case_df['first_name'] = lower_case_df['first_name'].str.lower().str.strip()
lower_case_df['last_name'] = lower_case_df['last_name'].str.lower().str.strip()

# drop_duplicates() with no arguments compares whole rows (every column).
print('row count before drop duplicates', len(lower_case_df))
lower_case_df = lower_case_df.drop_duplicates()
print('row count after drop duplicates', len(lower_case_df))

# Print the reference row before and after to confirm the change.
after_lower_case_example = lower_case_df['first_name'].iloc[0]
print('again, this is the name before transformation:', before_lower_case_example)
print('this is the name after transformation:', after_lower_case_example)

this is the name before transformation: Sigfried
row count before drop duplicates 15284
row count after drop duplicates 15284
again, this is the name before transformation: Sigfried
this is the name after transformation: sigfried
