In [2]:
import pandas as pd
import logging 
import numpy as np
import re 
from IPython.display import display
from sqlalchemy import MetaData, Table
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from unidecode import unidecode
from dateutil import parser

In [3]:
#this is ok and is combined with the data_cleaning file 

In [4]:
 #LOGGING start of method 
print('started convert_weights_to_kg')

#creating instance of dataextractor 
instance = DataExtractor()
    
#retrieving the data from the stores API
df = instance.extract_from_s3('s3://data-handling-public/products.csv') 
                
        


started convert_weights_to_kg
extract_from_s3 is working


In [5]:
#making a back up of the dataframe 
df_backup = df.copy()

In [6]:
# run this to reset the df 
df = df_backup.copy()

In [7]:
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     1853 non-null   int64 
 1   product_name   1849 non-null   object
 2   product_price  1849 non-null   object
 3   weight         1849 non-null   object
 4   category       1849 non-null   object
 5   EAN            1849 non-null   object
 6   date_added     1849 non-null   object
 7   uuid           1849 non-null   object
 8   removed        1849 non-null   object
 9   product_code   1849 non-null   object
dtypes: int64(1), object(9)
memory usage: 144.9+ KB


None

In [8]:
display(df['removed'].unique())

array(['Still_avaliable', 'Removed', nan, 'T3QRRH7SRP', 'BPSADIOQOK',
       'H5N71TV8AY'], dtype=object)

In [9]:
nan_rows = df.loc[df['removed'] == 'nan']

display(nan_rows) 
#na_rows = df.loc[df['lat'] == 'N/A']

Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code


In [10]:
df = df_backup.copy()


num_rows = df.shape[0]
display(f"Number of rows before cleaning: {num_rows}")
# Define the regex pattern to match numbers and letters
weight_pattern = re.compile(r'([0-9.]+)([a-zA-Z]+)')

# Define the regex pattern to match the weight multiplier format
weight_multiplier = re.compile(r'^(\d+)\s*x\s*(\d+)([a-zA-Z]+)$')

# Conversion factors to kg
conversion_factors = {
    'kg': 1,
    'g': 0.001,
    'oz': 0.0283495231,
    'ml': 0.001  # Assuming ml is equivalent to grams for water-based products
}

def convert_to_kg(weight):
    if pd.isna(weight):
        return None  # Handle NaN values
    
    match1 = weight_pattern.match(str(weight))
    match2 = weight_multiplier.match(str(weight))
    
    if match1:
        number = float(match1.group(1))  # Extract the numeric part
        unit = match1.group(2).lower()  # Extract the unit part
        return number * conversion_factors.get(unit, 0)  # Convert to kg
    
    elif match2:
        multiplier = int(match2.group(1))  # Get the multiplier
        amount = float(match2.group(2))  # Extract the amount
        unit = match2.group(3).lower()  # Extract the unit
        return (multiplier * amount) * conversion_factors.get(unit, 0)  # Convert to kg
    
    return None  # Handle cases where regex does not match

# Apply the conversion to the 'weight' column
df['weight_in_kg'] = df['weight'].apply(convert_to_kg)

num_rows = df.shape[0]
display(f"Number of rows after conversion: {num_rows}")

# Correct the misspelled value
#df['removed'] = df['removed'].replace('Still_avaliable', 'Still_available')
df['removed'] = df['removed'].str.replace('Still_avaliable', 'Still_available')

num_rows = df.shape[0]
display(f"Number of rows after replace still available: {num_rows}")

display(df['removed'].unique())

# Filter the DataFrame to keep only 'Still_available' or 'Removed'
# Correct the misspelled value using str.replace

# Define valid values
valid_values = ['Still_available', 'Removed']

# Identify rows that will be removed
rows_to_remove = df[~df['removed'].isin(valid_values)]

# Display the list of items that will be removed
print("Items to be removed:")
print(rows_to_remove)

# Filter the DataFrame to keep only 'Still_available' or 'Removed'
df = df[df['removed'].isin(valid_values)]


num_rows = df.shape[0]
display(f"Number of rows after cleaning of removed column: {num_rows}")


# Capture rows with NaN values before dropping them
rows_with_na = df[df.isna().any(axis=1)]
display("Rows with NaN values:")
display(rows_with_na)

# Drop rows with NaN values
df = df.dropna(axis=0)

num_rows = df.shape[0]
display(f"Number of rows after dropna: {num_rows}")

# Define the regex pattern for the 'category' column
category_pattern = re.compile(r'^[a-zA-Z\-]+$')

# Capture rows that will be dropped by the category filter
rows_dropped_by_category = df[~df['category'].apply(lambda x: bool(category_pattern.match(str(x))))]

display("Rows that will be dropped by category filter:")
display(rows_dropped_by_category)

# Filter rows based on the category pattern
df = df[df['category'].apply(lambda x: bool(category_pattern.match(str(x))))]

num_rows = df.shape[0]
display(f"Number of rows after category filter: {num_rows}")


# STEP 5, cleaning date_added 

num_rows = df.shape[0]
print(f"Number of rows before opening_date cleaning: {num_rows}")

# Initialize a list to store invalid dates
invalid_dates_list = []

# Function to parse dates and standardize format
def parse_date(date_str):
    try:
        # Attempt to parse the date string to a datetime object
        dt = parser.parse(date_str)
        # Convert to the desired format (YYYY-MM-DD)
        return dt.strftime('%Y-%m-%d')
    except (parser.ParserError, ValueError):
        # Append invalid date to the list
        invalid_dates_list.append(date_str)
        return np.nan  # Return NaN for invalid dates

# Apply the function to the 'date_of_birth' column
df['date_added'] = df['date_added'].apply(parse_date)

# Identify rows that would be null after conversion
invalid_rows = df[df['date_added'].isna()]

display("Rows that would be converted to NULL:")
display(invalid_rows)

# Drop rows with NaN (invalid dates)
df_cleaned = df.dropna(subset=['date_added'])

display("\nList of invalid dates:")
display(invalid_dates_list)

num_rows = df.shape[0]
print(f"Number of rows after date_added cleaning: {num_rows}")

num_rows = df.shape[0]
print(f"Number of rows product data after cleaning: {num_rows}")


'Number of rows before cleaning: 1853'

'Number of rows after conversion: 1853'

'Number of rows after replace still available: 1853'

array(['Still_available', 'Removed', nan, 'T3QRRH7SRP', 'BPSADIOQOK',
       'H5N71TV8AY'], dtype=object)

Items to be removed:
      Unnamed: 0 product_name product_price      weight    category  \
266          266          NaN           NaN         NaN         NaN   
751          751   VLPCU81M30    XCD69KUI0K  9GO9NZ5JTL  S1YB74MLMJ   
788          788          NaN           NaN         NaN         NaN   
794          794          NaN           NaN         NaN         NaN   
1133        1133   9SX4G65YUX    N9D2BZQX63  Z8ZTDGUZVU  C3NCA2CL35   
1400        1400   LB3D71C025    ODPMASE7V7  MX180RYSHX  WVPMHZP59U   
1660        1660          NaN           NaN         NaN         NaN   

             EAN  date_added        uuid     removed product_code  \
266          NaN         NaN         NaN         NaN          NaN   
751   OO7KH8P79I  CCAVRB79VV  7QB0Z9EW1G  T3QRRH7SRP   SDAV678FVD   
788          NaN         NaN         NaN         NaN          NaN   
794          NaN         NaN         NaN         NaN          NaN   
1133  E8EOGWOY8S  09KREHTMWL  CP8XYQVGGU  BPSADIOQOK  BSDTR67VD90

'Number of rows after cleaning of removed column: 1846'

'Rows with NaN values:'

Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg


'Number of rows after dropna: 1846'

'Rows that will be dropped by category filter:'

Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg


'Number of rows after category filter: 1846'

Number of rows before opening_date cleaning: 1846


'Rows that would be converted to NULL:'

Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg


'\nList of invalid dates:'

[]

Number of rows after date_added cleaning: 1846
Number of rows product data after cleaning: 1846
