In [152]:
import boto3
import pandas as pd
from io import StringIO
from io import BytesIO
import re


In [153]:
def extract_from_s3(s3_uri):
    
    #extract the bucket name and key from the URI 
    uri = re.match(r's3://([^/]+)/(.+)', s3_uri)
    if not uri:
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    bucket = uri.group(1)
    key = uri.group(2)
    #print('bucket name is:', bucket, '; key name is', key)
    
    #create a client (like a customer service representative going to talk to the S3 dept for me)
    s3 = boto3.client('s3')
 
    # Using a context manager to handle the BytesIO buffer
    with BytesIO() as buffer:
        # Download the file from S3 into the buffer
        s3.download_fileobj(bucket, key, buffer)
    
        # Move to the beginning of the buffer to read its content
        buffer.seek(0)
        
        # Load the data into a pandas DataFrame
        df = pd.read_csv(buffer)
        
    return df 

df = extract_from_s3('s3://data-handling-public/products.csv')

display(df)

Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code
0,0,FurReal Dazzlin' Dimples My Playful Dolphin,£39.99,1.6kg,toys-and-games,7425710935115,2005-12-02,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h
1,1,Tiffany's World Day Out At The Park,£12.99,0.48kg,toys-and-games,487128731892,2006-01-09,712254d7-aea7-4310-aff8-8bcdd0aec7ff,Still_avaliable,C2-7287916l
2,2,Tiffany's World Pups Picnic Playset,£7.00,590g,toys-and-games,1945816904649,1997-03-29,b089ef6f-b628-4e37-811d-fffe0102ba64,Still_avaliable,S7-1175877v
3,3,Tiffany's World Wildlife Park Adventures,£12.99,540g,toys-and-games,1569790890899,2013-03-20,d55de422-8b98-47d6-9991-e4bc4c5c0cb0,Removed,D8-8421505n
4,4,Cosatto Cosy Dolls Pram,£30.00,1.91kg,toys-and-games,7142740213920,2007-12-23,7945b657-cb02-4cc5-96cf-f65ed0a8f235,Still_avaliable,B6-2596063a
...,...,...,...,...,...,...,...,...,...,...
1848,1848,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,8185827751600,2020-08-28,3ff21cb1-8e90-499d-b5a0-6839fb83e70f,Still_avaliable,x4-2227164k
1849,1849,Goodmans Qi Wireless Charging Phone Stand,£15.00,134g,diy,9769724533935,2012-05-20,a7910f74-c2b6-4144-98b7-30ca68f34d2d,Still_avaliable,i3-8338545Z
1850,1850,RAC 12V Wet & Dry Vacuum,£18.00,820g,diy,3113855860327,1993-11-28,66d3efa0-f480-4da9-893b-e8aaff53e27b,Still_avaliable,n4-1642658b
1851,1851,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,9195087467400,2016-09-20,efb472f1-8f00-448e-a874-9fa9812eeab5,Still_avaliable,A3-7619070S


In [154]:

df['weight'].unique()


    

array(['1.6kg', '0.48kg', '590g', '540g', '1.91kg', '0.91kg', '0.46kg',
       '0.38kg', '8.981kg', '1.478kg', '1.2g', '0.66kg', '1.8kg', '1.9kg',
       '1.725kg', '0.54kg', '0.322kg', '0.71kg', '0.88kg', '0.67kg',
       '11.076kg', '4kg', '0.385kg', '1.38kg', '2.57kg', '1.35kg',
       '0.695kg', '1.15kg', '0.98kg', '1.447kg', '1.3625kg', '2.25kg',
       '0.79kg', '0.8kg', '1.08kg', '2.476kg', '0.137kg', '11.5kg',
       '0.44kg', '2.75kg', '0.911kg', '0.33kg', '1kg', '0.5kg', '0.45kg',
       '0.7kg', '0.41kg', '1.3kg', '2kg', '0.34kg', '0.37kg', '0.76kg',
       '1.18kg', '0.685kg', '1.59kg', '1.4kg', '1.66kg', '13.5kg',
       '0.745kg', '1.44kg', '0.74kg', '0.660kg', '0.419kg', '0.418kg',
       '0.470kg', '0.353kg', '0.350kg', '0.96kg', '1.20kg', '1.21kg',
       '1.02kg', '0.365kg', '0.677kg', '0.55kg', '0.43kg', '0.11kg',
       '1.23kg', '1.03kg', '0.87kg', '0.39kg', '0.35kg', '0.42kg',
       '0.27kg', '726g', '0.61kg', '0.864kg', '0.667kg', '0.63kg',
       '0.72kg', '0.5

In [155]:
import re
import pandas as pd



def convert_weights_to_kg(): #should contain self
    # Define the regex pattern to match numbers and letters
    pattern = re.compile(r'([0-9.]+)([a-zA-Z]+)')

    # Conversion factors to kg
    conversion_factors = {
        'kg': 1,
        'g': 0.001,
        'oz': 0.0283495231,
        'ml': 0.001  # Assuming ml is equivalent to grams for water-based products
    }

    def convert_to_kg(weight):
        if pd.isna(weight):
            return None  # Handle NaN values
        match = pattern.match(str(weight))
        if match:
            number = float(match.group(1))  # Extract the numeric part
            unit = match.group(2).lower()  # Extract the unit part
            return number * conversion_factors.get(unit, 0)  # Convert to kg
        return None  # Handle cases where regex does not match

    # Apply the conversion to the 'weights' column
    df['weight_in_kg'] = df['weight'].apply(convert_to_kg) # self.df['weight_in_kg'] = self.df['weights'].apply(convert_to_kg)


convert_weights_to_kg()

# Display the updated dataframe
display(df)





Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg
0,0,FurReal Dazzlin' Dimples My Playful Dolphin,£39.99,1.6kg,toys-and-games,7425710935115,2005-12-02,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h,1.600
1,1,Tiffany's World Day Out At The Park,£12.99,0.48kg,toys-and-games,487128731892,2006-01-09,712254d7-aea7-4310-aff8-8bcdd0aec7ff,Still_avaliable,C2-7287916l,0.480
2,2,Tiffany's World Pups Picnic Playset,£7.00,590g,toys-and-games,1945816904649,1997-03-29,b089ef6f-b628-4e37-811d-fffe0102ba64,Still_avaliable,S7-1175877v,0.590
3,3,Tiffany's World Wildlife Park Adventures,£12.99,540g,toys-and-games,1569790890899,2013-03-20,d55de422-8b98-47d6-9991-e4bc4c5c0cb0,Removed,D8-8421505n,0.540
4,4,Cosatto Cosy Dolls Pram,£30.00,1.91kg,toys-and-games,7142740213920,2007-12-23,7945b657-cb02-4cc5-96cf-f65ed0a8f235,Still_avaliable,B6-2596063a,1.910
...,...,...,...,...,...,...,...,...,...,...,...
1848,1848,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,8185827751600,2020-08-28,3ff21cb1-8e90-499d-b5a0-6839fb83e70f,Still_avaliable,x4-2227164k,0.125
1849,1849,Goodmans Qi Wireless Charging Phone Stand,£15.00,134g,diy,9769724533935,2012-05-20,a7910f74-c2b6-4144-98b7-30ca68f34d2d,Still_avaliable,i3-8338545Z,0.134
1850,1850,RAC 12V Wet & Dry Vacuum,£18.00,820g,diy,3113855860327,1993-11-28,66d3efa0-f480-4da9-893b-e8aaff53e27b,Still_avaliable,n4-1642658b,0.820
1851,1851,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,9195087467400,2016-09-20,efb472f1-8f00-448e-a874-9fa9812eeab5,Still_avaliable,A3-7619070S,0.125


In [156]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1853 entries, 0 to 1852
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1853 non-null   int64  
 1   product_name   1849 non-null   object 
 2   product_price  1849 non-null   object 
 3   weight         1849 non-null   object 
 4   category       1849 non-null   object 
 5   EAN            1849 non-null   object 
 6   date_added     1849 non-null   object 
 7   uuid           1849 non-null   object 
 8   removed        1849 non-null   object 
 9   product_code   1849 non-null   object 
 10  weight_in_kg   1819 non-null   float64
dtypes: float64(1), int64(1), object(9)
memory usage: 159.4+ KB


In [157]:
nan_count = df['weight_in_kg'].isna().sum()
print("Number of NaN values in 'weight_in_kg':", nan_count)

# Show the rows with NaN values in 'weight_in_kg'
nan_rows = df[df['weight_in_kg'].isna()]
print("Rows with NaN values in 'weight_in_kg':")
display(nan_rows)

Number of NaN values in 'weight_in_kg': 34
Rows with NaN values in 'weight_in_kg':


Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg
266,266,,,,,,,,,,
298,298,Pedigree Vital Protection Dog Food in Jelly 12...,£4.49,12 x 100g,pets,2439834307647,1995-06-25,5ec5a431-7330-4d9e-bf3c-702fc85f6efe,Still_avaliable,d4-9698287C,
299,299,Cesar Country Kitchen Dog Food 8 x 150g,£5.49,8 x 150g,pets,5158002633117,1994-03-22,c379d810-61bf-4ed2-994c-0d3308d5be7b,Still_avaliable,M4-1688574Q,
300,300,Pedigree Puppy Dog Food in Jelly 12pk,£4.49,12 x 100g,pets,5242405808651,2014-05-20,40963728-bdae-48f1-beec-7c6dd0225921,Still_avaliable,J4-5517838n,
301,301,Pedigree Dog Food in Gravy 12pk,£4.49,12 x 100g,pets,6160542808008,2020-12-20,3109d9e8-56b5-4233-a682-0bcdb38182b0,Still_avaliable,v6-1209149g,
302,302,Pedigree Dog Food in Jelly 12pk,£4.49,12 x 100g,pets,3928398867180,2006-03-27,28625483-3723-4eaa-9f14-a1a16affbed8,Still_avaliable,C5-0179057G,
303,303,Chappie Dog Food 6 x 412g,£5.99,6 x 412g,pets,8155594260063,1993-06-30,6b86f83c-fccf-42eb-b1d5-3971915c18a5,Still_avaliable,Z4-535642u,
304,304,Butcher's Puppy Meaty Chunks in Jelly Tins 6 x...,£4.35,6 x 400g,pets,9386802556862,2019-06-16,a21e8202-de56-40bf-aee9-1b65989b1030,Still_avaliable,x0-3876046p,
305,305,Pedigree Vital Protection Dog Food in Jelly 12...,£4.49,12 x 100g,pets,6045981116269,1996-10-12,39ce65ab-d873-4953-a363-af9a705ca1ef,Still_avaliable,O3-6314460b,
381,381,Sheba Classics Ocean Selection in Terrine 8 x 85g,£3.49,8 x 85g,pets,5864355297349,2014-01-01,96f4d10f-1cb0-4898-b170-8dcd54c78f05,Still_avaliable,I3-7208040s,


In [158]:
df['removed'].unique() 

array(['Still_avaliable', 'Removed', nan, 'T3QRRH7SRP', 'BPSADIOQOK',
       'H5N71TV8AY'], dtype=object)

In [159]:
#cleaning df 

def clean_products_data(df): 
    # Drop rows with any NaN values
    df = df.dropna()

    # Drop duplicate rows
    df = df.drop_duplicates()

    # Define the regex pattern to match valid product types
    pattern = re.compile(r'^[a-zA-Z-]+$')

    # Function to check if a value matches the expected product type format
    def is_valid_product_type(column_name):
        if pd.isna(column_name):
            return False
        return bool(pattern.match(str(column_name)))

    #check function is working 
    print('this is removed before applying function', df['removed'].unique()) 
    # Create a boolean mask for valid product types
    valid_product_type_mask = df['category'].apply(is_valid_product_type)

    # Drop rows with non-nonsensical product types
    df = df[valid_product_type_mask].reset_index(drop=True)

    print('this is removed after applying function', df['removed'].unique())

    return df

# Clean the DataFrame
df = clean_products_data(df)
display(df)

    
   


this is removed before applying function ['Still_avaliable' 'Removed' 'T3QRRH7SRP']
this is removed after applying function ['Still_avaliable' 'Removed']


Unnamed: 0.1,Unnamed: 0,product_name,product_price,weight,category,EAN,date_added,uuid,removed,product_code,weight_in_kg
0,0,FurReal Dazzlin' Dimples My Playful Dolphin,£39.99,1.6kg,toys-and-games,7425710935115,2005-12-02,83dc0a69-f96f-4c34-bcb7-928acae19a94,Still_avaliable,R7-3126933h,1.600
1,1,Tiffany's World Day Out At The Park,£12.99,0.48kg,toys-and-games,487128731892,2006-01-09,712254d7-aea7-4310-aff8-8bcdd0aec7ff,Still_avaliable,C2-7287916l,0.480
2,2,Tiffany's World Pups Picnic Playset,£7.00,590g,toys-and-games,1945816904649,1997-03-29,b089ef6f-b628-4e37-811d-fffe0102ba64,Still_avaliable,S7-1175877v,0.590
3,3,Tiffany's World Wildlife Park Adventures,£12.99,540g,toys-and-games,1569790890899,2013-03-20,d55de422-8b98-47d6-9991-e4bc4c5c0cb0,Removed,D8-8421505n,0.540
4,4,Cosatto Cosy Dolls Pram,£30.00,1.91kg,toys-and-games,7142740213920,2007-12-23,7945b657-cb02-4cc5-96cf-f65ed0a8f235,Still_avaliable,B6-2596063a,1.910
...,...,...,...,...,...,...,...,...,...,...,...
1813,1848,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,8185827751600,2020-08-28,3ff21cb1-8e90-499d-b5a0-6839fb83e70f,Still_avaliable,x4-2227164k,0.125
1814,1849,Goodmans Qi Wireless Charging Phone Stand,£15.00,134g,diy,9769724533935,2012-05-20,a7910f74-c2b6-4144-98b7-30ca68f34d2d,Still_avaliable,i3-8338545Z,0.134
1815,1850,RAC 12V Wet & Dry Vacuum,£18.00,820g,diy,3113855860327,1993-11-28,66d3efa0-f480-4da9-893b-e8aaff53e27b,Still_avaliable,n4-1642658b,0.820
1816,1851,Goodmans Qi Autosense In Car Phone Holder,£15.00,125g,diy,9195087467400,2016-09-20,efb472f1-8f00-448e-a874-9fa9812eeab5,Still_avaliable,A3-7619070S,0.125


In [160]:
df['removed'].unique() 

array(['Still_avaliable', 'Removed'], dtype=object)

In [161]:
df['category'].unique() 

array(['toys-and-games', 'sports-and-leisure', 'pets', 'homeware',
       'health-and-beauty', 'food-and-drink', 'diy'], dtype=object)