In [8]:
from database_utils import DatabaseConnector
from data_extraction import DataExtractor
from urllib.parse import urlparse
import boto3
import pandas as pd
from io import StringIO
from io import BytesIO
import re 

In [9]:
def extract_from_s3(uri):
    """Download a CSV or JSON object from S3 and load it into a DataFrame.

    Two URI forms are understood:

    * ``s3://<bucket>/<key>``
    * ``https://<bucket>.<rest-of-host>/<key>`` -- the bucket is taken to be
      the first dot-separated label of the host name. Path-style https URLs
      (bucket as the first path segment) are not recognised.

    The bucket, key and file extension are all validated before any S3 call
    is made, so malformed or unsupported URIs fail without network access.

    Args:
        uri: Location of the object, in one of the forms above. The key must
            end in ``.csv`` or ``.json`` (case-insensitive).

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_json`` from the
        downloaded bytes.

    Raises:
        ValueError: If the scheme is neither ``https`` nor ``s3``, if no
            bucket or key can be extracted, or if the extension is not
            ``csv``/``json``.
    """
    print('started extract from self')

    parsed_url = urlparse(uri)

    if parsed_url.scheme == 'https':
        # Virtual-hosted-style URL: bucket is the first label of the host.
        bucket = parsed_url.netloc.split('.')[0]
    elif parsed_url.scheme == 's3':
        bucket = parsed_url.netloc
    else:
        raise ValueError(f"Invalid URI scheme: {parsed_url.scheme}")
    key = parsed_url.path.lstrip('/')

    if not bucket or not key:
        raise ValueError(f"Could not determine bucket and key from URI: {uri}")

    # Decide the format from the extension *before* downloading, so an
    # unsupported file never costs a client setup and a full transfer.
    file_extension = key.split('.')[-1].lower()
    if file_extension not in ('csv', 'json'):
        raise ValueError(f"Unsupported file extension: {file_extension}")

    s3 = boto3.client('s3')

    with BytesIO() as buffer:
        s3.download_fileobj(bucket, key, buffer)
        # Rewind so the parser reads from the start of the downloaded bytes.
        buffer.seek(0)

        if file_extension == 'csv':
            df = pd.read_csv(buffer)
        else:
            df = pd.read_json(buffer)

    print('ran extract_from_self')
    return df

In [10]:
# Source object holding the raw date/time detail records.
DATE_DETAILS_URL = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json'

# Load the raw records and render them with the notebook's rich display.
df = extract_from_s3(DATE_DETAILS_URL)
display(df)

started extract from self
ran extract_from_self


Unnamed: 0,timestamp,month,year,day,time_period,date_uuid
0,22:00:06,9,2012,19,Evening,3b7ca996-37f9-433f-b6d0-ce8391b615ad
1,22:44:06,2,1997,10,Evening,adc86836-6c35-49ca-bb0d-65b6507a00fa
2,10:05:37,4,1994,15,Morning,5ff791bf-d8e0-4f86-8ceb-c7b60bef9b31
3,17:29:27,11,2001,6,Midday,1b01fcef-5ab9-404c-b0d4-1e75a0bd19d8
4,22:40:33,12,2015,31,Evening,dfa907c1-f6c5-40f0-aa0d-40ed77ac5a44
...,...,...,...,...,...,...
120156,22:56:56,11,2022,12,Evening,d6c4fb31-720d-4e94-aa6b-dcbcb85f2bb7
120157,18:25:20,5,1997,31,Evening,f7722027-1aae-49c3-8f8d-853e93f9f3e6
120158,18:21:40,9,2011,13,Evening,4a3b9851-52e1-463c-ac81-1960f141444e
120159,19:10:53,7,2013,12,Evening,64974909-0d4b-42a2-822a-73b5695e8bfb


In [11]:
# Baseline row count, taken before any rows are filtered out.
num_rows = len(df)
print(f"Number of rows date times before year cleaning: {num_rows}")



Number of rows date times before year cleaning: 120161


In [12]:
# Print the distinct raw values of each date-part column to spot junk entries.
for column_name in ('year', 'month', 'day', 'time_period'):
    print(df[column_name].unique())

['2012' '1997' '1994' '2001' '2015' '2002' '1993' '2006' '2004' '2008'
 '2021' '2018' '2009' '2020' '2017' '2019' '2000' '2007' '2013' '2010'
 '1995' '2005' '1999' '2003' '1996' '2014' '2022' '2016' '1998' '2011'
 '1992' 'FTKRTQHFZE' '14NRQ80L5E' 'NULL' '33F45BZPSP' 'G3DEZY8UW6'
 '0M8BGI0CI3' 'O17F6WE1TD' '9DKC6PW41E' 'I5367BRUVN' 'EB2N507OZ0'
 'KO7BGRPOKH' 'RA8D4CIQOV' 'FXC3K5LZZX' 'FIEOPTN7WZ' 'AQLUVY7DA2'
 '9QDY0WMH6K' '5RZL03AWR6' 'QF6S8TDTEA' 'L1N4X0SVZA' 'OVDJZCARJA'
 'UDHIYJS2GP' 'LND1WX0Y6Z' '3GJWN253MM' 'M9ZV3N8G95']
['9' '2' '4' '11' '12' '8' '1' '3' '7' '10' '5' '6' '1YMRDJNU2T'
 '9GN4VIO5A8' 'NULL' 'NF46JOZMTA' 'LZLLPZ0ZUA' 'YULO5U0ZAM' 'SAT4V9O2DL'
 '3ZZ5UCZR5D' 'DGQAH7M1HQ' '4FHLELF101' '22JSMNGJCU' 'EB8VJHYZLE'
 '2VZEREEIKB' 'K9ZN06ZS1X' '9P3C0WBWTU' 'W6FT760O2B' 'DOIR43VTCM'
 'FA8KD82QH3' '03T414PVFI' 'FNPZFYI489' '67RMH5U2R6' 'J9VQLERJQO'
 'ZRH2YT3FR8' 'GYSATSCN88']
['19' '10' '15' '6' '31' '2' '14' '1' '21' '3' '16' '25' '17' '12' '23'
 '26' '24' '18' '4' '11' '22' '7

In [13]:
# Keep only rows whose 'year' is exactly four digits; this drops the 'NULL'
# placeholders and the random alphanumeric junk values present in the column.
year_regex = r'^\d{4}$'
# na=False makes missing (non-string) entries count as non-matching, so the
# mask is always purely boolean. .copy() makes the filtered frame independent
# of the original, guarding against SettingWithCopyWarning when a column is
# added to it afterwards.
df = df[df['year'].str.match(year_regex, na=False)].copy()


In [14]:
# Row count once the rows with malformed years have been removed.
num_rows = len(df)
print(f"Number of rows date times after year cleaning: {num_rows}")



Number of rows date times after year cleaning: 120123


In [15]:
# Re-inspect the distinct values of each date-part column after the year filter.
for column_name in ('year', 'month', 'day', 'time_period'):
    print(df[column_name].unique())

['2012' '1997' '1994' '2001' '2015' '2002' '1993' '2006' '2004' '2008'
 '2021' '2018' '2009' '2020' '2017' '2019' '2000' '2007' '2013' '2010'
 '1995' '2005' '1999' '2003' '1996' '2014' '2022' '2016' '1998' '2011'
 '1992']
['9' '2' '4' '11' '12' '8' '1' '3' '7' '10' '5' '6']
['19' '10' '15' '6' '31' '2' '14' '1' '21' '3' '16' '25' '17' '12' '23'
 '26' '24' '18' '4' '11' '22' '7' '30' '8' '13' '27' '20' '29' '9' '28'
 '5']
['Evening' 'Morning' 'Midday' 'Late_Hours']


In [17]:
# Join the separate string columns into "year-month-day time" text, then parse
# it into a single datetime column.
date_text = df['year'] + '-' + df['month'] + '-' + df['day']
datetime_text = date_text + ' ' + df['timestamp']
df['complete_timestamp'] = pd.to_datetime(datetime_text, format='%Y-%m-%d %H:%M:%S')


In [19]:
# Confirm that adding the parsed timestamp column did not change the row count.
num_rows = len(df)
print(f"Number of rows date times after adding 'complete_timestamp' coversion: {num_rows}")



Number of rows date times after adding 'complete_timestamp' coversion: 120123


In [22]:
# Compare the number of unparsed (missing) timestamps with the frame size.
total_rows = len(df)
missing_count = df['complete_timestamp'].isna().sum()

print(f"Total rows: {total_rows}")
print(f"Number of missing/invalid dates: {missing_count}")

Total rows: 120123
Number of missing/invalid dates: 0


In [23]:
# Inspect rows whose parsed timestamp is NaT (Not a Time).
# Note: pandas' `isna` and `isnull` are aliases, so the two masks below are
# always identical and both reports describe the same rows.
nat_values = df['complete_timestamp'].isna()
df_nat = df[nat_values]

missing_values = df['complete_timestamp'].isnull()
df_missing = df[missing_values]

# Report the NaT rows first, then the (equivalent) missing-value rows.
print("Number of 'NaT' values:", len(df_nat))
print(df_nat)

print("Number of missing values:", len(df_missing))
print(df_missing)


Number of 'NaT' values: 0
Empty DataFrame
Columns: [timestamp, month, year, day, time_period, date_uuid, complete_timestamp]
Index: []
Number of missing values: 0
Empty DataFrame
Columns: [timestamp, month, year, day, time_period, date_uuid, complete_timestamp]
Index: []
