# ETL (Extract Transform Load)

#### This notebook contains methods to extract, transform and load the sanctions datasets. The code will later be moved into a ".py" file and deployed to a server so that it can be run automatically

#

## Extract

#### For the extraction the following will be implemented:
 - Check if the file needed is present
   - If not present, download the file and append the download date to the file's name (the download date can also be retrieved by querying the file's modified time, as the file will only be modified once)
 - If it is present, check whether the file is older than 7 days. If it is, check for a newer published file and download it if available
 
 
#### The following naming structure will be adopted for the files:
 - regime.json -- will be called --> regime_{date}.json
 - 20230607-FULL-1_1.csv -- will be called --> person_two_{date}.csv
 - 20230607-FULL-1_0.csv -- will be called --> person_one_{date}.csv

#

# Extraction Code

#### The code below can be used to extract the data required (Currently, it will only extract if the current data is older than 7 days)

In [1]:
import glob
import os
import urllib.request
from datetime import date, datetime, timedelta
from enum import Enum

import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta


class Peoples(Enum):
    """Name stems of the three sanctions datasets handled by the extraction."""

    # Each value is used as the prefix of the dated file name written to disk.
    regime = "regime"
    person_one = "person_one"
    person_two = "person_two"


class Extract():
    """Download the sanctions datasets when the local copies are out of date.

    The person files are looked up in the current working directory by
    their name prefix. When the newest copy of either one is older than
    ``MAX_AGE_DAYS`` (judged by modification time), the RSS feed is queried
    and every person file that predates the feed's publication date is
    downloaded again. The regimes file is downloaded only when at least one
    person file was downloaded.
    """

    # Local copies older than this many days trigger a check for new data.
    MAX_AGE_DAYS = 7
    # Date layout embedded in the downloaded file names.
    FILE_DATE_FORMAT = '%Y_%m_%d'
    # Layout of the RSS ``pubDate`` text once the trailing " GMT" is removed.
    PUB_DATE_FORMAT = '%a, %d %b %Y %H:%M:%S'
    # Seconds to wait for the RSS endpoint before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.regimes_url = 'https://www.sanctionsmap.eu/api/v1/regime'
        self.persons_url = 'https://webgate.ec.europa.eu/fsd/fsf/public/rss'

    def latest_file(self, files_list):
        """Return the most recently modified path in ``files_list``.

        A non-list argument (e.g. a single path) is returned unchanged.
        Ties on modification time are broken by the greater path string.

        Raises:
            ValueError: if ``files_list`` is an empty list.
        """
        if not isinstance(files_list, list):
            return files_list
        return max(files_list,
                   key=lambda path: (os.path.getmtime(path), path))

    def get_file_date(self, file):
        """Return the modification date of ``file`` as a ``date``."""
        return date.fromtimestamp(os.path.getmtime(file))

    def is_file_current(self, file):
        """Return True when ``file`` was modified within ``MAX_AGE_DAYS`` days."""
        oldest_allowed = date.today() - timedelta(days=self.MAX_AGE_DAYS)
        return self.get_file_date(file) >= oldest_allowed

    def check_files(self, files_list):
        """Decide whether a download check is needed for ``files_list``.

        Returns:
            A ``(needs_check, latest_file)`` tuple. ``needs_check`` is False
            only when the newest file is current. When ``files_list`` is
            empty it is returned unchanged as the second element.
        """
        if not files_list:
            # No files available (prompts download)
            return (True, files_list)
        newest = self.latest_file(files_list)
        if self.is_file_current(newest):
            return (False, newest)
        # No current files (prompts download)
        return (True, newest)

    def create_file_names_with_dates(self):
        """Return today's target file names.

        The order is fixed: regime (json), person_one (csv), person_two
        (csv). Callers index the result positionally.
        """
        extensions = {
            Peoples.regime.value: "json",
            Peoples.person_one.value: "csv",
            Peoples.person_two.value: "csv",
        }
        todays_date_str = date.today().strftime(self.FILE_DATE_FORMAT)
        return [f"{stem}_{todays_date_str}.{extension}"
                for stem, extension in extensions.items()]

    @staticmethod
    def _parse_pub_date(text):
        """Parse the text of an RSS ``pubDate`` element into a naive datetime."""
        cleaned = text.strip().replace(" GMT", "")
        return datetime.strptime(cleaned, Extract.PUB_DATE_FORMAT)

    def get_person_urls(self):
        """Query the RSS feed for the person files.

        Returns:
            A ``(pub_date, person_url)`` tuple, where ``person_url`` is a
            list of ``(person_name, url)`` pairs for person_one and
            person_two, in that order.

        Raises:
            requests.RequestException: on a network failure, timeout or
                error status.
            ValueError: if the feed lacks the expected enclosures or a
                parsable ``pubDate``.
        """
        # Without a timeout a stalled server would block the run forever.
        resp = requests.get(self.persons_url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'xml')
        tags = soup.find_all('enclosure', attrs={'type': 'text/plain'})

        # take only the first 2 (make adjustments if structure changes in future)
        urls = sorted(tag['url'] for tag in tags[:2])
        if len(urls) < 2:
            raise ValueError('expected two text/plain enclosures in the '
                             f'RSS feed, found {len(urls)}')
        # Sorted order pairs the first URL with person_one, the second
        # with person_two.
        persons = [Peoples.person_one.value, Peoples.person_two.value]
        person_url = list(zip(persons, urls))

        pub_tag = soup.find('pubDate')
        if pub_tag is None:
            raise ValueError('RSS feed has no pubDate element')
        # Read the element text directly; str.strip('</pubDate>') strips a
        # character set, not the tag, and only worked by coincidence.
        pub_date = self._parse_pub_date(pub_tag.get_text())
        return (pub_date, person_url)

    def check_published_date(self, pub_date, file_date):
        """Return True when ``pub_date`` is later than the date in a file name.

        Args:
            pub_date: naive ``datetime`` of the feed publication.
            file_date: file name (or path) shaped like
                ``person_one_YYYY_MM_DD.csv``.

        Raises:
            ValueError: if the name does not carry a date in that layout.
        """
        # Use the base name only so underscores in a directory cannot shift
        # the pieces picked out below.
        name = os.path.basename(file_date)
        stamp = "_".join(name.split('_')[2:]).split(".")[0]
        downloaded_on = datetime.strptime(stamp, self.FILE_DATE_FORMAT)
        # No need for 7 or + days check (previous check was done)
        return pub_date > downloaded_on

    def enable_download(self, person_file, pub_date):
        """Return True when a person file should be downloaded.

        That is the case when no local copy exists, or when the newest
        local copy predates ``pub_date``.
        """
        if not person_file:
            return True
        newest = self.latest_file(person_file)
        return self.check_published_date(pub_date, newest)

    def download_file(self, url, dest_file_name):
        """Download ``url`` to ``dest_file_name`` without raising.

        Returns:
            ``(dest_file_name, headers)`` on success, or
            ``(dest_file_name, exception)`` when the download failed.
        """
        # return httpmessage or httperror
        try:
            file_down = urllib.request.urlretrieve(url, dest_file_name)
        except Exception as e:
            # Deliberately broad: the failure is handed back to the caller
            # as data instead of aborting the remaining downloads.
            return (dest_file_name, e)
        return (dest_file_name, file_down[1])

    def persons_file(self):
        """Return the existing person_one and person_two files as two lists.

        The files are matched by name prefix in the current working
        directory.
        """
        # regimes is only downloaded if other files are out of date (no checking needed)
        person_one = glob.glob(f'{Peoples.person_one.value}*')
        person_two = glob.glob(f'{Peoples.person_two.value}*')
        return person_one, person_two

    def main(self):
        """Run the extraction and report what was downloaded.

        Returns:
            ``(regimes_download, person_one_download, person_two_download)``.
            Each element is the ``download_file`` result for that file, or
            ``''`` when the file was not downloaded.
        """
        person_one, person_two = self.persons_file()

        # check if files are latest
        p_one_check = self.check_files(person_one)[0]
        p_two_check = self.check_files(person_two)[0]

        regimes_name, person_one_name, person_two_name = (
            self.create_file_names_with_dates())
        person_one_download = person_two_download = regimes_download = ''

        if p_one_check or p_two_check:
            pub_date, person_urls = self.get_person_urls()
            (_, person_one_url), (_, person_two_url) = person_urls

            if self.enable_download(person_one, pub_date):
                person_one_download = self.download_file(person_one_url,
                                                         person_one_name)

            if self.enable_download(person_two, pub_date):
                person_two_download = self.download_file(person_two_url,
                                                         person_two_name)

            # The old test, ('a' or 'b') in locals(), was always true, so
            # regimes was fetched even when no person file was. Test the
            # download results instead ('' means not downloaded).
            if person_one_download or person_two_download:
                regimes_download = self.download_file(self.regimes_url,
                                                      regimes_name)

        # Use return to object later to identify downloaded items
        return regimes_download, person_one_download, person_two_download

In [3]:
# Run the extraction; main() returns the
# (regimes, person_one, person_two) download results.
Extract().main()