In [22]:
import pandas as pd
import csv
import requests
import xml.etree.ElementTree as ET
import zipfile
import boto3
from io import StringIO
import logging  #module for making a log 
import zipfile

In [23]:
class Lambda:
    '''
    Pipeline helper that turns a register index URL into a DataFrame.

    :param url: URL of the index XML. The index is searched for the first
                entry whose file_type is DLTINS.

    Methods:

    :method download_link: Requests the url, saves the XML as 'registers.xml',
                           then parses it and returns the download link of the
                           first DLTINS entry.
    :method zip_extraction: Downloads the zip file and extracts its contents
                            into the current directory.
    :method xml_to_csv: Parses the extracted XML file and returns the selected
                        fields as a pandas DataFrame.

    Attributes set as results: ``link`` (download_link), ``xml_file``
    (zip_extraction) and ``df`` (xml_to_csv).
    '''

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 60

    # Element names (without namespace) that xml_to_csv looks for.
    # NOTE(review): only 'TermntdRcrd' records are exported; if the file also
    # holds other record kinds they are skipped - confirm this is intended.
    RECORD_TAG = 'TermntdRcrd'
    GROUP_TAG = 'FinInstrmGnlAttrbts'
    GROUP_FIELDS = ('Id', 'FullNm', 'ClssfctnTp', 'CmmdtyDerivInd', 'NtnlCcy')
    ISSUER_TAG = 'Issr'

    def __init__(self, url = None) -> None:
        self.url = url
        # Filled in by download_link / zip_extraction so that the later steps
        # can fall back on them when called without arguments.
        self.link = ''
        self.xml_file = ''

    @staticmethod
    def _local_name(tag):
        '''Return an element tag without its '{namespace}' prefix.'''
        if not isinstance(tag, str):
            # Comments / processing instructions carry a non-string tag.
            return ''
        return tag.rsplit('}', 1)[-1]

    def download_link(self):
        '''
        Fetch the index XML from ``self.url`` and locate the DLTINS download link.

        Writes the response body to 'registers.xml' (binary mode), parses that
        file and scans the ``doc`` elements below the second child of the root.

        :return: text of the first ``download_link`` entry whose ``file_type``
                 is 'DLTINS', or '' when no such entry exists.
        :raises requests.HTTPError: if the server answers with an error status.
        '''
        resp = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Fail here rather than writing an error page to disk and parsing it.
        resp.raise_for_status()
        with open('registers.xml', 'wb') as f:
            f.write(resp.content)
        root = ET.parse('registers.xml').getroot()

        self.link = ''
        for doc in root[1].iter('doc'):
            file_type = doc.find("str[@name='file_type']")
            # Entries without a file_type (or of another type) are skipped
            # instead of raising AttributeError on a missing node.
            if file_type is None or file_type.text != 'DLTINS':
                continue
            link_node = doc.find("str[@name='download_link']")
            if link_node is not None and link_node.text:
                self.link = link_node.text
                break
        return self.link

    def zip_extraction(self, link = None):
        '''
        Download a zip archive and extract it into the current directory.

        :param link: URL of the zip file. When omitted, the link found by
                     ``download_link`` (``self.link``) is used.
        :return: name of the first member of the archive.
        :raises ValueError: if no link is available or the archive is empty.
        :raises requests.HTTPError: if the server answers with an error status.

        The archive is saved as 'zip_file.zip' before extraction.
        '''
        # Honour the argument; previously it was ignored in favour of self.link.
        target = link if link is not None else self.link
        if not target:
            raise ValueError('no download link available for zip_extraction')

        resp = requests.get(target, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        with open('zip_file.zip', 'wb') as f:
            f.write(resp.content)

        self.xml_file = ''
        with zipfile.ZipFile('zip_file.zip', 'r') as archive:
            names = archive.namelist()
            if not names:
                raise ValueError('downloaded zip archive contains no files')
            self.xml_file = names[0]
            archive.extractall('')
        return self.xml_file

    def xml_to_csv(self, xml = None):
        '''
        Parse an XML file and collect the selected fields into a DataFrame.

        :param xml: path of the XML file. When omitted, the file name stored
                    by ``zip_extraction`` (``self.xml_file``) is used.
        :return: DataFrame with one row per 'TermntdRcrd' element and columns
                 FinInstrmGnlAttrbts.Id,
                 FinInstrmGnlAttrbts.FullNm,
                 FinInstrmGnlAttrbts.ClssfctnTp,
                 FinInstrmGnlAttrbts.CmmdtyDerivInd,
                 FinInstrmGnlAttrbts.NtnlCcy,
                 Issr.
                 Fields missing from a record are left as None.
        :raises ValueError: if no XML path is available.
        '''
        source = xml if xml is not None else self.xml_file
        if not source:
            raise ValueError('no xml file available for xml_to_csv')

        root = ET.parse(source).getroot()

        columns = [self.GROUP_TAG + '.' + name for name in self.GROUP_FIELDS]
        columns.append(self.ISSUER_TAG)
        field_index = {name: pos for pos, name in enumerate(self.GROUP_FIELDS)}
        issuer_index = len(columns) - 1      # derived, not a hard-coded 5

        rows = []
        for record in root.iter():
            # Compare namespace-stripped names exactly: a substring test would
            # also hit tags (or namespace URIs) that merely contain the name.
            if self._local_name(record.tag) != self.RECORD_TAG:
                continue
            entry = [None] * len(columns)
            for child in record:
                name = self._local_name(child.tag)
                if name == self.GROUP_TAG:
                    for field in child:
                        pos = field_index.get(self._local_name(field.tag))
                        if pos is not None:
                            entry[pos] = field.text
                elif name == self.ISSUER_TAG:
                    entry[issuer_index] = child.text
            rows.append(entry)

        self.df = pd.DataFrame(rows, columns=columns)
        return self.df

In [24]:
if __name__ == '__main__':

    # Requirement 1: save the download link to url and download the xml file
    url = "https://registers.esma.europa.eu/solr/esma_registers_firds_files/select?q=*&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D&wt=xml&indent=true&start=0&rows=100"
    p = Lambda(url)  # pipeline object bound to the index url

    # Requirement 2: from the xml, find the first download link whose
    # file_type is DLTINS
    zip_link = p.download_link()

    # Requirement 3: download the zip and extract the xml from it
    xml_file = p.zip_extraction(zip_link)

    # Requirement 4: convert the contents of the xml into a CSV
    df = p.xml_to_csv(xml_file)
    df.to_csv('Likhiloutput.csv')

    # Requirement 5: store the csv from step 4 in an AWS S3 bucket.
    # SECURITY: credentials must never be written into the notebook. boto3
    # resolves them itself (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    # environment variables, ~/.aws/credentials, or an attached IAM role).
    # The key pair that used to be hard-coded here has been exposed and has
    # to be revoked in IAM.
    s3 = boto3.client("s3")
    csv_buf = StringIO()
    df.to_csv(csv_buf, header = True, index = False)
    # getvalue() returns the whole buffer regardless of the stream position.
    s3.put_object(Bucket="likhilbucketgottumukkala", Body=csv_buf.getvalue(), Key='likhiloutput.csv')