<a href="https://colab.research.google.com/github/jrodrigu12/SteelEyeAssessment/blob/main/SteelEye_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import requests
import xml.etree.ElementTree as ET
import zipfile
import io
import csv
from io import StringIO
import pandas as pd  # For creating CSV file
import boto3

# URL for downloading the XML file
xml_url = 'https://registers.esma.europa.eu/solr/esma_registers_firds_files/select?q=*&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D&wt=xml&indent=true&start=0&rows=100'

# Download the XML file from the provided link
response = requests.get(xml_url)

# Parse the XML response
root = ET.fromstring(response.content)

# Find the second download link with `file_type` as "DLTINS"
download_link = None
count = 0

for doc in root.findall(".//doc"):
    file_type = None
    download_link = None
    for field in doc.findall(".//str"):
        if field.attrib['name'] == 'file_type' and field.text == 'DLTINS':
            count += 1
            if count == 2:  # this instruction point to the second 'DLTINS' link
                for str_element in doc.findall(".//str"):
                    if str_element.attrib['name'] == 'download_link':
                        download_link = str_element.text
                        break
    if download_link:
        break

if not download_link:
    print("Download link for the second DLTINS file not found.")
    exit()

print(f"Download link found: {download_link}")

# Download the zip file from the found link
zip_response = requests.get(download_link)
zip_file = zipfile.ZipFile(io.BytesIO(zip_response.content))

# Extract the XML from the zip file
xml_filename = None
for filename in zip_file.namelist():
    if filename.endswith(".xml"):
        xml_filename = filename
        zip_file.extract(filename)
        print(f"Extracted file: {filename}")
        break

if not xml_filename:
    print("No XML file found in the zip archive.")
    exit()

def xml_to_csv(xml_filename):
    """Parse XML and convert it to CSV."""
    tree = ET.parse(xml_filename)
    root = tree.getroot()

    # Open a CSV file to write the data
    with open('output.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        # Write the header
        writer.writerow(['FinInstrmGnlAttrbts.Id', 'FinInstrmGnlAttrbts.FullNm',
                         'FinInstrmGnlAttrbts.ClssfctnTp', 'FinInstrmGnlAttrbts.CmmdtyDerivInd',
                         'FinInstrmGnlAttrbts.NtnlCcy', 'Issr'])

        # Loop through each XML element
        for fin_instrm in root.findall('FinInstrmGnlAttrbts'):
            id_ = fin_instrm.find('Id').text
            full_name = fin_instrm.find('FullNm').text
            clssfctn_tp = fin_instrm.find('ClssfctnTp').text
            cmmdty_deriv_ind = fin_instrm.find('CmmdtyDerivInd').text
            ntnl_ccy = fin_instrm.find('NtnlCcy').text
            issuer = fin_instrm.find('Issr').text

            # Write each row to the CSV
            writer.writerow([id_, full_name, clssfctn_tp, cmmdty_deriv_ind, ntnl_ccy, issuer])

# Convert XML to CSV
xml_to_csv(xml_filename)

# Load the generated CSV into a DataFrame for further processing
df = pd.read_csv('output.csv')

# Add columns based on conditions descibed on requirement 5 and 6
df['a_count'] = df['FinInstrmGnlAttrbts.FullNm'].str.count('a')
df['contains_a'] = df['a_count'].apply(lambda x: "YES" if x > 0 else "NO")

# Load new columns to DataFrame
df.to_csv('output.csv', index=False)


#Requirement 7 ->  this section below doesn't work due to not having created an AWS S3 bucket

# Initialize S3 client
s3 = boto3.client('s3')

# Convert DataFrame to CSV format and then to an in-memory string
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

# Define bucket name and file name
bucket_test = 's3-bucket-name'
file_output = 'AWSoutput.csv'

# Upload the CSV to S3 -> The instruction below is commented on due to not having created an AWS S3 bucket
#s3.put_object(Bucket=bucket_test, Key=file_output, Body=csv_buffer.getvalue())


# Print final DataFrame
print(df)


Download link found: https://firds.esma.europa.eu/firds/DLTINS_20210119_01of02.zip
Extracted file: DLTINS_20210119_01of02.xml
Empty DataFrame
Columns: [FinInstrmGnlAttrbts.Id, FinInstrmGnlAttrbts.FullNm, FinInstrmGnlAttrbts.ClssfctnTp, FinInstrmGnlAttrbts.CmmdtyDerivInd, FinInstrmGnlAttrbts.NtnlCcy, Issr, a_count, contains_a]
Index: []


In [None]:
import pandas as pd  # For creating CSV file
from xml.etree import ElementTree as ET  # To parse XML
from pathlib import Path
import csv

# Create a path object for XML and CSV files
xml_path = Path("/content/test2.xml")
csv_path = Path("/testOutput.csv")



def xml_to_csv(xml_path, csv_path):
    """Parse XML and convert it to CSV."""
    # Parse the XML file
    xml_parse = ET.parse(xml_path)
    root = xml_parse.getroot()

    # Open CSV file for writing
    with open(csv_path, mode='w', newline='') as file:
        writer = csv.writer(file)

        # Write header (based on test2 XML structure)
        writer.writerow(['Name', 'Position','Club'])

        # Loop through each XML element
        for player in root.findall('player'):
            name = player.find('name').text
            position = player.find('position').text
            team = player.find('club').text

            # Write each row to the CSV
            writer.writerow([name, position, team])


# Convert XML to CSV
xml_to_csv(xml_path, csv_path)

# Load CSV to DataFrame for additional processing
df = pd.read_csv(csv_path)

# Add new columns based on conditions
df['A_count'] = df['Name'].str.count('a')
df['contains_A'] = df['A_count'].apply(lambda x: "YES" if x > 0 else "NO")

#Load new columns to DataFrame
df.to_csv(csv_path, index=False)



# Print final DataFrame
print(df)
