## Extract Phase.
#### Downloads `tar.gz` files containing geographical coordinates from a data source, then saves and extracts them into a destination folder.
`extract_from_source()` is a Python function that receives the URL of a `tar.gz` data source, a destination directory, and a flag indicating whether to extract the content.
The output is the set of files extracted into the destination directory.

In [None]:
from extract.extract_targz_from_source import extract_from_source

# URL of the tar.gz archive handed to extract_from_source.
source = "https://s3.amazonaws.com/dev.etl.python/datasets/data_points.tar.gz"
# Destination directory; the Transform cells read their input files from this same path.
destination_path = "/app/data_from_source"
# Third argument of the call below; per the section text it is the flag that requests extraction.
extract = True

# Arguments are passed positionally: source URL, destination directory, extract flag.
extract_from_source(source, destination_path, extract)


## Transform Phase

#### 1 - Reads a directory containing coordinates/points data files.

In [None]:
from transform.transform_raw_to_csv import get_data_files

# Directory holding the extracted data files. It is kept in one variable
# because the next cell reuses `files_path` to build the full file paths.
files_path = "/app/data_from_source"
# Pass the variable instead of repeating the literal, so the directory that is
# listed and the directory used to build paths can never drift apart.
files = get_data_files(files_path)


#### 2 - Prepares for cleaning by transforming the file contents into a `List[List[str]]`

In [None]:
from transform.transform_raw_to_csv import wrangle_points_to_list

# Prefix every entry of `files` with the source directory so that each one
# becomes a path under `files_path`.
data_files = [f"{files_path}/{file}" for file in files]

# Hand all paths to the wrangler in a single call and show what came back.
points_list = wrangle_points_to_list(data_files)
print(points_list)


#### 3 - Applies data detection using regular expressions and stores the result in a `List[dict]`

In [None]:
from transform.transform_raw_to_csv import convert_data_coordinates

# Run the coordinate conversion over the output of the previous step.
# NOTE(review): the cell heading says detection is regex-based and yields a
# list of dicts — confirm against transform_raw_to_csv.
detected_points = convert_data_coordinates(points_list)

# Print the result for visual inspection.
print(detected_points)


#### 4 - Removes duplicate dictionaries from the `detected_points` list.

In [None]:
from transform.transform_raw_to_csv import  remove_duplicates

# Report the size before deduplication so it can be compared with the size printed below.
print(f"Before deduplication {len(detected_points)}")

# The result is bound to a new name; `detected_points` keeps referring to the original object.
# NOTE(review): whether remove_duplicates mutates its argument is not visible from this cell.
deduplicated_points = remove_duplicates(detected_points)

print(f"After deduplication {len(deduplicated_points)}")
print(deduplicated_points)


#### 5 - Converts the `List[dict]` of deduplicated points into a CSV file and saves it to disk.

In [None]:
from transform.transform_raw_to_csv import write_points_to_csv

# Destination of the normalized CSV; the next cell reads `path_to_csv` again.
path_to_csv = "/app/normalized_data/data.csv"

write_points_to_csv(deduplicated_points, path_to_csv)

# Echo the file back to verify what was written. A plain loop is used because
# the iteration exists only for its side effect (a comprehension would build a
# throwaway list of None), and the trailing newline is stripped so that
# print() does not insert a blank line after every row.
with open(path_to_csv, "r") as csv_file:
    for line in csv_file:
        print(line.rstrip("\n"))


#### 6 - Reads data from the CSV file, converts it, and returns a DataFrame containing the values.

In [None]:
from transform.transform_csv_to_database import Converter

# NOTE(review): the API key is passed as an inline string literal (redacted here).
# The Load cell reads its credentials through decouple's `config(...)`; consider
# loading this key the same way instead of keeping it in the notebook.
converter = Converter(api_key="***REMOVED***")
# Load the CSV written in step 5 through the converter.
dataset_from_csv = converter.get_coordinates_from_csv_file(path_to_csv)

print(dataset_from_csv)

#### 7 - Makes the API calls to retrieve data for the latitude/longitude points and saves it to the database.

In [None]:
# Hand the dataset produced in step 6 to the converter for persistence.
# NOTE(review): per the cell heading this triggers external API calls and
# database writes — confirm before re-running the cell.
converter.save_dataset_coordinates_to_database(dataset_from_csv)


## Load Phase
#### 1 - Reads data from the database and displays it in the current cell.


In [None]:
from urllib.parse import quote

import dataset
from decouple import config
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
import pandas as pd

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Connection settings are read through python-decouple rather than hard-coded.
db_user = config("POSTGRES_USER")
db_name = config("POSTGRES_DB")
db_password = config("POSTGRES_PASSWORD")
db_host = config("POSTGRES_HOST")

# Percent-encode the credentials before embedding them in the URL: a raw "@",
# ":", "/" or "#" in the user name or password would otherwise be parsed as a
# URL delimiter and produce a wrong (or unparseable) connection string.
# `safe=""` makes sure "/" is encoded as well.
encoded_user = quote(db_user, safe="")
encoded_password = quote(db_password, safe="")
string_connection = (
    f"postgresql://{encoded_user}:{encoded_password}@{db_host}:5432/{db_name}"
)

db = dataset.connect(string_connection)

# Read the whole "addresses" table inside a transaction; both the transaction
# and the connection are released when the `with` block exits.
with db.engine.connect() as conn, conn.begin():
    data = pd.read_sql("addresses", conn)
    display(HTML(data.to_html()))

