# Práctica ETL - Mapeado de datos LAV

- **Master in Data Science - UC, UIMP, CSIC.**
- **Subject:** Data Life Cycle.
- **Author:** Ignacio Iker Prado Rujas.

## Generación y preparación de las fuentes de datos

Cargamos las librerías relevantes:

In [21]:
import sqlite3
import json
from faker import Faker

Creamos un Faker para cada una de las fuentes, de acuerdo a su país de procedencia:

In [24]:
fake_UK = Faker('en_UK')
fake_ES = Faker('es_ES')
#fake_US = Faker('en_US')

### Librería británica

Generamos datos para la librería británica:

In [177]:
!rm bookstore_UK.db

In [178]:
conn = sqlite3.connect('bookstore_UK.db')
c = conn.cursor()

# Creamos la tabla Book
c.execute('''CREATE TABLE Book
             (idBook integer primary key,
              title text,
              authors text, 
              publisher text, 
              year integer,
              price text,
              ISBN text)''')

# Generamos valores con Faker
fake_vals = []
for i in range(100):
    idBook = i
    title = fake_UK.catch_phrase()
    authors = fake_UK.name()
    publisher = fake_UK.company()
    year = fake_UK.year()
    price = '£' + str(fake_UK.random_int(min=1, max=50))
    ISBN = fake_UK.isbn13()
    fake_vals.append([idBook, title, authors, publisher, year, price, ISBN])

# Los introducimos en la base de datos
c.executemany('''INSERT INTO Book (idBook, title, authors, publisher, year, price, ISBN)
                 VALUES (?, ?, ?, ?, ?, ?, ?)''', fake_vals)

# Miramos los primeros valores insertados
c.execute('SELECT * FROM Book where idBook < 3;')
for row in c.fetchall():
    print(row)

conn.commit()
conn.close()

(0, 'Ergonomic composite capacity', 'Billy Sheppard', 'Davies, Kelly and Ford', 1988, '£4', '978-0-919598-88-1')
(1, 'Integrated disintermediate toolset', 'Dr. Stacey Parker', 'Alexander, Smith and Fox', 2007, '£17', '978-1-69421-289-4')
(2, 'Reactive asynchronous projection', 'Christian James', 'Wood Inc', 1999, '£25', '978-1-971405-35-3')


### Librería española

Ahora para la librería española:

In [344]:
!rm bookstore_ES.db

In [345]:
conn = sqlite3.connect('bookstore_ES.db')
c = conn.cursor()

# Creamos la tabla Persona
c.execute('''CREATE TABLE Persona
             (idPersona integer primary key,
              Nombre text,
              Apellido1 text, 
              Apellido2 text, 
              FechaNacimiento date)''')

# Creamos la tabla Publicacion
c.execute('''CREATE TABLE Publicacion
             (idPublicacion integer primary key,
              Titulo text,
              Editorial text, 
              Lengua text, 
              FechaPublicacion date,
              Precio float)''')

# Creamos la tabla Persona_has_Pubicacion
c.execute('''CREATE TABLE Persona_has_Publicacion
             (Persona_idPersona integer,
              Publicacion_idPublicacion integer,
              Rol_idRol integer, 
              primary key(Persona_idPersona, 
                          Publicacion_idPublicacion))''')

# Creamos la tabla Rol
c.execute('''CREATE TABLE Rol
             (idRol integer primary key,
              rol text)''')

# Generamos valores con Faker

# Para Persona
n = 200
fake_vals = []
for i in range(n):
    idPersona = i
    Nombre = fake_ES.name().split()[0]
    Apellido1 = fake_ES.name().split()[1]
    aux = fake_ES.name().split()[1]
    Apellido2 = fake_ES.name().split()[1] if aux in ['el', 'del', 'de'] else aux
    FechaNacimiento = str(fake_ES.date_between(end_date='today', start_date='-150y'))
    fake_vals.append([idPersona, Nombre, Apellido1, Apellido2, FechaNacimiento])
    
c.executemany('''INSERT INTO Persona (idPersona, Nombre, Apellido1, Apellido2, FechaNacimiento)
                 VALUES (?, ?, ?, ?, ?)''', fake_vals)

# Para Publicacion

fake_vals = []
m = 300
for i in range(m):
    idPublicacion = i
    Titulo = fake_ES.catch_phrase()
    Editorial = fake_ES.company()
    Lengua = fake_ES.language_code()
    FechaPublicacion = str(fake_ES.date_between(end_date='today', start_date='-120y'))
    Precio = str(fake_UK.random_int(min=1, max=50)) + '€'
    fake_vals.append([idPublicacion, Titulo, Editorial, Lengua, FechaPublicacion, Precio])
    
c.executemany('''INSERT INTO Publicacion (idPublicacion, Titulo, Editorial, Lengua, FechaPublicacion, Precio)
                 VALUES (?, ?, ?, ?, ?, ?)''', fake_vals)

# Para Rol
r = 30
fake_vals = []
for i in range(r):
    idRol = i
    Rol = fake_ES.word()
    fake_vals.append([idRol, Rol])
    
c.executemany('''INSERT INTO Rol (idRol, Rol)
                 VALUES (?, ?)''', fake_vals)

# Para Persona_has_Publicacion
fake_vals = []
set_aux = set()
for i in range(500):
    Persona_idPersona = fake_ES.random_int(min=1, max=n)
    Publicacion_idPublicacion = fake_ES.random_int(min=1, max=m)
    while (Persona_idPersona, Publicacion_idPublicacion) in set_aux:
        Persona_idPersona = fake_ES.random_int(min=1, max=n)
        Publicacion_idPublicacion = fake_ES.random_int(min=1, max=m)
    else:
        set_aux.add((Persona_idPersona, Publicacion_idPublicacion))
    Rol_idRol = fake_ES.random_int(min=1, max=r)
    fake_vals.append([Persona_idPersona, Publicacion_idPublicacion, Rol_idRol])
    
c.executemany('''INSERT INTO Persona_has_Publicacion (Persona_idPersona, Publicacion_idPublicacion, Rol_idRol)
                 VALUES (?, ?, ?)''', fake_vals)

# Miramos los primeros valores insertados
c.execute('SELECT * FROM Persona where idPersona < 3;')
print('Personas:')
for row in c.fetchall():
    print(row)
c.execute('SELECT * FROM Publicacion where idPublicacion < 3;')
print('Publicaciones:')
for row in c.fetchall():
    print(row)
c.execute('SELECT * FROM Rol where idRol < 3;')
print('Roles:')
for row in c.fetchall():
    print(row)
c.execute('SELECT * FROM Persona_has_Publicacion where Publicacion_idPublicacion in (3, 8);')
print('Publicaciones de personas:')
for row in c.fetchall():
    print(row)

conn.commit()
conn.close()

Personas:
(0, 'Trinidad', 'Miguel', 'Maria', '1947-03-08')
(1, 'Cristian', 'del', 'Hurtado-Cadenas', '1893-05-26')
(2, 'Jose', 'Silva-Prat', 'Campillo', '1895-10-23')
Publicaciones:
(0, 'Versatile zero-defect function', 'Checa-Barón', 'ta', '1986-12-25', '5€')
(1, 'Reverse-engineered regional conglomeration', 'Gibert, Serrano and Rojas', 'bem', '2009-09-18', '11€')
(2, 'Automated tertiary intranet', 'Paz PLC', 'id', '1951-12-29', '24€')
Roles:
(0, 'consequuntur')
(1, 'cumque')
(2, 'natus')
Publicaciones de personas:
(132, 8, 25)
(159, 3, 11)


### Librería americana

Cargamos los datos de la librería americana:  
**Nota:** Para poder parsear el fichero de manera adecuada, fue necesario introducir comas entre cada elemento del `json`.

In [352]:
with open('USA_books.json', encoding='utf-8') as f:
    data = json.load(f)
eval(str(data[0]))

{'index': 0,
 'title': 'National kind probably later across against require can.',
 'last_name': 'Hall',
 'first_name': 'Debbie',
 'publisher': 'Stewart Group',
 'ISBN': '978-0-88280-857-4',
 'summary': 'Exist change affect still consumer professional win audience. Responsibility generation picture.\nStop team recently administration onto. Oil do their choice story work.\nMost morning moment. Tell price grow decision technology one. Become people discussion machine than.\nAllow fill direction safe physical main life. Hand town talk enter.\nOut fast whether during simply. Option next performance sea PM.\nParticularly according save blue. Road expect country before season sea.'}

## Creación de la base de datos global

In [401]:
!rm bookstore.db

In [402]:
conn = sqlite3.connect('bookstore.db')
c = conn.cursor()

# Creamos la tabla Autor
c.execute('''CREATE TABLE Autor
             (idAutor integer primary key,
              nombre text)''')

# Creamos la tabla Libro
c.execute('''CREATE TABLE Libro
             (idLibro integer primary key,
              titulo text,
              editorial text, 
              fecha date,
              isbn)''')

# Creamos la tabla Autor_has_Libro
c.execute('''CREATE TABLE Autor_has_Libro
             (Autor_idAutor integer,
              Libro_idLibro integer,
              primary key(Autor_idAutor, 
                          Libro_idLibro))''')

conn.commit()
conn.close()

## Proceso ETL

In [403]:
def process_UK():
    return

conn_UK = sqlite3.connect('bookstore_UK.db')
c_UK = conn_UK.cursor()
conn = sqlite3.connect('bookstore.db')
c = conn.cursor()

idAutor = c.execute('select max(idAutor) from Autor').fetchall()[0][0]
idAutor = 0 if idAutor is None else idAutor
idLibro = c.execute('select max(idLibro) from Libro').fetchall()[0][0]
idLibro = 0 if idLibro is None else idLibro

c_UK.execute('SELECT * from Book;')
for book in c_UK.fetchall():
    # Extraemos la información útil
    _, title, authors, publisher, year, _, ISBN = book
    # Y la guardamos
    c.execute('INSERT INTO Autor (idAutor, nombre) VALUES (?, ?)', 
              (idAutor, authors))
    idAutor += 1
    c.execute('INSERT INTO Libro (idLibro, titulo, editorial, fecha, isbn) VALUES (?, ?, ?, ?, ?)',
               (idLibro, title, publisher, str(year)+'-01-01', ISBN))
    idLibro += 1
    c.execute('INSERT INTO Autor_has_Libro (Autor_idAutor, Libro_idLibro) VALUES (?, ?)',
               (idAutor, idLibro))

conn.commit()
conn_UK.close()
conn.close()

In [406]:
conn = sqlite3.connect('bookstore.db')
c = conn.cursor()
c.execute('SELECT * FROM Libro where idLibro < 10;')
for row in c.fetchall():
    print(row)
conn.close()

(0, 'Ergonomic composite capacity', 'Davies, Kelly and Ford', '1988-01-01', '978-0-919598-88-1')
(1, 'Integrated disintermediate toolset', 'Alexander, Smith and Fox', '2007-01-01', '978-1-69421-289-4')
(2, 'Reactive asynchronous projection', 'Wood Inc', '1999-01-01', '978-1-971405-35-3')
(3, 'Operative disintermediate structure', 'Welch, Webster and Wood', '2011-01-01', '978-0-656-23382-3')
(4, 'Stand-alone system-worthy capacity', 'Young, Patel and Jones', '2009-01-01', '978-1-4373-7326-4')
(5, 'Devolved secondary core', 'Moss, Taylor and Taylor', '1970-01-01', '978-1-4486-3021-9')
(6, 'Balanced static policy', 'Stokes LLC', '1985-01-01', '978-1-128-80316-2')
(7, 'Robust well-modulated moratorium', 'Wallace PLC', '1982-01-01', '978-1-991189-37-0')
(8, 'Switchable local task-force', 'Oliver, Haynes and Smith', '2012-01-01', '978-0-410-97830-4')
(9, 'Networked content-based productivity', 'Burton, Morgan and Read', '1970-01-01', '978-1-04-185948-2')
