In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  Script de presentacion a IE
#  El codigo fue desarrollado por Juan Luis Rivero de Innova-tsn
#

In [8]:
import sqlite3
import os
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from collections import defaultdict


In [9]:
def make_dir(file_path):
    """Create the parent directory of ``file_path`` when it does not exist yet.

    Only the directory part of the path (``os.path.dirname``) is created, so a
    path ending in a separator such as ``'./out/'`` creates ``./out``.

    Args:
        file_path: Path to a file, or a directory path with a trailing separator.
    """
    directory = os.path.dirname(file_path)
    # A bare file name has an empty dirname; os.makedirs('') would raise, and
    # there is nothing to create because the file lives in the current directory.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
        
def isNaN(x):
    """Return True only when ``x`` is a numeric NaN.

    Values that are not real numbers (strings, ``None``, containers, ...) are
    never NaN, so the literal text ``'nan'`` is reported as False instead of
    being mistaken for a missing value.

    Args:
        x: Any value.

    Returns:
        bool: True if ``x`` is a float-like NaN, False otherwise.
    """
    import math

    try:
        return math.isnan(x)
    except (TypeError, ValueError, OverflowError):
        # math.isnan rejects anything it cannot convert to a float.
        return False

In [10]:
# Constants
# Tables exported from the SQLite database, one csv.gz file per table.
tables = [
    'media_types',
    'genres',
    'playlists',
    'playlist_track',
    'tracks',
    'artists',
    'invoices',
    'invoice_items',
    'albums',
    'customers',
    'employees',
]
# Output directory of the ingestion step.
DIR_SALIDA = './ingesta/'
# Output directory and file of the transformation step.
DIR_TRANSFORMACION = './transformacion/'
FILE_TRANSFORMACION = './transformacion/canciones.csv.gz'

In [11]:
# Read every table into a pandas DataFrame and dump it to a gzip-compressed csv file
make_dir(DIR_SALIDA)
con = sqlite3.connect("chinook.db")

try:
    for tab in tables:
        # Table names come from the hard-coded `tables` constant, not from user input.
        query = 'SELECT * from ' + str(tab)
        # Named `tabla_df` so the `DataFrame as df` import alias is not overwritten.
        tabla_df = pd.read_sql_query(query, con)
        file_out = DIR_SALIDA + str(tab) + '.csv.gz'
        tabla_df.to_csv(file_out, sep=';', encoding='utf-8', compression='gzip', index=False)
finally:
    # Release the database handle even when a query or a write fails.
    con.close()

In [12]:
############################################################################################################
# OBJECTIVE 1: Read the artists table in blocks of 100 rows and write each block to its own csv file so   #
#              the files can be joined afterwards                                                          #
############################################################################################################
CHUNK_SIZE = 100  # rows per output file
tabla = 'artists'

con = sqlite3.connect("chinook.db")
try:
    registros = con.execute('SELECT  count(*) from ' + str(tabla)).fetchone()[0]

    for n, offset in enumerate(range(0, registros, CHUNK_SIZE)):
        # Without ORDER BY SQLite does not guarantee row order, so consecutive
        # LIMIT/OFFSET pages could overlap or skip rows.
        # NOTE(review): assumes `artists` is a regular rowid table - confirm.
        query = ('SELECT * from ' + str(tabla) + ' ORDER BY rowid'
                 + ' LIMIT ' + str(offset) + ',' + str(CHUNK_SIZE))
        chunk_df = pd.read_sql_query(query, con)

        if n == 0:
            # Only the first file carries the header row, so that concatenating
            # the files in order yields a single valid csv.
            file_out = DIR_SALIDA + '00000.csv.gz'
            write_header = True
        else:
            file_out = DIR_SALIDA + '00000_' + str(n) + '.csv.gz'
            write_header = False

        chunk_df.to_csv(file_out, sep=';', encoding='utf-8', compression='gzip',
                        index=False, header=write_header)
finally:
    # Release the database handle even when a query or a write fails.
    con.close()

In [13]:
# Transformation: load the ingested csv dumps back into DataFrames
make_dir(DIR_TRANSFORMACION)


def _read_ingested(table, dtype):
    """Load ``DIR_SALIDA + table + '.csv.gz'`` using the format written by the ingestion step."""
    return pd.read_csv(DIR_SALIDA + table + '.csv.gz', sep=';', encoding='utf-8',
                       compression='gzip', dtype=dtype)


# The builtin int/float are used instead of np.int/np.float: those aliases were
# deprecated in NumPy 1.20 and removed in 1.24, where they raise AttributeError.
media_types = _read_ingested('media_types', {'MediaTypeId': int})
tracks = _read_ingested('tracks', {'TrackId': int, 'AlbumId': int, 'MediaTypeId': int})
albums = _read_ingested('albums', {'AlbumId': int, 'ArtistId': int})
artists = _read_ingested('artists', {'ArtistId': int})
customers = _read_ingested('customers', {'CustomerId': int, 'SupportRepId': int})
invoices = _read_ingested('invoices', {'CustomerId': int, 'InvoiceId': int})
# NOTE(review): 'InvoideItemId' looks like a typo for the line-id column of
# invoice_items; the key is kept unchanged here - confirm the intended column name.
invoice_items = _read_ingested(
    'invoice_items',
    {'InvoideItemId': int, 'InvoiceId': int, 'TrackId': int, 'UnitPrice': float, 'Quantity': int},
)
genres = _read_ingested('genres', {'GenreId': int})


In [14]:
########################################################################################################################
# OBJECTIVE 2                                                                                                          #
########################################################################################################################

# Album: attach the album columns to every track and normalise the title
tracks = pd.merge(tracks, albums, on='AlbumId', how='left')
tracks = tracks.rename(columns={'Title': 'AlbumTitle', 'Name': 'TrackName'})
# The .str accessor skips missing values; a left merge leaves NaN for tracks
# without a matching album, on which a plain x.upper() would raise.
tracks['AlbumTitle'] = tracks['AlbumTitle'].str.upper().str.strip()
tracks = tracks.drop('AlbumId', axis=1)


In [15]:
# Artist: attach the artist name to every track and normalise it to title case
tracks = pd.merge(tracks, artists, on='ArtistId', how='left')
tracks = tracks.rename(columns={'Name': 'ArtistName'})
# The .str accessor skips missing values; a left merge leaves NaN for tracks
# without a matching artist, on which a plain x.title() would raise.
tracks['ArtistName'] = tracks['ArtistName'].str.title().str.strip()
tracks = tracks.drop('ArtistId', axis=1)

In [16]:
# Genre: swap the GenreId key for the genre name
tracks = (
    pd.merge(tracks, genres, on='GenreId', how='left')
    .rename(columns={'Name': 'GenreName'})
    .drop('GenreId', axis=1)
)

In [17]:
# Media type: swap the MediaTypeId key for the media type name
tracks = (
    pd.merge(tracks, media_types, on='MediaTypeId', how='left')
    .rename(columns={'Name': 'MediaName'})
    .drop('MediaTypeId', axis=1)
)

In [18]:
# canciones maps TrackId -> list of (customer full name, company, unit price, quantity),
# one tuple per invoice line in which the track was bought.
canciones = defaultdict(list)

# Join every invoice line with its own invoice and that invoice's customer.
# The previous loop always looked up the customer's first invoice and then read
# rows 0..len(items)-1 of the whole invoice_items table instead of the filtered
# lines, so purchases were attributed to the wrong tracks.
# Only the needed columns are selected so the joins cannot produce suffixed
# duplicates. `invoices` is the left frame to keep invoice order.
compras = (
    invoices[['InvoiceId', 'CustomerId']]
    .merge(invoice_items[['InvoiceId', 'TrackId', 'UnitPrice', 'Quantity']],
           on='InvoiceId', how='inner')
    .merge(customers[['CustomerId', 'FirstName', 'LastName', 'Company']],
           on='CustomerId', how='inner')
)

for purchase in compras.itertuples(index=False):
    nombre_completo = purchase.FirstName.upper() + ' ' + purchase.LastName.upper()
    # A missing company is read back from csv as NaN.
    empresa = 'N/A' if isNaN(purchase.Company) else str(purchase.Company)
    canciones[int(purchase.TrackId)].append(
        (nombre_completo, empresa, float(purchase.UnitPrice), int(purchase.Quantity))
    )

In [19]:
# Attach to every track the list of purchases collected in `canciones`; tracks
# that were never bought get a missing value.
# DataFrame.set_value was removed in pandas 1.0, so the column is built with
# Series.map instead. dict.get is used (not canciones[...]) so the defaultdict
# does not grow an empty list for every track that was never bought, and a
# purchased TrackId absent from `tracks` no longer raises IndexError.
tracks['Users'] = tracks['TrackId'].map(lambda track_id: canciones.get(track_id))

# FILE_TRANSFORMACION is the same path as DIR_TRANSFORMACION + 'canciones.csv.gz'.
# NOTE(review): the DataFrame index is written as the first column here (no
# index=False), unlike the ingestion dumps - confirm this is intended.
tracks.to_csv(FILE_TRANSFORMACION, sep=';', encoding='utf-8', compression='gzip')