# Adquisición de datos para finanzas

## 1. Adquisición de datos a partir de ficheros

jupyter notebook

### 1.1 Ficheros separados por coma (CSV).

In [1]:
import pandas as pd

# No `sep` argument is needed here: pandas.read_csv splits fields on ',' by default.
invoices_df = pd.read_csv(filepath_or_buffer='../data/ecommerce.csv')
print(invoices_df.head(n=5))

  InvoiceNo StockCode                          Description  Quantity   
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER       6.0  \
1    536365     71053                  WHITE METAL LANTERN       6.0   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER       8.0   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE       6.0   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.       6.0   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  


In [2]:
# A delimiter other than ',' (for example ';', a tab or '#') must be passed explicitly.
# It is deliberately left out here, so every row is read into one single column.
invoices_semicolon_sep_df = pd.read_csv(
    filepath_or_buffer='../data/ecommerce_semicolon_sep.csv',
)
print(invoices_semicolon_sep_df.head(n=5))

  InvoiceNo;StockCode;Description;Quantity;InvoiceDate;UnitPrice;CustomerID;Country
0  536365;85123A;WHITE HANGING HEART T-LIGHT HOLD...                               
1  536365;71053;WHITE METAL LANTERN;6;12/1/2010 8...                               
2  536365;84406B;CREAM CUPID HEARTS COAT HANGER;8...                               
3  536365;84029G;KNITTED UNION FLAG HOT WATER BOT...                               
4  536365;84029E;RED WOOLLY HOTTIE WHITE HEART.;6...                               


In [3]:
# Same file, this time with the ';' delimiter passed explicitly through `sep`.
invoices_semicolon_sep_df = pd.read_csv(
    filepath_or_buffer='../data/ecommerce_semicolon_sep.csv',
    sep=';',
)
print(invoices_semicolon_sep_df.head(n=5))

  InvoiceNo StockCode                          Description  Quantity   
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6  \
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  


### 1.3. Ficheros de Excel.


In [5]:
import pandas as pd

# Read only the worksheet named "2024" from the Excel workbook.
invoices_df = pd.read_excel(
    io='../data/ecommerce_excel.xlsx',
    sheet_name="2024",
)
print(invoices_df.head(n=5))

  InvoiceNo StockCode                          Description  Quantity   
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER       6.0  \
1    536365     71053                  WHITE METAL LANTERN       6.0   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER       8.0   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE       6.0   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.       6.0   

           InvoiceDate UnitPrice  CustomerID         Country  
0  2010-01-12 08:26:00      2.55     17850.0  United Kingdom  
1  2010-01-12 08:26:00      3.39     17850.0  United Kingdom  
2  2010-01-12 08:26:00      2.75     17850.0  United Kingdom  
3  2010-01-12 08:26:00      3.39     17850.0  United Kingdom  
4  2010-01-12 08:26:00      3.39     17850.0  United Kingdom  


### 1.4. Ficheros JSON.

### 1.5. Ficheros en formato parquet.

[Apache Parquet](https://parquet.apache.org/)
* ¿Qué es?
Apache Parquet es un formato de fichero columnar y de código abierto, diseñado para almacenar, leer y escribir datos de forma eficiente.



[Apache Arrow](https://arrow.apache.org/)


In [4]:
import pandas as pd

# `engine` selects the parquet library used to read the file.
# Accepted values are 'pyarrow', 'fastparquet' and 'auto'.
# When no engine is given, the pd.options.io.parquet.engine option is checked;
# if that is also 'auto', pyarrow is tried first, falling back to fastparquet.
ecommerce_parquet_df = pd.read_parquet(
    path='../data/ecommerce.parquet',
    engine='pyarrow',
)
print(ecommerce_parquet_df.head(n=5))

  InvoiceNo StockCode                          Description  Quantity   
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6  \
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom  


## 2. Adquisición de datos a través de APIs.

In [5]:
import os

import requests
import pandas as pd
from pandas import json_normalize


url = "https://real-time-product-search.p.rapidapi.com/search"

querystring = {
    "q": "Nike shoes",
    "country": "us",
    "language": "en",
    "limit": "30",
}

# The API key is a secret, so it is read from the RAPIDAPI_KEY environment
# variable instead of being stored in the notebook. A missing variable raises
# KeyError here, before any request is sent.
# NOTE(review): a key was previously committed in this cell; it should be rotated.
headers = {
    "X-RapidAPI-Key": os.environ["RAPIDAPI_KEY"],
    "X-RapidAPI-Host": "real-time-product-search.p.rapidapi.com",
}

# requests has no default timeout; without one an unresponsive server would
# hang the kernel indefinitely.
http_response = requests.get(url, headers=headers, params=querystring, timeout=30)

# Fail loudly on 4xx/5xx instead of parsing an error payload as if it were data.
http_response.raise_for_status()

# `response` holds the decoded JSON body (a dict), not the HTTP response object.
response = http_response.json()
print(response)

{'status': 'OK', 'request_id': '0fea6d82-d9cc-4cd1-81aa-57e662af3546', 'data': [{'product_id': '1895888000104236047', 'product_id_v2': '1895888000104236047:17750431743876774496', 'product_title': 'Nike PS Dunk Low - White / Black 11.5C', 'product_description': "The Nike Dunk Low Retro White Black (PS) sneakers combine iconic style with modern comfort. With its timeless white and black colorway, these sneakers are versatile and perfect for any occasion. The retro design pays homage to the original Nike Dunk, while the low-top silhouette offers a contemporary vibe. Crafted with premium materials, these sneakers provide durability and support. Whether you're hitting the skate park or strolling the streets, the Nike Dunk Low Retro White Black (PS) sneakers will elevate your footwear game.", 'product_photos': ['https://encrypted-tbn3.gstatic.com/shopping?q=tbn:ANd9GcT8NrfBUVX148PzI8YgUOGCPdUPTIjt4TiSUpHPd6PzV6AL_V-JKoSz3yHS6V713OBSr1iXSgIRR2ZIyKfjjCS5LcZG3PwdSA&usqp=CAE', 'https://encrypted

In [6]:
# Keep only the "data" entry of the decoded API response.
# Despite the `_dict` suffix, the value printed below is a list with one dict
# per product.
data_dict = response["data"]
print(data_dict)

[{'product_id': '1895888000104236047', 'product_id_v2': '1895888000104236047:17750431743876774496', 'product_title': 'Nike PS Dunk Low - White / Black 11.5C', 'product_description': "The Nike Dunk Low Retro White Black (PS) sneakers combine iconic style with modern comfort. With its timeless white and black colorway, these sneakers are versatile and perfect for any occasion. The retro design pays homage to the original Nike Dunk, while the low-top silhouette offers a contemporary vibe. Crafted with premium materials, these sneakers provide durability and support. Whether you're hitting the skate park or strolling the streets, the Nike Dunk Low Retro White Black (PS) sneakers will elevate your footwear game.", 'product_photos': ['https://encrypted-tbn3.gstatic.com/shopping?q=tbn:ANd9GcT8NrfBUVX148PzI8YgUOGCPdUPTIjt4TiSUpHPd6PzV6AL_V-JKoSz3yHS6V713OBSr1iXSgIRR2ZIyKfjjCS5LcZG3PwdSA&usqp=CAE', 'https://encrypted-tbn3.gstatic.com/shopping?q=tbn:ANd9GcQZZn2-w-DfKNBSSrtVwvSPeklf-JVDRBlttQ52m3

In [7]:
# Subset of product fields kept for the analysis; 'offer' still holds a nested dict.
selected_cols = [
    'product_id', 'product_title', 'product_rating',
    'typical_price_range', 'offer',
]

# Build a frame from the list of product records and keep only the chosen columns.
data_df = pd.DataFrame(data_dict).loc[:, selected_cols]
print(data_df.head(n=5))

             product_id                                      product_title   
0   1895888000104236047             Nike PS Dunk Low - White / Black 11.5C  \
1  16474445837437288542  Nike Grade School Court Borough Low Recraft Wh...   
2   2060730182710679218  Nike Court Vision Low Next Nature White/Pink W...   
3    556418746386863519               Air Jordan 1 Mid White/Gym Red-Black   
4  15793769793121642732                                 Jordan 1 Mid SE PS   

   product_rating typical_price_range   
0             4.4    [$70.00, $84.00]  \
1             4.7    [$67.00, $67.00]   
2             4.6    [$74.99, $88.00]   
3             4.7         [$75, $129]   
4             4.8    [$80.00, $80.00]   

                                               offer  
0  {'store_name': 'Nike', 'store_rating': 4.5, 'o...  
1  {'store_name': 'Famous Footwear', 'store_ratin...  
2  {'store_name': 'ShopWSS', 'store_rating': None...  
3  {'store_name': 'StockX', 'store_rating': 4.1, ...  
4  {'stor

In [8]:
# Expand each dict stored in the 'offer' column into its own set of columns.
df_aplanado = json_normalize(data_df['offer'])

# Offer fields that are not needed, plus the raw 'offer' dict itself, which is
# redundant once it has been expanded.
cols_to_drop = [
    'offer', 'offer_page_url', 'store_reviews_page_url', 'original_price',
    'product_condition', 'buy_now_url', 'on_sale', 'shipping',
]

# Place the expanded offer columns next to the original ones, then discard the
# unwanted columns in the same expression.
df_resultante = (
    pd.concat([data_df, df_aplanado], axis="columns")
    .drop(columns=cols_to_drop)
)
print(df_resultante.head(n=5))

             product_id                                      product_title   
0   1895888000104236047             Nike PS Dunk Low - White / Black 11.5C  \
1  16474445837437288542  Nike Grade School Court Borough Low Recraft Wh...   
2   2060730182710679218  Nike Court Vision Low Next Nature White/Pink W...   
3    556418746386863519               Air Jordan 1 Mid White/Gym Red-Black   
4  15793769793121642732                                 Jordan 1 Mid SE PS   

   product_rating typical_price_range        store_name  store_rating   
0             4.4    [$70.00, $84.00]              Nike           4.5  \
1             4.7    [$67.00, $67.00]   Famous Footwear           4.4   
2             4.6    [$74.99, $88.00]           ShopWSS           NaN   
3             4.7         [$75, $129]            StockX           4.1   
4             4.8    [$80.00, $80.00]  Kids Foot Locker           4.4   

   store_review_count   price              tax  
0                1039  $70.00  +$8.00 est. 

## 3. Adquisición de datos a través de conexiones a bases de datos (BBDD).

### 3.1 Bases de datos relacionales (SQL).

Se ha generado una base de datos PostgreSQL en https://console.neon.tech/app/projects de forma gratuita para este caso. Se han insertado 18 registros del CSV de ecommerce trabajado previamente.

In [9]:
import os

import pandas as pd
from sqlalchemy import create_engine, URL

# The database password is a secret, so it is read from the PGPASSWORD
# environment variable instead of being stored in the notebook. A missing
# variable raises KeyError here, before any connection attempt.
# NOTE(review): a password was previously committed in this cell; it should be rotated.
url_object = URL.create(
    "postgresql",
    username="ismaelcazalilla",
    password=os.environ["PGPASSWORD"],
    host="ep-throbbing-haze-36918596.eu-central-1.aws.neon.tech",
    database="adquisicion_datos",
)

# Create the engine used to open connections to the database.
db_engine = create_engine(url_object)

# Connectivity check: open one connection and release it right away.
# A bare `db_engine.connect()` would leave that connection open.
with db_engine.connect():
    pass

: 

In [None]:
# Open a connection and a transaction, then run the query on that same
# connection. Passing the engine as `con` would leave `conn` unused and make
# pandas open a second connection outside this transaction.
with db_engine.connect() as conn, conn.begin():
    df = pd.read_sql_query("SELECT * FROM adquisicion.ecommerce", con=conn)

# The frame is fully loaded at this point, so the connection is already released.
print(df.head())

  invoiceno stockcode                          description  quantity   
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6  \
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          invoicedate  unitprice customerid         country  
0 2010-12-01 08:26:00       2.55      17850  United Kingdom  
1 2010-12-01 08:26:00       3.39      17850  United Kingdom  
2 2010-12-01 08:26:00       2.75      17850  United Kingdom  
3 2010-12-01 08:26:00       3.39      17850  United Kingdom  
4 2010-12-01 08:26:00       3.39      17850  United Kingdom  
