In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import csv
import random
import pandas as pd
from tqdm import tqdm
import datetime
import logging
import re
import configparser
import psycopg2
import socket
import sys

In [None]:
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open without raising and returns
# the list of files it actually parsed, so an empty result means nothing was loaded.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read configuration file '../config.ini'")

# Database credentials come from the [database] section of the config file
db_settings = config['database']
db_endpoint = db_settings['host']
db_name = db_settings['database_name']
db_user = db_settings['username']
db_password = db_settings['password']
db_port = int(db_settings['port'])

# Timeout for the connection attempt (in seconds)
connection_timeout = 10

try:
    # Open the database connection with psycopg2
    connection = psycopg2.connect(
        host=db_endpoint,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password,
        # libpq option: without it the timeout defined above is never applied
        connect_timeout=connection_timeout,
    )
    print("Connected successfully!")

except (socket.timeout, psycopg2.OperationalError) as e:
    if isinstance(e, socket.timeout):
        print("Error: Connection timed out.")
    else:
        print("Error during connection:", e)
    sys.exit(1)  # Terminate the program with a non-zero exit code

## EDA para Trading Historic

In [None]:
# Pull every row of tbTradingHistoric; the cursor is only needed while fetching.
with connection.cursor() as cursor:
    cursor.execute("SELECT * FROM tbTradingHistoric;")
    data = cursor.fetchall()
    # The first field of each description entry is the column name
    columns = [column_meta[0] for column_meta in cursor.description]

# Build the frame once the rows and headers are in memory
df_trading_historic = pd.DataFrame(data, columns=columns)

In [None]:
# Preview the first rows of the trading-history frame
df_trading_historic.head()

In [None]:
# List the column labels as returned by the database cursor
df_trading_historic.columns

In [None]:
# Basic statistical summary of the DataFrame
print(df_trading_historic.describe())

In [None]:
# General information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df_trading_historic.info()

In [None]:
# Count null values in each column
print(df_trading_historic.isnull().sum())

In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row

duplicate_regist = df_trading_historic.duplicated().sum()
print(f"Cantidad de registros duplicados: {duplicate_regist}")

In [None]:
# Count the distinct values in the 'Symbol' column (nunique skips nulls by default)
unique_symbols_count = df_trading_historic['Symbol'].nunique()
print(f"Cantidad de símbolos únicos: {unique_symbols_count}")

In [None]:
# Distribution of opening prices for symbol 'A'.
# Missing values are dropped first: NaNs in the input turn the box statistics
# into NaN, which would leave the plot empty.
open_prices_a = df_trading_historic.loc[df_trading_historic['Symbol'] == 'A', 'Open'].dropna()

fig, ax = plt.subplots()
ax.boxplot(open_prices_a)
ax.set_title("Open price distribution - Symbol 'A'")
ax.set_ylabel('Open')
plt.show()

In [None]:
# Preview the first rows again before the growth analysis
df_trading_historic.head()

In [None]:
# Step 1: Convert 'Date' to datetime so rows can be ordered chronologically
df_trading_historic['Date'] = pd.to_datetime(df_trading_historic['Date'])

# Step 2: Drop rows with missing values in the 'Close' column
df_trading_historic = df_trading_historic.dropna(subset=['Close'])

# Step 3: Take the first and last closing price of every symbol.
# The rows come from a SELECT without ORDER BY, so their order is not guaranteed;
# sorting by date makes 'first'/'last' really the earliest and latest closes.
closes_by_date = df_trading_historic.sort_values(['Symbol', 'Date'])
symbol_growth = (
    closes_by_date
    .groupby('Symbol')
    .agg(first_close=('Close', 'first'), last_close=('Close', 'last'))
    .reset_index()
)

# Step 4: Percentage growth between the first and the last closing price
symbol_growth['percentage_growth'] = (
    (symbol_growth['last_close'] - symbol_growth['first_close'])
    / symbol_growth['first_close'] * 100
)

# Step 5: Sort in descending order of percentage growth
df_symbol_growth = symbol_growth.sort_values(by='percentage_growth', ascending=False)

# Print the result
print(df_symbol_growth)

In [None]:
# List the column names of the trading-history table from the database catalog.
# The query has to go through the cursor: bare SQL in a Python code cell is a
# SyntaxError and stops "Run All".
with connection.cursor() as cursor:
    cursor.execute(
        "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = %s;",
        ('tbtradinghistoric',),
    )
    trading_historic_columns = [row[0] for row in cursor.fetchall()]

trading_historic_columns

## EDA para Tickers

In [None]:
# Pull every row of tbtickers; the cursor is only needed while fetching.
with connection.cursor() as cursor:
    cursor.execute("SELECT * FROM tbtickers;")
    data = cursor.fetchall()
    # The first field of each description entry is the column name
    columns = [column_meta[0] for column_meta in cursor.description]

# Build the frame once the rows and headers are in memory
df_tickers = pd.DataFrame(data, columns=columns)

In [None]:
# Preview the first rows of the tickers frame
df_tickers.head()

In [None]:
# List the column labels as returned by the database cursor
df_tickers.columns

In [None]:
# Basic statistical summary of the tickers frame
df_tickers.describe()

In [None]:
# General information about the tickers frame (info() prints its own report)
df_tickers.info()

In [None]:
# Count null values in each column
print(df_tickers.isnull().sum())

In [None]:
# Count the distinct values in the 'Symbol' column (nunique skips nulls by default)
unique_tickers_count = df_tickers['Symbol'].nunique()
print(f"Cantidad de símbolos únicos: {unique_tickers_count}")

In [None]:
from collections import Counter

# Tally every ticker symbol, then keep only those seen more than once
symbol_column = df_tickers['Symbol']
symbol_count = Counter(symbol_column)
repeated_symbols = dict(
    (symbol, occurrences)
    for symbol, occurrences in symbol_count.items()
    if occurrences > 1
)
print(f'Cntidad de Symbol repetidos: {repeated_symbols}')

## EDA para GIDS Directory

In [None]:
# Pull every row of tbgidsdirectory; the cursor is only needed while fetching.
with connection.cursor() as cursor:
    cursor.execute("SELECT * FROM tbgidsdirectory;")
    data = cursor.fetchall()
    # The first field of each description entry is the column name
    columns = [column_meta[0] for column_meta in cursor.description]

# Build the frame once the rows and headers are in memory
df_gids = pd.DataFrame(data, columns=columns)

In [None]:
# Preview the first rows of the GIDS directory frame
df_gids.head()

In [None]:
# List the column labels as returned by the database cursor
df_gids.columns

In [None]:
# Basic statistical summary of the GIDS directory frame
df_gids.describe()

In [None]:
# General information about the GIDS directory frame (info() prints its own report)
df_gids.info()

In [None]:
# Count null values in each column
print(df_gids.isnull().sum())

In [None]:
# Count the distinct values in the 'Symbol' column (nunique skips nulls by default)
unique_gids_count = df_gids['Symbol'].nunique()
print(f"Cantidad de símbolos únicos: {unique_gids_count}")

In [None]:
from collections import Counter

# Tally every GIDS symbol, then keep only those seen more than once
symbol_column_gids = df_gids['Symbol']
symbol_count_gids = Counter(symbol_column_gids)
repeated_symbols_gids = dict(
    (symbol, occurrences)
    for symbol, occurrences in symbol_count_gids.items()
    if occurrences > 1
)
print(f'Cantidad de Symbol repetidos: {repeated_symbols_gids}')

In [None]:
from collections import Counter

# Count how often each Name occurs and keep the repeated ones.
# The 'Name' column is the one to inspect here; reading 'Symbol' would only
# repeat the symbol check and report it under the "Names" label.
Name_column_gids = df_gids['Name']
Name_count_gids = Counter(Name_column_gids)
repeated_Name_gids = {name: count for name, count in Name_count_gids.items() if count > 1}
print(f'Cantidad de Names repetidos: {repeated_Name_gids}')

#### Acá hay un problema: la idea es que los nombres (`Name`) se agrupen en valores únicos, como Nasdaq-100 o S&P 500, por lo que hay que corregirlo en la ETL. Para ello es ideal preparar previamente una lista de estos valores e incluirla en la ETL.

In [167]:
# Alternation of the index/fund family names used to group GIDS names.
# Every alternative starts with \b; the "Strategic Hotel & Lodging" entry must
# use \bS (word boundary + "S"), because \S would match ANY non-space character
# followed by "trategic Hotel & Lodging".
pattern = (
    r'(\bNasdaq-100\b)|(\bNasdaq\b)|(\b[A-Z]+ ETF\b)|(\b[A-Z]+ Muni Bond ETF\b)'
    r'|(\bS&P\b)|(\bNASDAQ\b)|(\bThe Capital Strength\b)|(\bSettle\b.*)|(\bFidelity Disruptive\b.*)'
    r'|(\bPHLX\b)|(\bThe Capital Strength\b)|(\bOMX\b)|(\bOMRX\b)|(\bFirst North\b)'
    r'|(\bDorsey Wright\b)|(\bCompass EMP\b)|(\bGlobal X\b)|(\bOptimal Blue 30Yr\b)'
    r'|(\bStrategic Technology & Ecommerce\b)|(\bStrategic Hotel & Lodging\b)|(\bStrategic E-Commerce\b)'
    r'|(\bStrategic Fintech & Digital Payments\b)|(\bCRSP US\b)'
)

In [165]:

# Regular expression with the index/fund family names used for grouping.
# Compiled once at definition time instead of being rebuilt on every call.
# The "Strategic Hotel & Lodging" entry uses \bS (word boundary + "S"); \S would
# match any non-space character followed by "trategic Hotel & Lodging".
_GROUP_NAME_RE = re.compile(
    r'(\bNasdaq-100\b)|(\bNasdaq\b)|(\b[A-Z]+ ETF\b)|(\b[A-Z]+ Muni Bond ETF\b)'
    r'|(\bS&P\b)|(\bNASDAQ\b)|(\bThe Capital Strength\b)|(\bSettle\b.*)|(\bFidelity Disruptive\b.*)'
    r'|(\bPHLX\b)|(\bThe Capital Strength\b)|(\bOMX\b)|(\bOMRX\b)|(\bFirst North\b)'
    r'|(\bDorsey Wright\b)|(\bCompass EMP\b)|(\bGlobal X\b)|(\bOptimal Blue 30Yr\b)'
    r'|(\bStrategic Technology & Ecommerce\b)|(\bStrategic Hotel & Lodging\b)|(\bStrategic E-Commerce\b)'
    r'|(\bStrategic Fintech & Digital Payments\b)|(\bCRSP US\b)'
)


# Function to group similar names
def group_names(name):
    """Map a name to its group label.

    Args:
        name: The name to classify. Values that are not strings (for example
            ``None``) are returned unchanged instead of raising ``TypeError``.

    Returns:
        The text of the leftmost match of the grouping pattern inside ``name``,
        or ``name`` itself when no alternative matches.
    """
    if not isinstance(name, str):
        return name

    match = _GROUP_NAME_RE.search(name)
    if match:
        return match.group()
    return name

# Derive the 'Grouped Name' column by running group_names over every 'Name'
df_gids['Grouped Name'] = df_gids['Name'].map(group_names)

# Show the original and grouped names side by side
print(df_gids[['Symbol', 'Name', 'Grouped Name']])


       Symbol                                             Name  \
0        COMP                                 NASDAQ Composite   
1        INDS                                Nasdaq Industrial   
2        BANK                                      Nasdaq Bank   
3        INSR                                 Nasdaq Insurance   
4        OFIN  Nasdaq Real Estate and Other Financial Services   
...       ...                                              ...   
9970     CALY    BlackRock Short-Term California Muni Bond ETF   
9971   XNDX7E           Nasdaq-100 Volatility Control 7% Index   
9972  XNDX10E          Nasdaq-100 Volatility Control 10% Index   
9973  XNDX12E          Nasdaq-100 Volatility Control 12% Index   
9974     JPEF                        JPMorgan Equity Focus ETF   

                                       Grouped Name  
0                                            NASDAQ  
1                                            Nasdaq  
2                                            

In [166]:
# Distinct group labels, in order of first appearance
unique_grouped_gidsnames = df_gids['Grouped Name'].drop_duplicates()
unique_grouped_gidsnames_list = unique_grouped_gidsnames.tolist()
# Print the labels and how many there are
print(unique_grouped_gidsnames_list)
print(len(unique_grouped_gidsnames_list))

['NASDAQ', 'Nasdaq', 'PHLX', 'The Capital Strength', 'Settle - NASDAQ Composite', 'Settle - NASDAQ-100', 'Settle - NASDAQ Biotechnology', 'Settle - PHLX Gold/Silver Sector', 'Settle - PHLX Semiconductor', 'Settle - PHLX Housing Sector', 'Settle - PHLX Oil Service Sector', 'Settle - PHLX Utility Sector', 'Settle - Australian WCO', 'Settle - British Pounds WCO', 'Settle - Canadian Dollars WCO', 'Settle - Euros WCO', 'Settle - Japanese Yen WCO', 'Settle - Swiss Francs WCO', 'Settle - New Zealand WCO', 'OMX', 'OMX_Baltic_Benchmark_PI', 'OMX_Baltic_GI', 'OMX_Baltic_PI', 'Dorsey Wright', 'Endowment Index', 'First North', 'N Energy EUR GI', 'N Energy EUR PI', 'N Basic Materials EUR GI', 'N Basic Materials EUR PI', 'N Chemicals EUR GI', 'N Chemicals EUR PI', 'N Basic Resources EUR GI', 'N Basic Resources EUR PI', 'N Industrials EUR GI', 'N Industrials EUR PI', 'N Construction and Materials EUR GI', 'N Construction and Materials EUR PI', 'N Industrial Goods and Services EUR GI', 'N Industrial G

In [156]:
# Helper to find names that look like "S&P 500"
def is_like_sp500(name):
    """Return True when ``name`` contains the text "S&P", ignoring case."""
    found = re.search(r'S&P', name, re.IGNORECASE)
    return found is not None

# Keep only the rows whose 'Name' looks like "S&P 500"
filtered_df = df_gids.loc[df_gids['Name'].map(is_like_sp500)]

# Show the matching names together with their group label
print(filtered_df[['Symbol', 'Name','Grouped Name']])

        Symbol                                               Name  \
9381      PSCD  PowerShares S&P SmallCap Consumer Discretionar...   
9382      PSCC      PowerShares S&P SmallCap Consumer Staples Ptf   
9383      PSCE          PowerShares S&P SmallCap Energy Portfolio   
9384      PSCF      PowerShares S&P SmallCap Financials Portfolio   
9385      PSCH     PowerShares S&P SmallCap Health Care Portfolio   
9386      PSCI     PowerShares S&P SmallCap Industrials Portfolio   
9387      PSCT      PowerShares S&P SmallCap Information Tech Ptf   
9388      PSCM       PowerShares S&P SmallCap Materials Portfolio   
9389      PSCU       PowerShares S&P SmallCap Utilities Portfolio   
9392      EMIF  iShares S&P Emerging Markets Infrastructure Ix Fd   
9393      ICLN         iShares S&P Global Clean Energy Index Fund   
9394      WOOD    iShares S&P Global Timber & Forestry Index Fund   
9397      ISHG    iShares S&P/Citigroup 1-3 Yr Intl Treasury Bond   
9398      IGOV  iShares S&P/Citigr

In [164]:
# Alternation of the index/fund family names used to group GIDS names.
# The "Strategic Hotel & Lodging" entry uses \bS (word boundary + "S"); \S would
# match any non-space character followed by "trategic Hotel & Lodging".
pattern = (
    r'(\bNasdaq-100\b)|(\bNasdaq\b)|(\b[A-Z]+ ETF\b)|(\b[A-Z]+ Muni Bond ETF\b)'
    r'|(\bS&P\b)|(\bNASDAQ\b)|(\bThe Capital Strength\b)|(\bSettle\b.*)|(\bFidelity Disruptive\b.*)'
    r'|(\bPHLX\b)|(\bThe Capital Strength\b)|(\bOMX\b)|(\bOMRX\b)|(\bFirst North\b)'
    r'|(\bDorsey Wright\b)|(\bCompass EMP\b)|(\bGlobal X\b)|(\bOptimal Blue 30Yr\b)'
    r'|(\bStrategic Technology & Ecommerce\b)|(\bStrategic Hotel & Lodging\b)|(\bStrategic E-Commerce\b)'
    r'|(\bStrategic Fintech & Digital Payments\b)|(\bCRSP US\b)'
)

# Quick probe: show which alternative fires for a sample S&P fund name
match = re.search(pattern, 'iShares S&P Global Clean Energy Index Fund')
if match:
    print(match.group())


S&P
