In [None]:
import os
import re
from urllib.parse import quote_plus

import dotenv
import numpy as np
import pandas as pd
import plotly.express as px
from dash import Dash, html, dcc, callback, Output, Input
from sqlalchemy import create_engine

In [None]:
# --- Connection Parameters ---
# Load variables from a local .env file into the process environment so that
# os.getenv() below can see them.
dotenv.load_dotenv()

# Host and password come from the environment (IP / PASSWORD); the database
# name and user are fixed.
DB_PARAMS = {
    "host": os.getenv("IP"),
    "database": "postgres",
    "user": "postgres",
    "password": os.getenv("PASSWORD"),
}

# Fail fast when a required variable is unset: os.getenv() returns None, which
# the f-string below would otherwise render as the literal text "None" and only
# surface later as a confusing connection/authentication error.
_missing_env = [
    env_name
    for env_name, key in (("IP", "host"), ("PASSWORD", "password"))
    if DB_PARAMS[key] is None
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# 1. Create the database connection URL.
# User and password are percent-encoded so characters such as '@', ':', '/' or
# '#' in the raw secret cannot be mistaken for URL delimiters. Store the raw
# (unencoded) password in the environment.
db_url = (
    f"postgresql+psycopg2://{quote_plus(DB_PARAMS['user'])}:{quote_plus(DB_PARAMS['password'])}"
    f"@{DB_PARAMS['host']}/{DB_PARAMS['database']}"
)

# 2. Engine shared by read_table() and create_and_populate_table().
engine = create_engine(db_url)

def read_table(table_name):
    """Return every row of a database table as a DataFrame.

    Args:
        table_name: Table identifier, either ``table`` or ``schema.table``.
            Only letters, numbers and underscores are accepted in each part,
            because the name is interpolated directly into the SQL text.

    Returns:
        pandas.DataFrame with the result of ``SELECT * FROM <table_name>``,
        read through the module-level ``engine``.

    Raises:
        ValueError: If ``table_name`` is not a plain ``table`` or
            ``schema.table`` identifier.
    """
    # Validate before building the query: identifiers cannot be passed as bound
    # parameters, so the name is checked against a strict whitelist instead
    # (same rule create_and_populate_table applies to its schema argument).
    if not re.fullmatch(r'[A-Za-z0-9_]+(\.[A-Za-z0-9_]+)?', table_name):
        raise ValueError(
            "Invalid table name. Expected 'table' or 'schema.table' using "
            "letters, numbers, underscore."
        )
    sql_query = f"SELECT * FROM {table_name};"
    return pd.read_sql_query(sql_query, engine)

def create_and_populate_table(df: pd.DataFrame, table_name: str, schema: str, exists: str = 'replace'):
    """Write a DataFrame to ``schema.table_name``, creating the schema if needed.

    Args:
        df: Frame to write; its index is not stored.
        table_name: Target table name, passed to ``DataFrame.to_sql``.
        schema: Target schema. Must consist only of letters, numbers and
            underscores because it is interpolated into a raw DDL statement.
        exists: Forwarded to ``DataFrame.to_sql`` as ``if_exists``.
            Defaults to ``'replace'``.

    Raises:
        ValueError: If ``schema`` contains any character outside
            ``[A-Za-z0-9_]`` or is empty.
    """
    # Validate schema name to avoid SQL injection. fullmatch() is used instead
    # of match() with '$', because '$' also matches just before a trailing
    # newline and would let a value such as "name\n" through.
    if not re.fullmatch(r'[A-Za-z0-9_]+', schema):
        raise ValueError("Invalid schema name. Allowed characters: letters, numbers, underscore.")
    # Create schema if not exists (exec_driver_sql runs the raw SQL string);
    # engine.begin() commits on success and rolls back on error.
    with engine.begin() as conn:
        conn.exec_driver_sql(f"CREATE SCHEMA IF NOT EXISTS {schema};")
    # Populate table.
    df.to_sql(table_name, engine, schema=schema, if_exists=exists, index=False)

Treino do modelo:

* Filme.FilmeNome
* Filme.DuracaoMin
* Filme.AnoDeLancamento
* Filme.GeneroNome
* Endereco.Estado
* Avaliacao.Nota (mean)
* Avaliacao (count)



In [8]:
# Load the dw_alv.endereco table from the database into a DataFrame.
dw_endereco = read_table("dw_alv.endereco")
# Bare last expression so the notebook renders the frame.
dw_endereco

Unnamed: 0,enderecosk,estado
0,1,Hawaii
1,2,Oregon
2,3,Delaware
3,4,Kansas
4,6,Ohio
5,7,West Virginia
6,8,Idaho
7,10,Vermont
8,11,Nebraska
9,12,Minnesota


In [9]:
# Read the IMDb movies dataset from a parquet file (path is relative to the
# notebook's working directory).
df_imdb = pd.read_parquet('aws/imdb_movies.parquet')
# Bare last expression so the notebook renders the frame.
df_imdb

Unnamed: 0,tconst,primarytitle,startyear,runtimeminutes,genres,averagerating,numvotes
0,tt37561269,Raju Gaani Savaal,2025.0,113.0,Action,9.6,1078
1,tt12119248,Manmauji,2024.0,138.0,Drama,9.6,757
2,tt34000241,Mannu Kya Karegga,2025.0,141.0,Drama,9.5,3103
3,tt36460794,Kousalya Tanaya Ragava,2025.0,147.0,\N,9.5,1212
4,tt33505969,Irudhi Muyarchi,2025.0,121.0,\N,9.5,1042
...,...,...,...,...,...,...,...
37371,tt0023878,Central Airport,1933.0,72.0,Drama,6.2,501
37372,tt0076754,The Swindle,1977.0,99.0,Action,6.2,501
37373,tt6504868,Sadie,2018.0,96.0,Drama,6.2,500
37374,tt4256516,"America, Here We Come",2014.0,90.0,Comedy,6.2,500


In [10]:
# Map the IMDb source column names to the names used in the model output table.
rename_map = {
    'primarytitle': 'FilmeNome',
    'startyear': 'AnoDeLancamento',
    'runtimeminutes': 'DuracaoMin',
    'genres': 'GeneroNome',
    'averagerating': 'IMDbAvaliacao',
    'numvotes': 'IMDbNumVotos',
}

# Tunable thresholds for this cell.
MIN_IMDB_RATING = 8       # keep films rated at least this (inclusive)
MIN_IMDB_VOTES = 1000     # keep films with strictly more votes than this
PREDICTION_SCALE = 5      # simulated predictions are drawn from [0, PREDICTION_SCALE)
MIN_PREDICTION = 3.0      # keep rows whose simulated prediction is at least this
RANDOM_SEED = 42          # fixed seed so Restart & Run All reproduces the same table

# Keep only the renamed columns, in rename_map order.
df_model_input = df_imdb.rename(columns=rename_map)[list(rename_map.values())]

# NOTE(review): GeneroNome can hold the literal string '\N' (visible in the
# displayed rows). It is a string, not NaN, so a dropna() will not remove it —
# confirm whether it should be treated as missing.
df_model_input = df_model_input[
    (df_model_input['IMDbAvaliacao'] >= MIN_IMDB_RATING)
    & (df_model_input['IMDbNumVotos'] > MIN_IMDB_VOTES)
]

estados = dw_endereco['estado'].unique().tolist()

# A local, seeded generator (instead of the unseeded global np.random state)
# makes this cell give the same result every time it is run.
rng = np.random.default_rng(RANDOM_SEED)

# One copy of the filtered films per state, each with its own simulated score.
frames_per_state = []
for estado in estados:
    df_temp = df_model_input.copy()
    df_temp['Estado'] = estado
    df_temp['PredicaoModelo'] = rng.random(len(df_temp)) * PREDICTION_SCALE  # Simulated model prediction
    frames_per_state.append(df_temp[df_temp['PredicaoModelo'] >= MIN_PREDICTION])

df_model_output = pd.concat(frames_per_state, ignore_index=True)

# NOTE(review): FilmeIMDbSK is the row position in the concatenated result, so
# the same film gets a different value for each state — confirm this is the
# intended key.
df_model_output = df_model_output.reset_index()
df_model_output = df_model_output.rename(columns={'index': 'FilmeIMDbSK'})

df_model_output

Unnamed: 0,FilmeIMDbSK,FilmeNome,AnoDeLancamento,DuracaoMin,GeneroNome,IMDbAvaliacao,IMDbNumVotos,Estado,PredicaoModelo
0,0,Raju Gaani Savaal,2025.0,113.0,Action,9.6,1078,Hawaii,3.857475
1,1,Kousalya Tanaya Ragava,2025.0,147.0,\N,9.5,1212,Hawaii,4.309428
2,2,Vidhrohi,2025.0,134.0,Crime,9.4,2080,Hawaii,4.726614
3,3,Golden Opulence: 500 Years of Luxury in Anatolia,2024.0,50.0,Documentary,9.4,1679,Hawaii,4.189924
4,4,Jibon Theke Neya,1970.0,150.0,Drama,9.3,2333,Hawaii,3.056018
...,...,...,...,...,...,...,...,...,...
35525,35525,Stretch and Bobbito: Radio That Changed Lives,2015.0,99.0,Documentary,8.0,1043,Alabama,3.920311
35526,35526,Kaksparsh,2012.0,140.0,Drama,8.0,1028,Alabama,3.793489
35527,35527,Pavithram,1994.0,156.0,Comedy,8.0,1017,Alabama,3.036581
35528,35528,Ryuichi Sakamoto: Opus,2023.0,103.0,Documentary,8.0,1014,Alabama,3.261109


In [16]:
# Drop every row that has a missing value in any column. The index is not
# reset, so FilmeIMDbSK values of the removed rows are simply absent afterwards.
df_model_output = df_model_output.dropna()

In [17]:
# Write the result to table model_infer in schema imdb_alv; exists='replace'
# is forwarded to DataFrame.to_sql as if_exists.
create_and_populate_table(df_model_output, table_name="model_infer", schema="imdb_alv", exists='replace')

Saída do modelo:

* imdb.primarytitle
* imdb.startyear
* imdb.runtimeminutes
* imdb.genres
* imdb.averagerating
* imdb.numvotes
* model.predict(imdb)

Nova tabela Fato:

* FilmeNome
* AnoDeLancamento
* DuracaoMin
* GeneroNome
* Estado
* imdb.averagerating
* imdb.numvotes
* model.predict(imdb)

In [24]:
# Also save the same frame as a parquet file (index not written); the path is
# relative to the notebook's working directory.
df_model_output.to_parquet('aws/imdb_model_infer.parquet', index=False)