In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import yfinance as yf
from datetime import datetime
# import autosklearn.classification
import seaborn as sns

RANDOM_SEED = 33

# Importando dados

In [2]:
PATH_PETR4_PRICES_TWEETS = 'TimeSeries/PETR4_prices_and_tweets_NLI_scores.csv'

In [3]:
df_petr4 = pd.read_csv(PATH_PETR4_PRICES_TWEETS)

In [4]:
df_petr4['Date'] = pd.to_datetime(df_petr4['Date'])

In [5]:
def extract_target_columns(df):
    df_aux = df.copy()
    close_price_per_day_1_day_shifted = df_aux['Close'][1:]
    close_price_per_day_1_day_shifted = pd.concat([close_price_per_day_1_day_shifted, pd.Series(np.nan)], ignore_index=True)
    # Incluindo o valor do fechamento do dia seguinte no dataframe auxiliar
    df_aux['Close for next day'] = close_price_per_day_1_day_shifted
    # Removendo o útlimo dia de coleta de dados por não sabermos o preço de fechamento do dia seguinte
    df_aux = df_aux[:-1]

    df_aux['Close one day variation'] = df_aux['Close for next day'] - df_aux['Close']
    df_aux['hasRise'] = df_aux['Close one day variation'] > 0
    df_aux['hasRise'] = df_aux['hasRise'].map({True:1, False:0})

    return df_aux

In [6]:
df_petr4_with_targets = extract_target_columns(df_petr4)

In [7]:
df_petr4_with_targets.isna().sum()[df_petr4_with_targets.isna().sum() > 0]

Series([], dtype: int64)

In [8]:
df_petr4_with_targets

Unnamed: 0,Date,stock,valorização_mean,valorização_median,valorização_q1,valorização_q3,valorização_count,bom negócio_mean,bom negócio_median,bom negócio_q1,...,Lucro Líquido - (R$),Lucro atribuído a Controladora,Lucro atribuído a Não Controladores,Dívida Bruta - (R$),Margem Bruta - (%),Margem Ebitda - (%),Margem Líquida - (%),Close for next day,Close one day variation,hasRise
0,2022-06-01,PETR4,0.501828,0.501828,0.253213,0.750443,2,0.491784,0.491784,0.367215,...,0.2166,0.2192,-0.3063,0.0116,0.0621,0.1461,0.0079,0.995318,-0.008696,0
1,2022-06-02,PETR4,0.501828,0.501828,0.253213,0.750443,2,0.491784,0.491784,0.367215,...,0.2166,0.2192,-0.3063,0.0116,0.0621,0.1461,0.0079,1.012709,0.017391,1
2,2022-06-03,PETR4,0.501828,0.501828,0.253213,0.750443,2,0.491784,0.491784,0.367215,...,0.2166,0.2192,-0.3063,0.0116,0.0621,0.1461,0.0079,1.013378,0.000669,1
3,2022-06-06,PETR4,0.501828,0.501828,0.253213,0.750443,2,0.491784,0.491784,0.367215,...,0.2166,0.2192,-0.3063,0.0116,0.0621,0.1461,0.0079,1.025418,0.012040,1
4,2022-06-07,PETR4,0.501828,0.501828,0.253213,0.750443,2,0.491784,0.491784,0.367215,...,0.2166,0.2192,-0.3063,0.0116,0.0621,0.1461,0.0079,1.020401,-0.005017,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2024-05-23,PETR4,0.552537,0.598493,0.172290,0.978739,4,0.479243,0.515898,0.385444,...,-0.1102,-0.1099,-0.1852,0.0115,-0.0296,-0.0025,-0.0564,1.224415,-0.006689,0
494,2024-05-24,PETR4,0.497588,0.438173,0.256983,0.770160,9,0.628261,0.685696,0.327318,...,-0.1102,-0.1099,-0.1852,0.0115,-0.0296,-0.0025,-0.0564,1.237793,0.013378,1
495,2024-05-27,PETR4,0.536770,0.565749,0.308531,0.825538,10,0.555506,0.534227,0.389310,...,-0.1102,-0.1099,-0.1852,0.0115,-0.0296,-0.0025,-0.0564,1.264214,0.026421,1
496,2024-05-28,PETR4,0.449981,0.393969,0.076732,0.845444,22,0.468577,0.530161,0.225760,...,-0.1102,-0.1099,-0.1852,0.0115,-0.0296,-0.0025,-0.0564,1.262542,-0.001672,0


In [9]:
df_petr4_with_targets.value_counts('hasRise')

hasRise
1    271
0    227
Name: count, dtype: int64

## Pré-processamento

In [10]:
df_petr4_with_targets = df_petr4_with_targets.drop('stock', axis='columns')

Configurando a coluna de Date como index

In [11]:
df_petr4_with_targets = df_petr4_with_targets.set_index('Date', drop=True)

Todas as colunas estão com o dtype correto:

In [12]:
print(df_petr4_with_targets.dtypes.to_string())

valorização_mean                         float64
valorização_median                       float64
valorização_q1                           float64
valorização_q3                           float64
valorização_count                          int64
bom negócio_mean                         float64
bom negócio_median                       float64
bom negócio_q1                           float64
bom negócio_q3                           float64
bom negócio_count                          int64
lucro_mean                               float64
lucro_median                             float64
lucro_q1                                 float64
lucro_q3                                 float64
lucro_count                                int64
neutro_mean                              float64
neutro_median                            float64
neutro_q1                                float64
neutro_q3                                float64
neutro_count                               int64
desvalorização_mean 

### Divisão em treino (para validação cruzada) e teste

Em uma situação real, o teste são os dias mais recentes. Vamos separar cerca de 25% do dataset para teste, o que equivale aos últimos 6 meses.

In [13]:
def split_df(df, split_day='2023-12-01'):
    df_train = df[:split_day]
    df_test = df[split_day:]
    print(f'train.shape: {df_train.shape}, test.shape:{df_test.shape}')

    return df_train, df_test

In [14]:
df_petr4_train, df_petr4_test = split_df(df_petr4_with_targets)

train.shape: (377, 65), test.shape:(122, 65)


## Normalizando os dados

In [15]:
from sklearn.preprocessing import StandardScaler

def normalize_and_separe_X_and_y(df_train, df_test):

    # Separando coluna target antes da normalização
    hasRise_train = df_train['hasRise']
    hasRise_test = df_test['hasRise']

    std_scaler = StandardScaler()
    std_scaler = std_scaler.fit(df_train.drop('hasRise', axis='columns'))
    train_norm = std_scaler.transform(df_train.drop('hasRise', axis='columns'))
    test_norm = std_scaler.transform(df_test.drop('hasRise', axis='columns'))

    df_train_norm = pd.DataFrame(train_norm, columns=df_train.drop('hasRise', axis='columns').columns)
    df_test_norm = pd.DataFrame(test_norm, columns=df_test.drop('hasRise', axis='columns').columns)
    
    # Separando colunas de target após normalização
    cols2drop = ['Close for next day', 'Close one day variation']
    close_train_norm = df_train_norm['Close for next day'] # train
    closeVariation_train_norm = df_train_norm['Close one day variation']
    X_train_norm = df_train_norm.drop(cols2drop, axis='columns')

    close_test_norm = df_test_norm['Close for next day'] # test
    closeVariation_test_norm = df_test_norm['Close one day variation']
    X_test_norm = df_test_norm.drop(cols2drop, axis='columns')

    
    return X_train_norm, close_train_norm, closeVariation_train_norm, hasRise_train, \
            X_test_norm, close_test_norm, closeVariation_test_norm, hasRise_test

In [16]:
(X_petr4_train_norm, close_petr4_train_norm, closeVariation_petr4_train_norm, hasRise_petr4_train, \
 X_petr4_test_norm, close_petr4_test_norm, closeVariation_petr4_test_norm, hasRise_petr4_test) = \
    normalize_and_separe_X_and_y(df_petr4_train, df_petr4_test)

In [17]:
# Conferindo dimensões
(X_petr4_train_norm.shape, close_petr4_train_norm.shape, closeVariation_petr4_train_norm.shape, hasRise_petr4_train.shape), '\n',\
(X_petr4_test_norm.shape, close_petr4_test_norm.shape, closeVariation_petr4_test_norm.shape, hasRise_petr4_test.shape)

(((377, 62), (377,), (377,), (377,)),
 '\n',
 ((122, 62), (122,), (122,), (122,)))

# Modelagem

Observando a quantidade de eventos no conjunto de treino/validação e teste. Podemos concluir que os modelos, por serem treinados com um dataset levemente desbalanceado, possuem maior viés para indicar a predição positiva `hasRise=1`, devido que, em geral, 

In [24]:
hasRise_petr4_train.value_counts(normalize=True)

hasRise
1    0.541114
0    0.458886
Name: proportion, dtype: float64

In [25]:
hasRise_petr4_test.value_counts(normalize=True)

hasRise
1    0.54918
0    0.45082
Name: proportion, dtype: float64

## XGBoost

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
import optuna

def objective(trial):
    # Definir o espaço de busca dos hiperparâmetros
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'tree_method': 'auto',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'eta': trial.suggest_float('eta', 1e-4, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-2, 1e1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-4, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-4, 1.0, log=True),
    }
    
    # Configurando o KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    
    # Inicializando o modelo XGBoost
    model = XGBClassifier(**param)
    
    # Realizando a validação cruzada
    scores = cross_val_score(model, X_petr4_train_norm, hasRise_petr4_train, cv=kf, scoring='accuracy')
    
    # Retornando a média dos scores como objetivo a ser minimizado
    return np.mean(scores)

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


### Avaliação 

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
study.best_trials

In [None]:
sorted_trials_df = study.trials_dataframe().sort_values('value', ascending=False)

In [None]:
sorted_trials = [study.trials[tid] for tid in sorted_trials_df['number']]

In [None]:
best_trials = sorted_trials[:3]

results = []

for i, trial in enumerate(best_trials):
    best_params = trial.params
    model = XGBClassifier(**best_params)
    
    # Treinando o modelo no conjunto de treinamento completo
    model.fit(X_petr4_train_norm, hasRise_petr4_train)
    
    # Fazendo previsões no conjunto de teste
    y_pred = model.predict(X_petr4_test_norm)
    
    # Avaliando o modelo
    accuracy = accuracy_score(hasRise_petr4_test, y_pred)
    report = classification_report(hasRise_petr4_test, y_pred, output_dict=True)
    
    results.append({
        'trial_number': trial.number,
        'accuracy': accuracy,
        'report': report
    })
    
    print(f"Results for model {i+1}:")
    print(f"Trial number: {trial.number}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(classification_report(hasRise_petr4_test, y_pred))


# AutoML

In [None]:
train_time_seconds = 4*60*60 # 4 horas
per_run_time_limit = 0.1*train_time_seconds



automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=train_time_seconds,  # total time in seconds
                                                          per_run_time_limit=per_run_time_limit,       # time limit for each model
                                                          ensemble_nbest=20,
                                                          initial_configurations_via_metalearning=25,
                                                          seed=RANDOM_SEED,
                                                          n_jobs=-1)


# automl.fit(np.array(X_petr4_train_norm).tolist(),
#             np.array(hasRise_petr4_train).tolist())

automl.fit(X_petr4_train_norm.to_numpy().astype(float),hasRise_petr4_train.to_numpy(dtype=np.int64).astype(int))

In [None]:
X_petr4_train_norm.to_numpy()

In [None]:
X.dtype

In [None]:
np.array(X_petr4_train_norm).tolist()[1][1]

In [None]:
np.array(X_petr4_train_norm).astype(float).dtype

In [None]:
np.array(X_petr4_train_norm).astype(float)

In [None]:
np.array(X_petr4_test_norm)

In [None]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import numpy as np

# Carregar o conjunto de dados
iris = load_iris()
X = iris.data.astype(float)  # Converter os dados para float
y = iris.target.astype(float)  # Converter os dados para float

In [None]:
iris

In [None]:
X.dtype

In [None]:
y_pred = automl.predict(hasRise_petr4_test)

# Evaluate accuracy
accuracy = accuracy_score(hasRise_petr4_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print the final ensemble constructed by Auto-sklearn
print(automl.show_models())

# Print the statistics of Auto-sklearn
print(automl.sprint_statistics())