In [None]:
import pandas as pd

In [None]:
# Load the cleaned dataset from parquet; later cells read the 'target'
# column from this frame.
df = pd.read_parquet(
    path='../../data/data_cleaned_target.parquet',
    engine='pyarrow',
)

In [None]:
from sklearn.model_selection import train_test_split

# Label vector first, then the predictors: both 'pct_change_close' and
# 'target' are removed from the feature matrix.
y = df['target']
X = df.drop(
    columns=['pct_change_close','target'],
)
# shuffle=False keeps the original row order, so the first 70% of the rows
# form the training set and the last 30% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifier: logistic regression with scikit-learn's default
# settings, fitted on the training split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
import joblib
# Write the fitted baseline model to disk.
joblib.dump(value=model, filename='../../data/regression_model.pkl')

# Write the exact train/test split to disk, one file per object.
joblib.dump(value=X_train, filename='../../data/X_train.pkl')
joblib.dump(value=X_test, filename='../../data/X_test.pkl')
joblib.dump(value=y_train, filename='../../data/y_train.pkl')
joblib.dump(value=y_test, filename='../../data/y_test.pkl')

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print a set of classification metrics for a fitted classifier.

    The function only prints; it returns ``None``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Feature matrix passed unchanged to ``model``.
    y_test
        True labels aligned with ``X_test``.

    Notes
    -----
    ROC AUC is computed from column 1 of ``predict_proba`` and
    precision/recall/F1 use scikit-learn's default averaging, so the
    function presumably targets a binary problem -- confirm before using
    it on multiclass targets.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        f1_score,
        mean_squared_error,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    # ROC AUC is threshold-free, so it is fed the score of the second class
    # rather than the hard predictions.
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    # Hard class predictions drive every remaining metric.
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Mean squared error between true and predicted labels; if the labels are
    # 0/1 this is the same number as the misclassification rate.
    l2_error = mean_squared_error(y_test, y_pred)

    print(f"ROC AUC: {roc_auc}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print(f"L2 error: {l2_error}")
    # The label goes on its own line: printing it in front of the report would
    # shift the report's header row and break its column alignment.
    print("Report")
    print(classification_report(y_test, y_pred))

In [None]:
# Metrics of the logistic-regression baseline on the test split.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Class predictions of the baseline model for the test split.
y_pred = model.predict(X=X_test)
# Fitted coefficients of the logistic regression, printed as-is.
print(
    "Coefficients: \n",
    model.coef_,
)

In [None]:
from sklearn.decomposition import PCA

# Project the features onto 20 principal components.
# The projection is fitted on the training split only and then applied to the
# test split, so the test rows do not influence the components.
# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver, so re-running the notebook reproduces the same projection.
pca = PCA(n_components=20, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Fraction of the total variance captured by each component.
print("'Variance' for each component:", pca.explained_variance_ratio_)

In [None]:
# One row per principal component, one column per original feature; each
# cell is that feature's coefficient in the component.
components = pd.DataFrame(
    data=pca.components_,
    columns=X.columns,
)
print("Principal components with coefficients:")
components

In [None]:
# Rank the original features by how strongly they contribute to the first
# principal component. The magnitude of a coefficient measures the
# contribution and its sign only the direction, so the sort key is the
# absolute value; the signed coefficients themselves are kept in the result.
pc1_sorted = components.iloc[0].sort_values(
    key=lambda loadings: loadings.abs(),
    ascending=False,
)
print('Most important parameters for the first component')
pc1_sorted

In [None]:
# Random Forest 
from sklearn.ensemble import RandomForestClassifier
# 100-tree random forest with a fixed seed, trained on the PCA-projected
# training features.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf.fit(X=X_train_pca, y=y_train)

In [None]:
# Metrics of the random forest on the PCA-projected test split.
evaluate_model(model=rf, X_test=X_test_pca, y_test=y_test)

In [None]:
# Persist the random forest together with the fitted PCA transformer: the
# forest was trained on PCA-projected features, so raw features have to go
# through this same `pca` object before the saved model can score them.
joblib.dump(rf,'../../data/rf_model.pkl')
joblib.dump(pca, '../../data/pca_20.pkl')
# Persist the projected train/test features as well.
joblib.dump(X_train_pca, '../../data/X_train_pca_20.pkl')
joblib.dump(X_test_pca, '../../data/X_test_pca_20.pkl')