In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import glob, os, random
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from scipy.signal import savgol_filter
from scipy import interpolate
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
np.random.seed( 0 )
sns.set_style( "darkgrid" )


In [None]:
df = pd.read_parquet('C:/Users/lselig/selig-fa/finance/.data/fundamental_analysis_neglog.parquet')
df = df[(df.period == "Q2") & (df.calendarYear == "2022")]
df.replace([np.inf, -np.inf], np.nan, inplace = True)
df.dropna(axis=0, inplace = True)
df = df.drop_duplicates()
targets = df.wk_curr_to_next_pct_inc.values

targets_binary = targets >= 1.0 # bools
targets_binary = np.array(targets_binary, dtype = "int")
plt.close()
plt.hist(targets_binary)
plt.show()


print(targets_binary)
tickers = df.symbol.values
periods = df.period.values
calendarYears = df.calendarYear.values
date = df.date.values
qualitative_data = np.array((tickers, periods, calendarYears, date)).T
df = df.drop(columns = ["period", "calendarYear", "symbol", "date", "open_next", "wk_curr_to_next_pct_inc"])
features = df.to_numpy()
print(targets.shape, features.shape)

In [None]:
scaler = StandardScaler()
features =  scaler.fit_transform(features)
features = np.clip(features, -5, 5)
print(features.shape)
n_features = features.shape[1]
print(n_features)
%matplotlib widget 
for i in range(n_features):
    print(i, np.nanmin(features[:, i]), np.nanmax(features[:, i]))
    sns.kdeplot(features[:, i])
plt.title("Distribution of final features")

In [None]:
print(list(df))
print(features.shape, targets.shape)


In [None]:
pca = PCA(n_components=2)
pca.fit(features.T)
print(pca.explained_variance_ratio_)
print(np.sum(pca.explained_variance_ratio_))
# print(pca.singular_values_)
# print(components)
# plt.hist2d(components[:, 0], components[:, 1], bins = 50)
components = pca.components_.T
print(components.shape)
# components[0,:]
plt.close()
plt.hist2d(components[:, 0], components[:, 1], bins = 60)
plt.show()
# plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation

clst = KMeans(n_clusters=3, random_state=0).fit(components)
# clst = AffinityPropagation( random_state=0).fit(components)
labels = clst.labels_.reshape(-1, 1)
pca_and_labels = np.hstack((components, labels))
print(pca_and_labels.shape)
clst.labels_.shape
fig, axs = plt.subplots(1, 2, figsize = (12, 8))
for g in np.unique(labels):
    idxs = np.where(pca_and_labels[:, 2] == g)[0]
    cluster = pca_and_labels[idxs]
    for i, c in enumerate(cluster):
        print(c[0], c[1], qualitative_data[:, 0][idxs[i]])
        axs[0].text(c[0], c[1], s = qualitative_data[:, 0][idxs[i]])
    axs[0].scatter(cluster[:, 0],  cluster[:, 1], label = g)
axs[0].set_xlabel("PCA1")
axs[0].set_ylabel("PCA2")
axs[1].hist2d(pca_and_labels[:, 0], pca_and_labels[:, 1], bins = 40)
axs[1].set_xlabel("PCA1")
axs[1].set_ylabel("PCA2")
axs[0].legend()
plt.show()

    
# plt.scatter(pca_and_labels[:, 0], pca_and_labels[: ,1], group = )
# plt.close()
# plt.hist(kmeans.labels_)
# plt.show()


In [None]:
# the tickers where PCA2 is an outlier are all related to Federal National Mortgage Association (FNMA)
# not sure if it can be exploited
interesting = np.argwhere(pca_and_labels[: ,1] >= 0.08)
interesting_tickers = tickers[interesting]
# print(interesting_tickers)

interesting = np.argwhere(pca_and_labels[: ,1] <= -0.0365)
interesting_tickers = tickers[interesting]
print(interesting_tickers)

In [None]:

interesting = np.argwhere(pca_and_labels[: ,0] >= 0.0055)
interesting_tickers = qualitative_data[interesting]
print(interesting_tickers)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

plt.close()
# good = np.where(targets <= 100)[0]
corrs = [] 
for i in range(n_features):
    f = features[:, i]
    t = targets
    r = np.corrcoef(f, t)[0, 1]
    corrs.append(r)

sns.kdeplot(corrs)

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size = 0.2, random_state = 42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_train)
print(mean_absolute_error(y_pred, y_train))
print(r2_score(y_pred, y_train))



In [29]:
clf = RandomForestClassifier(max_depth=4, random_state=0)
print(targets_binary)
X_train, X_test, y_train, y_test = train_test_split(features, targets_binary, test_size = 0.2, random_state = 42)
print(y_train)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)
print(classification_report(y_pred, y_test))

[1 1 0 ... 1 1 0]
[0 1 0 ... 0 0 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
              precision    recall  f1-score   support

           0       0.04      0.80      0.07         5
           1       0.99      0.61      0.76       279

    accuracy                           0.61       284
   macro avg       0.51      0.70      0.41       284
weighted avg       0.98      0.61      0.74       284

