In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob, os, random
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Seed NumPy's global random generator so anything drawing from it is repeatable.
np.random.seed(0)
# Apply seaborn's "darkgrid" style to figures created from here on.
sns.set_style("darkgrid")

def my_covar(x, y):
    """Return the covariance of ``x`` and ``y`` computed as E[xy] - E[x] * E[y].

    All three expectations are delegated to ``my_mean``. ``x * y`` is evaluated
    as written, so the inputs must support element-wise multiplication
    (e.g. NumPy arrays or pandas Series, not plain lists).
    """
    mean_of_product = my_mean(x * y)
    product_of_means = my_mean(x) * my_mean(y)
    return mean_of_product - product_of_means

def my_var(x):
    """Return the population variance of ``x``, ignoring NaN entries.

    Parameters
    ----------
    x : array-like of numbers
        Values to summarise; NaN entries are skipped.

    Returns
    -------
    float
        Mean squared deviation from the mean, taken over the non-NaN entries
        only (divisor is the non-NaN count, i.e. no Bessel correction).
        NaN when ``x`` is empty or contains no non-NaN value.
    """
    values = np.asarray(x, dtype = float)
    n_valid = np.count_nonzero(~np.isnan(values))
    if n_valid == 0:
        return np.nan
    # The centre and the squared deviations are both averaged over the same
    # non-NaN count, so skipped entries cannot drag either term toward zero.
    centre = np.nansum(values) / n_valid
    return np.nansum((values - centre) ** 2) / n_valid
    
def my_mean(x):
    """Return the arithmetic mean of ``x``, ignoring NaN entries.

    Parameters
    ----------
    x : array-like of numbers
        Values to average; NaN entries are skipped.

    Returns
    -------
    float
        Sum of the non-NaN entries divided by how many of them there are.
        NaN when ``x`` is empty or contains no non-NaN value.
    """
    values = np.asarray(x, dtype = float)
    # Divide by the number of entries that actually contributed to the sum;
    # dividing by len(x) would count the skipped NaNs and bias the mean.
    n_valid = np.count_nonzero(~np.isnan(values))
    if n_valid == 0:
        return np.nan
    return np.nansum(values) / n_valid

def my_corr(x, y):
    """Return the correlation of ``x`` and ``y``.

    Computed as ``my_covar(x, y)`` divided by the product of the two standard
    deviations, each taken as the square root of ``my_var``.
    """
    covariance = my_covar(x, y)
    spread_x = np.sqrt(my_var(x))
    spread_y = np.sqrt(my_var(y))
    return covariance / (spread_x * spread_y)

def custom_r2(true, pred):
    """Return the coefficient of determination, ``1 - RSS / TSS``.

    RSS is the NaN-skipping sum of squared ``true - pred`` residuals; TSS is
    the NaN-skipping sum of squared deviations of ``true`` from ``my_mean(true)``.
    """
    residuals = true - pred
    rss = np.nansum(residuals ** 2)
    deviations = true - my_mean(true)
    tss = np.nansum(deviations ** 2)
    return 1 - (rss / tss)

def plot_price_over_time(df, symbol):
    """Plot stock price and market capitalisation over time for one symbol.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``symbol``, ``date``, ``stockPrice`` and
        ``marketCapitalization`` columns read below.
    symbol : str
        Value of the ``symbol`` column whose rows are plotted.

    Side effects
    ------------
    Shows a two-panel figure (price on top, market capitalisation below,
    sharing the date axis) and prints the ``my_corr`` correlation between the
    two series for the selected symbol's rows.
    """
    # Rebinds the local name only; the caller's frame is untouched.
    df = df[df["symbol"] == symbol].sort_values(by = ["date"])

    # One panel per plotted series, so no panel is left blank.
    fig, axs = plt.subplots(2, 1, figsize = (15, 9), sharex = True)

    axs[0].set_ylabel("Stock Price (USD)")
    axs[0].plot(df.date, df.stockPrice, ls = "--", lw = 2, color = "black", alpha = 0.8, label = "Price")
    axs[0].legend()

    axs[1].set_ylabel("Market Capitalization")
    # With a shared x-axis the label belongs on the bottom panel.
    axs[1].set_xlabel("Time")
    axs[1].plot(df.date, df.marketCapitalization)

    print(f"{my_corr(df.stockPrice, df.marketCapitalization) = }")
    plt.show()

def get_outlier_idxs(feature, feature_name, use_mad = True, use_quantile = False,
                     ignore_pct = 0.01, cutoff_value = 8.0):
    """Return the positions of outlying values in a 1-D numeric feature.

    Parameters
    ----------
    feature : array-like of numbers
        Values of a single feature; NaN entries are never flagged in MAD mode.
    feature_name : str
        Label used only in the printed summary line.
    use_mad : bool, default True
        Flag values whose modified z-score (median/MAD based) has absolute
        value >= ``cutoff_value``. Takes precedence over ``use_quantile``.
    use_quantile : bool, default False
        Used when ``use_mad`` is False: flag values at or beyond the
        ``ignore_pct`` and ``1 - ignore_pct`` quantiles.
    ignore_pct : float, default 0.01
        Tail fraction for quantile mode.
    cutoff_value : float, default 8.0
        Modified z-score threshold for MAD mode.

    Returns
    -------
    numpy.ndarray
        Integer positions (into ``feature``) of the flagged values.

    Raises
    ------
    ValueError
        If neither ``use_mad`` nor ``use_quantile`` is set.

    Side effects
    ------------
    Prints one summary line with the number and percentage of flagged values.
    """
    if not (use_mad or use_quantile):
        raise ValueError("get_outlier_idxs needs use_mad or use_quantile to be True")

    values = np.asarray(feature, dtype = float)
    n_values = values.shape[0]

    if n_values == 0:
        # Nothing to score; also avoids dividing by zero in the summary line.
        outliers = np.array([], dtype = int)
    elif use_mad:
        magic_c = 0.6745  # scaling constant of the modified z-score
        center = np.nanmedian(values)
        abs_dev = np.abs(values - center)
        mad = np.nanmedian(abs_dev)
        if mad > 0:
            mi_feature = (magic_c * (values - center)) / mad
        else:
            # MAD is zero when more than half of the values equal the median.
            # Dividing by it would turn every other value into +/-inf and flag
            # it, so scale by the mean absolute deviation instead.
            mean_ad = np.nanmean(abs_dev)
            if mean_ad > 0:
                mi_feature = (values - center) / (1.253314 * mean_ad)
            else:
                # Constant (or all-NaN) feature: no value deviates.
                mi_feature = np.zeros_like(values)
        outliers = np.where(np.abs(mi_feature) >= cutoff_value)[0]
    else:
        lower = np.nanquantile(values, ignore_pct)
        upper = np.nanquantile(values, 1 - ignore_pct)
        outliers = np.where((values >= upper) | (values <= lower))[0]

    pct = len(outliers) / n_values * 100 if n_values else 0.0
    print(f"{feature_name} Found {len(outliers)} outliers out of {n_values} -- {pct: .2f}%")

    return outliers
    
# Small toy vectors for sanity-checking the hand-rolled statistics above
# against their NumPy / scikit-learn counterparts.
x = np.array([123, 4, 1, 2, 4, 2])
y = np.array([1, 2, 3, 4, 5, 6])

# Manual spot checks (uncomment to run):
# print(my_var(x))
# print(my_corr(x, y))

# print(np.corrcoef(x, y))
# print(r2_score(x, y))
# print(custom_r2(x, y))
# Reminder: corr = covar(x, y) / (std(x) * std(y))

In [None]:
# Load the ratios table.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
df = pd.read_parquet("/Users/lselig/selig-fa/finance/.data/evs_ratios.parquet")
# Optional filters, currently disabled:
# df["year"] = pd.DatetimeIndex(df["date"]).year
# df = df[df.symbol.isin(["AAPL", "GOOGL", "MSFT", "GME", "A", "QQQ", "AMZN", "TSLA"])]
# df = df[df.year >= 2015]

# Keep rows whose stock price lies in [2, 1000] (both bounds inclusive).
df = df[(df.stockPrice >= 2) & (df.stockPrice <= 1000)]

# Drop columns with too many missing values, then drop any row that still has one.
max_na_per_column = 30000
remove_me = []
for col in list(df):
    # The second .sum() is a no-op on a scalar count; it only matters if a
    # column label is duplicated and df[col] comes back as a frame.
    num_na = df[col].isna().sum().sum()
    print(col, num_na)
    if(num_na > max_na_per_column):
        remove_me.append(col)

df = df.drop(columns = remove_me)
df = df.dropna()
print(df.shape)
# plt.scatter(df.stockPrice * df.numberOfShares, df.marketCapitalization)
# plt.show()

# plot_price_over_time(df, "MSFT")

meta_cols = ["year", "symbol", "date", "quarter", "cik"]
drop_me_experimental = ["priceEarningsToGrowthRatio", "numberOfShares", 
                        "quickRatio", "daysOfSalesOutstanding", 
                        "effectiveTaxRate", "freeCashFlowOperatingCashFlowRatio"]
# errors = "ignore": a listed column may already be gone (removed by the NaN
# threshold above) or may never have existed ("year" is only created by the
# disabled line near the top); neither case should abort the cell.
df = df.drop(columns = meta_cols + drop_me_experimental, errors = "ignore")

# `features` is the same object as `df` here, so it still contains stockPrice.
features = df
# Columns that later outlier screening should skip.
ignore_me = ["buySellRatio", "totalBought", "totalSold",
             "averageBought", "averageSold", "pPurchases",
             "sSales", "purchases", "sales"]
# ignore_me = ignore_me + ignore_me_experimental

In [None]:
# Vote on outlier rows: a row is removed once `lives` or more screened
# features flag it.
# NOTE: this cell rebinds `df` and `features`, so re-running it without
# re-running the loading cell operates on already-filtered data.
outlier_idxs = []        # one array of flagged row positions per screened feature
majority_outliers = {}   # row position -> number of features that flagged it
for feature in features:
    # print(f"{feature = } -- {np.corrcoef(features[feature].values, labels)[0, 1]:.4f}")
    if(feature in ignore_me):
        continue
    result = get_outlier_idxs(features[feature].values, feature)
    for idx in result:
        majority_outliers[idx] = majority_outliers.get(idx, 0) + 1

    print(result)
    outlier_idxs.append(result)

lives = 3
# Distribution of "how many features flagged this row" over all flagged rows.
plt.hist(list(majority_outliers.values()), bins = 50)
plt.show()
remove_me = [idx for idx, hits in majority_outliers.items() if hits >= lives]

# remove_me = set().union(*outlier_idxs)
print(f"Killing {len(remove_me)} rows out of {len(features)}")
bad_idx = list(remove_me)

# The flagged values are positions in the underlying arrays; resetting the
# index to 0..n-1 makes index labels coincide with those positions.
df = df.reset_index(drop = True)
bad_df = df.index.isin(bad_idx)
df = df[~bad_df]

print(df.shape)
labels = df.stockPrice
features = df.drop(columns = ["stockPrice"])
corr = features.corr()
sns.heatmap(corr, annot = False, xticklabels=False, yticklabels=False)
plt.show()
fig, axs = plt.subplots(2, 1, figsize = (15, 9))
axs[0].scatter(df.stockPrice, df.marketCapitalization)
axs[0].set_xlabel("stockPrice")
axs[0].set_ylabel("marketCapitalization")
# %matplotlib widget
# plt.hist(df.stockPrice)


In [None]:

# Per-feature diagnostics: print the correlation coefficient between the
# feature and the label, then show a raw scatter of the two.
for feature in features.columns:
    print(f"{feature = } -- {np.corrcoef(features[feature].values, labels)[0, 1]:.4f}")
    plt.scatter(x = features[feature], y = labels)
    plt.show()

In [None]:
# train_test_split is called without random_state, so its shuffle draws from
# NumPy's global generator; seeding it here makes the 80/20 split repeatable.
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size = 0.8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# print(y_train[:5])

# Ordinary least-squares fit on the training split, predictions on the held-out split.
reg = LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Eyeball the first 20 actual values against the first 20 predictions.
print(y_test[:20].values)

print(y_pred[:20])

my_r2 = r2_score(y_test, y_pred)
my_mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean *squared* error ...
my_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# ... whereas MAPE averages absolute relative errors and is reported as-is;
# taking a square root of it does not yield a meaningful metric.
my_mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"{my_r2 = }")
print(f"{my_mae = }")
print(f"{my_rmse = }")
print(f"{my_mape = }")

In [None]:
# Close the current figure (if any) so the histogram below starts on a fresh one.
plt.close()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# 2-D histogram of held-out labels (x) against model predictions (y),
# using 400 bins along each axis.
plt.hist2d(y_test, y_pred, bins = 400)
plt.show()