In [None]:
# Mount Google Drive and define the locations of the HDF5 stores used in this notebook
import json
from pathlib import Path

from google.colab import drive

MOUNT_POINT = Path('/content/drive')
drive.mount(str(MOUNT_POINT))

# Absolute paths: they keep resolving even if the working directory is changed later
project_dir = MOUNT_POINT / 'MyDrive' / 'ML4T'
data_path = project_dir / 'data.h5'
assets_path = project_dir / 'assets.h5'

Mounted at /content/drive


# Predicting stock price moves with Logistic Regression

## Imports & Settings

In [None]:
# Data manipulation and analysis
from pathlib import Path
import sys, os
from time import time
import pandas as pd
import numpy as np

# Statistical analysis
from scipy.stats import spearmanr

# Machine learning
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Custom utilities
# Folder on the mounted Google Drive that contains utils.py
google_drive_path = '/content/drive/MyDrive/ML4T'
# Guarded so that re-running this cell does not pile up duplicate sys.path entries
if google_drive_path not in sys.path:
    sys.path.append(google_drive_path)

# MultipleTimeSeriesCV is imported from utils.py in that folder
from utils import MultipleTimeSeriesCV

# Set seaborn style and create a shorthand for multi-index slicing
sns.set_style('darkgrid')
idx = pd.IndexSlice

# Define constants
YEAR = 252  # Number of trading days in a year

## Load Data

In [None]:
# Load the model data from the HDF5 store.
# The store is opened read-only: this cell never writes, and read-only mode
# avoids creating an empty file when data_path points to the wrong location.
with pd.HDFStore(data_path, mode='r') as store:
    data = (store['model_data']
            .dropna()
            .drop(columns=['open', 'close', 'low', 'high']))

# Remove every column whose name contains 'year' or 'lag'
data = data.drop(columns=[c for c in data.columns if 'year' in c or 'lag' in c])


In [None]:
### Select Investment Universe

# Keep only the rows whose dollar-volume rank is below 100
in_universe = data['dollar_vol_rank'] < 100
data = data.loc[in_universe]

In [None]:
### Create Model Data

# Targets: every column whose name contains 'target'
y = data.filter(like='target')

# Features: all remaining columns except the volume-related ones
non_feature_cols = [*y.columns, 'dollar_vol', 'dollar_vol_rank', 'volume']
X = data.drop(columns=non_feature_cols)

## Logistic Regression

### Define cross-validation parameters

In [None]:
# Cross-validation window sizes and forecast horizon
train_period_length = 63
test_period_length = 10
lookahead = 1

# Number of test windows of test_period_length that fit into three years of trading days
n_splits = 3 * YEAR // test_period_length

# Time-series splitter imported from utils.py
cv = MultipleTimeSeriesCV(
    n_splits=n_splits,
    train_period_length=train_period_length,
    test_period_length=test_period_length,
    lookahead=lookahead,
)


In [None]:
# Name of the target column that matches the chosen lookahead
target = f'target_{lookahead}d'

# Binary label for the classifier: 1 where the target is strictly positive, else 0
y['label'] = y[target].gt(0).astype(int)
y['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,56610
0,53138


In [None]:
# Grid for the regularization parameter C: 11 log-spaced values from 1e-5 to 1e5
Cs = np.logspace(start=-5, stop=5, num=11)

# Column names of the per-fold results DataFrame
cols = 'C date auc ic pval'.split()

### Run cross-validation

In [None]:
%%time
# Initialize dictionaries and lists to store results
log_coeffs, log_scores, log_predictions = {}, [], []

# Loop through different regularization parameter values
for C in Cs:
    print(C)

    # Create logistic regression model
    model = LogisticRegression(C=C,
                               fit_intercept=True,
                               random_state=42,
                               n_jobs=-1)

    # Create pipeline with standardization and model
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)])

    ics = aucs = 0
    start = time()
    coeffs = []

    # Perform cross-validation
    for i, (train_idx, test_idx) in enumerate(cv.split(X), 1):
        # Split data into train and test sets
        X_train, y_train, = X.iloc[train_idx], y.label.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.label.iloc[test_idx]
        actuals = y[target].iloc[test_idx]

        # Skip if not enough data or unique labels
        if len(y_test) < 10 or len(np.unique(y_test)) < 2:
            continue

        # Fit model and make predictions
        pipe.fit(X=X_train, y=y_train)
        y_score = pipe.predict_proba(X_test)[:, 1]

        # Calculate performance metrics
        auc = roc_auc_score(y_score=y_score, y_true=y_test)
        ic, pval = spearmanr(y_score, actuals)

        # Store results
        log_predictions.append(y_test.to_frame('labels').assign(
            predicted=y_score, C=C, actuals=actuals))
        date = y_test.index.get_level_values('date').min()
        log_scores.append([C, date, auc, ic * 100, pval])
        coeffs.append(pipe.named_steps['model'].coef_)

        # Update cumulative metrics
        ics += ic
        aucs += auc

        # Print progress every 10 iterations
        if i % 10 == 0:
            print(f'\t{time()-start:5.1f} | {i:03} | {ics/i:>7.2%} | {aucs/i:>7.2%}')

    # Store average coefficients for this C value
    log_coeffs[C] = np.mean(coeffs, axis=0).squeeze()

1e-05
	 10.7 | 010 |  -1.07% |  49.93%
	 21.4 | 020 |   1.21% |  51.41%
	 32.2 | 030 |   2.19% |  51.69%
	 42.9 | 040 |   2.88% |  51.74%
	 53.8 | 050 |   3.89% |  52.38%
	 64.7 | 060 |   3.84% |  52.21%
	 75.7 | 070 |   4.66% |  52.58%
0.0001
	 10.8 | 010 |  -1.02% |  50.02%
	 21.6 | 020 |   1.32% |  51.46%
	 32.4 | 030 |   2.49% |  51.86%
	 43.1 | 040 |   2.91% |  51.77%
	 53.8 | 050 |   3.95% |  52.43%
	 65.2 | 060 |   3.92% |  52.25%
	 76.0 | 070 |   4.78% |  52.64%
0.001
	 10.8 | 010 |  -0.98% |  50.00%
	 21.7 | 020 |   1.45% |  51.42%
	 32.7 | 030 |   2.92% |  52.00%
	 43.4 | 040 |   2.73% |  51.66%
	 54.2 | 050 |   3.91% |  52.40%
	 65.2 | 060 |   4.04% |  52.27%
	 76.0 | 070 |   4.89% |  52.67%
0.01
	 11.0 | 010 |  -0.72% |  50.09%
	 21.8 | 020 |   1.29% |  51.24%
	 32.8 | 030 |   2.84% |  51.85%
	 43.7 | 040 |   2.61% |  51.53%
	 54.6 | 050 |   3.88% |  52.32%
	 65.4 | 060 |   4.05% |  52.18%
	 76.3 | 070 |   4.75% |  52.52%
0.1
	 11.0 | 010 |  -0.70% |  50.16%
	 22.0 | 020 | 

### Evaluate Results

In [None]:
# Convert the raw cross-validation results to DataFrames and save them to the HDF5 store.
# Each conversion is guarded by a type check because this cell rebinds the names
# to DataFrames: on a re-run, pd.DataFrame(log_coeffs, index=X.columns) would
# rebuild the coefficients by reindexing on labels that do not match, and
# pd.concat(log_predictions) would raise a TypeError because it no longer
# receives a list.
if isinstance(log_scores, list):
    log_scores = pd.DataFrame(log_scores, columns=cols)
log_scores.to_hdf(data_path, key='logistic/scores')

if isinstance(log_coeffs, dict):
    # One row per C value, one column per feature
    log_coeffs = pd.DataFrame(log_coeffs, index=X.columns).T
log_coeffs.to_hdf(data_path, key='logistic/coeffs')

if isinstance(log_predictions, list):
    log_predictions = pd.concat(log_predictions)
log_predictions.to_hdf(data_path, key='logistic/predictions')

# Read scores back from the HDF5 store
log_scores = pd.read_hdf(data_path, key='logistic/scores')

# Display information about the scores DataFrame
log_scores.info()



TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [None]:
# Summary statistics of the per-fold AUC, one row per value of C
auc_by_C = log_scores.groupby('C')['auc']
auc_by_C.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1e-05,75.0,0.524324,0.036288,0.426532,0.501266,0.522529,0.542917,0.622403
0.0001,75.0,0.524953,0.035701,0.43864,0.503316,0.523812,0.541848,0.634606
0.001,75.0,0.525362,0.036783,0.454459,0.503362,0.52276,0.545295,0.658996
0.01,75.0,0.524191,0.036936,0.4473,0.501095,0.520992,0.548866,0.658722
0.1,75.0,0.522704,0.036847,0.441199,0.4979,0.519522,0.54616,0.645193
1.0,75.0,0.52229,0.036786,0.439031,0.498218,0.51871,0.546174,0.640843
10.0,75.0,0.522243,0.036755,0.438842,0.498171,0.518579,0.546178,0.640034
100.0,75.0,0.522239,0.036747,0.438804,0.498177,0.518587,0.546184,0.640004
1000.0,75.0,0.522239,0.036751,0.4388,0.498181,0.518587,0.546188,0.640009
10000.0,75.0,0.522239,0.036748,0.438804,0.498177,0.518587,0.546192,0.639966


### Plot Validation Scores

In [None]:
# Function to plot IC distribution
def plot_ic_distribution(df, ax=None):
    """Plot the distribution of the values in ``df.ic`` with mean/median annotation.

    Parameters
    ----------
    df : object exposing an ``ic`` attribute with ``mean()`` and ``median()``
        (e.g. a DataFrame with an 'ic' column).
    ax : matplotlib Axes, optional
        Axes to draw on; when omitted, the current Axes is used.

    Returns
    -------
    None
    """
    # histplot with a KDE overlay on a density scale replaces the deprecated
    # sns.distplot; with ax=None it draws on the current Axes and returns it
    ax = sns.histplot(df.ic, kde=True, stat='density', ax=ax)
    mean, median = df.ic.mean(), df.ic.median()
    # Dashed reference line at zero
    ax.axvline(0, lw=1, ls='--', c='k')
    # Annotation is positioned in axes-fraction coordinates (top-left corner)
    ax.text(x=.05, y=.9, s=f'Mean: {mean:8.2f}\nMedian: {median:5.2f}',
            horizontalalignment='left',
            verticalalignment='center',
            transform=ax.transAxes)
    ax.set_xlabel('Information Coefficient')
    # Act on the figure that owns `ax` rather than on whichever figure is current
    sns.despine(fig=ax.figure)
    ax.figure.tight_layout()

# Two panels: AUC as a function of C (left) and the IC distribution at the best C (right)
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
auc_ax, ic_ax = axes

# Left panel: mean and median AUC for each C
sns.lineplot(data=log_scores, x='C', y='auc', estimator=np.mean, label='Mean', ax=auc_ax)
by_alpha = log_scores.groupby('C')['auc'].agg(['mean', 'median'])
best_auc = by_alpha['mean'].idxmax()       # C with the highest mean AUC
best_median = by_alpha['median'].idxmax()  # C with the highest median AUC
by_alpha['median'].plot(logx=True, ax=auc_ax, label='Median', xlim=(10e-6, 10e5))

# Vertical markers at the two best C values
auc_ax.axvline(best_auc, ls='--', c='k', lw=1, label='Max. Mean')
auc_ax.axvline(best_median, ls='-.', c='k', lw=1, label='Max. Median')
auc_ax.legend()
auc_ax.set(ylabel='AUC', xscale='log', title='Area Under the Curve')

# Right panel: IC distribution for the C with the highest mean AUC
best_scores = log_scores[log_scores.C == best_auc]
plot_ic_distribution(best_scores, ax=ic_ax)
ic_ax.set_title('Information Coefficient')

# Overall title; leave room for it above the panels
fig.suptitle('Logistic Regression', fontsize=14)
sns.despine()
fig.tight_layout()
fig.subplots_adjust(top=.9)