In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge, Lasso

In [2]:
df = pd.read_csv('monthly_averages.csv')
df

Unnamed: 0,symbol,Date,Avg Adj Close,Avg Volume,Monthly Return,3 Month Momentum,2 Month Moving Avg,10 Month Moving Avg,Skewness,Volatility,3 Month Volatility,12 Month Volatility,Sharpe Ratio,Next Month Return
0,A,1999-11,25.444636,13161436,0.000000,0.000000,0.000000,0.000000,0.480281,0.063538,0.000000,0.000000,0.000000,0
1,A,1999-12,30.812055,3041291,0.210945,0.000000,28.128345,0.000000,1.857995,0.061724,0.000000,0.000000,3.417564,0
2,A,2000-01,41.053274,2244041,0.332377,0.000000,35.932665,0.000000,-1.188809,0.040263,0.000000,0.000000,8.255103,0
3,A,2000-02,52.513094,1646900,0.279145,1.063818,46.783184,0.000000,0.606447,0.059944,0.060870,0.000000,4.656758,0
4,A,2000-03,74.481552,2507811,0.418342,1.417286,63.497323,0.000000,1.080254,0.113234,0.070237,0.000000,3.694485,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200860,ZTS,2023-07,175.995930,1989400,0.049379,0.025443,171.855183,161.956587,0.727128,0.018508,0.052237,0.048815,2.667901,0
200861,ZTS,2023-08,185.205267,1753765,0.052327,0.049342,180.600598,165.684622,0.141066,0.016494,0.058106,0.049115,3.172562,0
200862,ZTS,2023-09,181.356286,1550230,-0.020782,0.081340,183.280777,169.457315,0.266327,0.011811,0.041385,0.041455,-1.759546,0
200863,ZTS,2023-10,168.568036,1694281,-0.070515,-0.042205,174.962161,171.507564,-0.845039,0.012649,0.061790,0.045853,-5.574917,0


In [3]:
df.describe()

Unnamed: 0,Avg Adj Close,Avg Volume,Monthly Return,3 Month Momentum,2 Month Moving Avg,10 Month Moving Avg,Skewness,Volatility,3 Month Volatility,12 Month Volatility,Sharpe Ratio,Next Month Return
count,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0,200865.0
mean,45.322623,5303110.0,0.013165,0.042833,45.090075,43.2058,-0.023897,0.019376,0.057499,0.066453,inf,0.0
std,128.078817,25786630.0,0.095284,0.1893,127.329098,121.156521,0.666409,0.018754,0.069054,0.064911,,0.0
min,0.001808,0.0,-0.765625,-0.923403,0.0,0.0,-4.795832,0.0,0.0,0.0,-24.1044,0.0
25%,4.212618,552825.0,-0.026927,-0.042753,4.169622,3.765537,-0.434172,0.011578,0.027614,0.041252,-1.598279,0.0
50%,17.012024,1512555.0,0.013237,0.037379,16.933819,16.305596,-0.027008,0.016044,0.04617,0.057353,0.8798847,0.0
75%,45.138571,3810060.0,0.052495,0.119967,44.951126,43.424389,0.382981,0.022889,0.073013,0.080274,3.538928,0.0
max,6293.452441,1801161000.0,22.154362,25.335878,6247.696892,5820.298921,4.795832,5.746528,12.815429,6.410517,inf,0.0


In [3]:
df['Next Month Return'].value_counts()

0    200865
Name: Next Month Return, dtype: int64

# Problem 2

In [4]:
df['year'] = pd.to_datetime(df['Date']).dt.year
df['month'] = pd.to_datetime(df['Date']).dt.month
df.sort_values(['symbol', 'year', 'month'], inplace=True)
df.drop(['year', 'month'], axis=1, inplace=True)
df = df[(df['Date'] >= '2010-01') & (df['Date'] <= '2022-12')]

# Train-test split
train_data, test_data = train_test_split(df, test_size=0.2, random_state=6)

# Problem 3

In [7]:
features = df.columns[2:-1]
if not train_data[features].replace([np.inf, -np.inf], np.nan).notnull().all().all():
    train_data[features] = train_data[features].replace([np.inf, -np.inf], np.nan)

train_data = train_data.dropna()
scaler = StandardScaler()
train_data[features] = scaler.fit_transform(train_data[features])
if not test_data[features].replace([np.inf, -np.inf], np.nan).notnull().all().all():
    test_data[features] = test_data[features].replace([np.inf, -np.inf], np.nan)


test_data = test_data.dropna()
test_data[features] = scaler.transform(test_data[features])

# Problem 4

In [8]:
results_list = []
for feature in features:
    model = LinearRegression()
    model.fit(train_data[[feature]], train_data['Next Month Return'])
    train_preds = model.predict(train_data[[feature]])
    test_preds = model.predict(test_data[[feature]])

    # Calculate R-squared
    train_r2 = r2_score(train_data['Next Month Return'], train_preds)
    test_r2 = r2_score(test_data['Next Month Return'], test_preds)
    results_list.append({'Feature': feature, 'Train R-squared': train_r2, 'Test R-squared': test_r2})

results_df = pd.DataFrame(results_list)
results_df

Unnamed: 0,Feature,Train R-squared,Test R-squared
0,Avg Adj Close,1.0,1.0
1,Avg Volume,1.0,1.0
2,Monthly Return,1.0,1.0
3,3 Month Momentum,1.0,1.0
4,2 Month Moving Avg,1.0,1.0
5,10 Month Moving Avg,1.0,1.0
6,Skewness,1.0,1.0
7,Volatility,1.0,1.0
8,3 Month Volatility,1.0,1.0
9,12 Month Volatility,1.0,1.0


In [9]:
results_df.to_csv("results.csv", index=False)

# Problem 5

In [10]:
all_features = features
model_all_features = LinearRegression()
model_all_features.fit(train_data[all_features], train_data['Next Month Return'])

train_preds_all_features = model_all_features.predict(train_data[all_features])
test_preds_all_features = model_all_features.predict(test_data[all_features])

train_r2_all_features = r2_score(train_data['Next Month Return'], train_preds_all_features)
test_r2_all_features = r2_score(test_data['Next Month Return'], test_preds_all_features)

print(f"R-squared for Train Data (All Features): {train_r2_all_features}")
print(f"R-squared for Test Data (All Features): {test_r2_all_features}")

R-squared for Train Data (All Features): 1.0
R-squared for Test Data (All Features): 1.0


# Problem 6

In [11]:
squared_features = [f'{feature}_squared' for feature in features]
total_features = [*features, *squared_features]
for feature in features:
    train_data[f'{feature}_squared'] = train_data[feature] ** 2
    test_data[f'{feature}_squared'] = test_data[feature] ** 2

In [12]:
model_all_features_squared = LinearRegression()
model_all_features_squared.fit(train_data[total_features], train_data['Next Month Return'])

# predict
train_preds_all_features_squared = model_all_features_squared.predict(train_data[total_features])
test_preds_all_features_squared = model_all_features_squared.predict(test_data[total_features])

train_r2_all_features_squared = r2_score(train_data['Next Month Return'], train_preds_all_features_squared)
test_r2_all_features_squared = r2_score(test_data['Next Month Return'], test_preds_all_features_squared)

print(f"R-squared for Train Data (All Features Squared): {train_r2_all_features_squared}")
print(f"R-squared for Test Data (All Features Squared): {test_r2_all_features_squared}")

R-squared for Train Data (All Features Squared): 1.0
R-squared for Test Data (All Features Squared): 1.0


# Problem 7

In [13]:
alpha_value = 0.01

# Train Ridge regression model
ridge_model = Ridge(alpha=alpha_value)
ridge_model.fit(train_data[total_features], train_data['Next Month Return'])
train_preds_ridge = ridge_model.predict(train_data[total_features])
test_preds_ridge = ridge_model.predict(test_data[total_features])
train_r2_ridge = r2_score(train_data['Next Month Return'], train_preds_ridge)
test_r2_ridge = r2_score(test_data['Next Month Return'], test_preds_ridge)

# Train LASSO regression model
lasso_model = Lasso(alpha=alpha_value)
lasso_model.fit(train_data[total_features], train_data['Next Month Return'])
train_preds_lasso = lasso_model.predict(train_data[total_features])
test_preds_lasso = lasso_model.predict(test_data[total_features])
train_r2_lasso = r2_score(train_data['Next Month Return'], train_preds_lasso)
test_r2_lasso = r2_score(test_data['Next Month Return'], test_preds_lasso)

print(f"R-squared for Train Data (Ridge): {train_r2_ridge}")
print(f"R-squared for Test Data (Ridge): {test_r2_ridge}")
print(f"R-squared for Train Data (LASSO): {train_r2_lasso}")
print(f"R-squared for Test Data (LASSO): {test_r2_lasso}")

R-squared for Train Data (Ridge): 1.0
R-squared for Test Data (Ridge): 1.0
R-squared for Train Data (LASSO): 1.0
R-squared for Test Data (LASSO): 1.0


  model = cd_fast.enet_coordinate_descent(
