In [2]:
from river import tree
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
df_notoh = pd.read_pickle("./data_frames/dense_acs_mm_notoh.pkl")
cat_cols_sig = ['OCCP', 'SCHL', 'ST', 'JWTRNS', 'DRAT', 'COW', 'SEX', \
       'RELSHIPP', 'POBP', 'ENG', 'MAR', 'RAC1P'] # significant features from the earlier analysis
numeric_cols = ['WKHP', 'AGEP', 'PINCP']
print(len(cat_cols_sig), len(numeric_cols))
from utils import ordinal_encoder
df_notoh = ordinal_encoder(df_notoh, cat_cols_sig)
from sklearn.model_selection import train_test_split
rand_seed = 21 
X_train, X_test, y_train, y_test = train_test_split(df_notoh.drop('PINCP', axis=1), 
                                                    df_notoh['PINCP'], test_size = 0.2,
                                                    random_state = rand_seed)

12 3


In [6]:
X_train_diclist = X_train.to_dict(orient='records')

In [8]:
X_train_diclist[0]

{'OCCP': 34.0,
 'WKHP': 0.07142857142857142,
 'AGEP': 0.4683544303797468,
 'SCHL': 19.0,
 'ST': 39.0,
 'JWTRNS': 11.0,
 'DRAT': 0.0,
 'COW': 0.0,
 'SEX': 1.0,
 'RELSHIPP': 5.0,
 'POBP': 32.0,
 'ENG': 1.0,
 'MAR': 4.0,
 'RAC1P': 1.0}

In [30]:
hoeftree = tree.HoeffdingTreeRegressor(max_depth = 30) # river paakacge

In [31]:
for t in tqdm(range(X_train.shape[0])):
    hoeftree.learn_one(X_train_diclist[t], y_train.iloc[t])

100%|██████████| 1255350/1255350 [03:39<00:00, 5720.80it/s]


In [32]:
y_pred_test = np.empty(X_test.shape[0])
for t in range(X_test.shape[0]):
    y_pred_test[t] = hoeftree.predict_one(X_test_diclist[t])

In [None]:
from sklearn.metrics import r2_score

In [23]:
r2_score(y_test, y_pred_test) # with max depth possible = 13

0.456130042186772

In [29]:
r2_score(y_test, y_pred_test) # with max depth possible = 20

0.48491506950844876

In [33]:
r2_score(y_test, y_pred_test) # with max depth possible = 30

0.48921635058878454

In [35]:
y_pred_train = np.empty(X_train.shape[0])
for t in range(X_train.shape[0]):
    y_pred_train[t] = hoeftree.predict_one(X_train_diclist[t])
r2_score(y_train, y_pred_train)

0.49508084441591205

# Using Sklearn multi flow hoeffding tree regressor

In [4]:
from skmultiflow.trees import HoeffdingTreeRegressor
htreg_skmult = HoeffdingTreeRegressor()

In [5]:
X_train_np = X_train.to_numpy()

In [6]:
y_train_np = y_train.to_numpy()

In [7]:
for t in tqdm(range(X_train.shape[0])):
    X_val = X_train_np[t].reshape(1, -1)
    #predict
    htreg_skmult.partial_fit(X_val, [y_train_np[t]])

100%|██████████| 1255350/1255350 [02:32<00:00, 8232.14it/s] 


In [9]:
y_pred_test_skflow = htreg_skmult.predict(X_test.to_numpy())
y_pred_train_skflow = htreg_skmult.predict(X_train.to_numpy())

In [11]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred_test_skflow), r2_score(y_train, y_pred_train_skflow)

(0.4964391201517464, 0.5051641597192292)

# Comparing wrt linear regression, needs one hot

In [3]:
df_oh = pd.read_pickle("./data_frames/dense_acs_mm_oh.pkl")
# cat_cols_sig = ['OCCP', 'SCHL', 'ST', 'JWTRNS', 'DRAT', 'COW', 'SEX', \
#        'RELSHIPP', 'POBP', 'ENG', 'MAR', 'RAC1P'] # significant features from the earlier analysis
# numeric_cols = ['WKHP', 'AGEP', 'PINCP']
from sklearn.model_selection import train_test_split
rand_seed = 21
X_train, X_test, y_train, y_test = train_test_split(df_oh.drop('PINCP', axis=1), 
                                                    df_oh['PINCP'], test_size = 0.2,
                                                    random_state = rand_seed)

In [None]:
from river import linear_model
lr = linear_model.LinearRegression()

for t in tqdm(range(X_train.shape[0])):
    lr.learn_many(X_train.iloc[[t]], y_train.iloc[[t]])

 75%|███████▍  | 938399/1255350 [14:45<04:55, 1073.39it/s]

In [12]:
from sklearn.metrics import r2_score

y_pred_lr_train = lr.predict_many(X_train)
y_pred_lr_test = lr.predict_many(X_test)


r2_score(y_train, y_pred_lr_train), r2_score(y_test, y_pred_lr_test) # r2 score on train and test

(0.41386102353173837, 0.4148610401953663)

In [32]:
X_train.shape, X_test.shape

((1255350, 897), (313838, 897))

In [10]:
from river import linear_model
ridge = linear_model.LinearRegression(l2 = 1.0)

for t in tqdm(range(X_train.shape[0])):
    ridge.learn_many(X_train.iloc[[t]], y_train.iloc[[t]])

100%|██████████| 1255350/1255350 [22:07<00:00, 946.00it/s]


In [13]:
y_pred_ridge_train = ridge.predict_many(X_train)
y_pred_ridge_test  = ridge.predict_many(X_test)

r2_score(y_train, y_pred_ridge_train), r2_score(y_test, y_pred_ridge_test) # r2 score on train and test

(0.06503927738704895, 0.06628872961371568)

# R2 score of oridge saved models, not principled because uses whole data, 

but still r2 score poor on train and test, on shuffled with seed 21

In [12]:
import joblib
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/allstates2021_20230724-220046_shuffled__undropped_.pkl') # bad for oridge r2

#Seed 42 shuffle
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/allstate_shuffle_seed42_20230727-014004_shuffled__undropped_.pkl') #ok
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/allstate_shuffle_seed42_20230727-031153_shuffled__dropped_.pkl') #ok

# RAC1P sorted
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/sort_by_racecounts_20230726-003542_unshuffled__undropped_.pkl') #bad
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/sort_by_racecounts_20230726-223352_unshuffled__dropped_.pkl') #vbad

#SEX sorted
# oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/sort_by_sex_20230726-020646_unshuffled__undropped_.pkl') #ok
oridge_imp = joblib.load('./onehot_encoded/models/oridge_implementable/sort_by_sex_20230727-000414_unshuffled__dropped_.pkl')  #terrible

In [155]:
type(oridge_imp.model)

river.linear_model.lin_reg.LinearRegression

In [156]:
y_pred_origde_testall = oridge_imp.model.predict_many(X_test)
y_pred_origde_trainall = oridge_imp.model.predict_many(X_train)
# even after seing all data
r2_score(y_test, y_pred_origde_testall), r2_score(y_train, y_pred_origde_trainall)

(-0.09417574434969489, -0.09689756202858923)

In [158]:
len(oridge_imp.loss_tarr)

1569188

## r2 score of least squares model


In [148]:
import joblib
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/allstates2021_20230724-220046_shuffled__undropped_.pkl')

#Seed 42 shuffle
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/allstate_shuffle_seed42_20230727-014004_shuffled__undropped_.pkl') 
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/allstate_shuffle_seed42_20230727-031153_shuffled__dropped_.pkl')

# RAC1P sorted
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/sort_by_racecounts_20230726-003542_unshuffled__undropped_.pkl')
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/sort_by_racecounts_20230726-223352_unshuffled__dropped_.pkl')

#SEX sorted
# best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/sort_by_sex_20230726-020646_unshuffled__undropped_.pkl')
best_ls_groups = joblib.load('./onehot_encoded/models/bestsqloss/sort_by_sex_20230727-000414_unshuffled__dropped_.pkl')

In [149]:
len(best_ls_groups.experts)

12

In [150]:
best_ls_groups.loss_experts_arr[-1].shape, X_train.shape[0] + X_test.shape[0]

((1569188, 1), 1569188)

In [151]:
type(best_ls_groups.experts[-1])

river.linear_model.lin_reg.LinearRegression

In [152]:
y_pred_ls_testall = best_ls_groups.experts[-1].predict_many(X_test)
y_pred_ls_trainall = best_ls_groups.experts[-1].predict_many(X_train)

In [153]:
r2_score(y_test, y_pred_ls_testall), r2_score(y_train, y_pred_ls_trainall)

(0.42247855699411296, 0.4201866569567727)

# Takeaway:

- Online ridge vs Batch ridge big difference
- Online LS vs Batch LS still preserves r2 scores (upto a degree, doesnt become negative)
- Hoeffding tree even better

# MLP regressor on label encoded data

In [8]:
X_train.shape[1] * 0.6

538.1999999999999

In [15]:
from river import neural_net as nn
from river import optim
mlp_online = nn.MLPRegressor(
            hidden_dims=(500,),
            activations=(
            nn.activations.Identity,
            nn.activations.ReLU,
            nn.activations.Identity
        )
)

In [None]:
for t in tqdm(range(X_train.shape[0])):
    mlp_online.learn_many(X_train.iloc[[t]], y_train.iloc[[t]])

 98%|█████████▊| 1234292/1255350 [58:14<00:53, 390.15it/s] 

In [None]:
import pickle
with open('./onehot_encoded/models/mlp_reg/single_hidden500_mlp.pkl', 'wb') as f:
    pickle.dump(mlp_online, f)

In [21]:
import joblib
sk_mlp = joblib.load('./onehot_encoded/models/mlp_sklearnbatch/sksingle_hidden500_mlp.pkl')

In [23]:
X_test.shape

(313838, 897)

In [24]:
sk_mlp.predict(X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
