## Randomized search for hyperparameters

  - RandomForest
  - GaussianNB
  - Logistic Regression

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn import compose
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import numpy as np

### Get data

In [2]:
df = pd.read_csv('data/instacart.csv')

In [3]:
df = df.dropna() 

### Feature engineering / data processing

In [89]:
def data_clean(df):
    print('Getting dummies...')
    df = pd.get_dummies(df, columns=['department_id'], drop_first=True)
    
    # find total numbers of orders
#     print('Finding total number of orders...')
#     temp = df.groupby('user_id').agg({'order_number' : 'max'}).reset_index()
#     df = df.merge(temp, how='left', left_on='user_id', right_on='user_id')

    # weekend flag
    print('Creating weekend flag...')
    df['weekend'] = 0
    df.loc[(df['order_dow'] == 5) | (df['order_dow'] == 6),'weekend'] = 1
    
    # Department staples
    print('Encoding dept staples...')
    staples = {
        'dairy eggs' : 1,
        'produce' : 1,
        'pantry' : 0,
        'meat seafood' : 1,
        'bakery' : 0,
        'personal care' : 0,
        'snacks' : 0,
        'breakfast' : 0,
        'beverages' : 0,
        'deli' : 0,
        'household' : 0,
        'international' : 0,
        'dry goods pasta' : 1,
        'frozen' : 0,
        'canned goods' : 0,
        'babies' : 0,
        'pets' : 0,
        'alcohol' : 0,
        'bulk' : 0,
        'missing' : 0,
        'other' : 0
    }
    df.loc[:, 'department_is_staple'] = df['department'].apply(lambda x: staples[x])
    
    print('Encoding organic products...')
    df.loc[:, 'product_is_organic'] = 0
    df.loc[df.product_name.apply(lambda x: 'organic' in x.lower()), 'product_is_organic'] = 1
    
    print('Dropping columns...')
    df.drop(['product_name', 'aisle', 'department'], axis=1, inplace=True)
    
    print('Done.')
    
    return df

### Train-test split

Due to the huge number of rows (~37M), let's take a small subset of them just to test our randomized search code.

### Stratified shuffle split

Perform stratified train-test split of our dataset on a subset of the data.

**Subset data**

Randomly choose $n$ rows and perform feature eng and cleaning.

In [165]:
df_subset = df.sample(frac=0.008, replace=False, random_state=42)

In [166]:
X = data_clean(df_subset.drop(['reordered', 'eval_set'], axis=1))
y = df_subset['reordered']

Getting dummies...
Creating weekend flag...
Encoding dept staples...
Encoding organic products...
Dropping columns...
Done.


**Use stratified shuffling**

In [167]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
sss.get_n_splits(X, y)

1

In [168]:
for train_index, test_index in sss.split(X, y):
    print(len(train_index))
    print(len(test_index))

177749
76179


In [169]:
X_train = X.iloc[train_index, :]
y_train = y.iloc[train_index]
X_test = X.iloc[test_index, :]
y_test = y.iloc[test_index]

In [170]:
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

### RandomizedSearch on Logistic Regression

In [171]:
from sklearn.model_selection import RandomizedSearchCV

In [172]:
def make_random_cv(estimator, param_distr, cv=3, n_iter=5):
    """
    Steps:
    1. Define Estimator
    2. Define hyperparameter search space
    3. Instantiate RandomizedSearchCV
    """
    
    random_cv = RandomizedSearchCV(
                    estimator=estimator,
                    cv=cv,
                    param_distributions=param_distr,
                    n_iter=n_iter
                )
    
    return random_cv

In [176]:
rf_est = RandomForestClassifier(random_state=42)
param_distr = {
    'n_estimators' : [i * 50 for i in range(1, 5)],
    'max_features' : [i for i in range(1, 5)],
    'min_samples_leaf' : [100, 500, 1500, 5000]
}
rf_random_cv = make_random_cv(rf_est, param_distr)

**Fit and predict using the best estimator returned from randomsearch**

In [None]:
for i in range(3):
    rf_random_cv.fit(X_train, y_train)
    print(rf_random_cv.best_params_)
    y_pred = rf_random_cv.best_estimator_.predict(X_train)
    print(f1_score(y_train, y_pred, average='binary'))

{'n_estimators': 100, 'min_samples_leaf': 100, 'max_features': 4}
0.796953431838498
{'n_estimators': 100, 'min_samples_leaf': 500, 'max_features': 4}
0.7915235411075053


In [94]:
train_subset_size = 0.001
test_subset_size = 0.001

train_df = train_df.sample(frac=train_subset_size, replace=False, random_state=42)
test_df = test_df.sample(frac=test_subset_size, replace=False, random_state=42)

x_train = data_clean(train_df.drop(['reordered', 'eval_set'], axis=1))
x_test = data_clean(test_df.drop(['reordered', 'eval_set'], axis=1))

y_train = train_df['reordered']
y_test = test_df['reordered']

Getting dummies...
Creating weekend flag...
Encoding dept staples...
Encoding organic products...
Dropping columns...
Done.
Getting dummies...
Creating weekend flag...
Encoding dept staples...
Encoding organic products...
Dropping columns...
Done.
