In [1]:
# import modules
import pandas as pd
import numpy as np
import sklearn.linear_model as LinearRegression

In [37]:
def raw_event_to_proc_binary(event, aux_incl=False, n_sensors=5160):
    """
    Encode one event as a binary "which sensors were hit" vector.

    Parameters
    ----------
    event : pandas.DataFrame or pandas.Series
        Pulses of a single event; must expose ``sensor_id`` and
        ``auxiliary``. A Series is accepted because selecting an event
        that has exactly one pulse with ``batch.loc[event_id]`` yields a
        Series rather than a one-row DataFrame.
    aux_incl : bool, default False
        If False, pulses whose ``auxiliary`` flag is truthy are ignored.
    n_sensors : int, default 5160
        Length of the returned vector (number of sensor slots).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_sensors,)`` whose i-th entry is 1 if
        sensor_id ``i`` appears among the retained pulses and 0 otherwise.

    Raises
    ------
    IndexError
        If a retained sensor_id is >= ``n_sensors``.
    """
    # Normalise a single-pulse event (Series) to a one-row DataFrame so
    # the column access below works uniformly.
    if isinstance(event, pd.Series):
        event = event.to_frame().T

    sensor_ids = event["sensor_id"].to_numpy()
    if not aux_incl:
        # Plain numpy mask: avoids pandas index alignment, which matters
        # because all rows of one event share the same index label.
        keep = ~event["auxiliary"].to_numpy(dtype=bool)
        sensor_ids = sensor_ids[keep]

    # array to be returned
    proc = np.zeros((n_sensors,))

    # Mark every pinged sensor in one shot; duplicate ids are harmless.
    # The cast is needed because a row taken from a mixed-dtype frame
    # carries object dtype, which numpy refuses as an index.
    proc[sensor_ids.astype(np.intp)] = 1

    return proc

In [67]:
def raw_batch_to_proc_binary(batch, aux_incl=False, n_sensors=5160):
    """
    Encode every event of a (sub)batch as a binary sensor-hit row.

    The encoding is the one described by raw_event_to_proc_binary, but
    it is computed for the whole batch at once instead of event by
    event. That avoids filling a wide DataFrame one row at a time and
    also copes with events that consist of a single pulse.

    Parameters
    ----------
    batch : pandas.DataFrame
        Pulses indexed by event id; must expose ``sensor_id`` and
        ``auxiliary``.
    aux_incl : bool, default False
        If False, pulses whose ``auxiliary`` flag is truthy are ignored.
        Events left with no pulses still get an all-zero row.
    n_sensors : int, default 5160
        Number of columns (sensor slots) in the result.

    Returns
    -------
    pandas.DataFrame
        One row per unique event id (sorted), columns ``0..n_sensors-1``,
        entry 1 if that sensor appears in the event and 0 otherwise.

    Raises
    ------
    IndexError
        If a retained sensor_id is >= ``n_sensors``.
    """
    # Sorted unique event ids plus, for every pulse, the row it belongs to.
    event_ids, row_idx = np.unique(batch.index.to_numpy(), return_inverse=True)
    sensor_ids = batch["sensor_id"].to_numpy().astype(np.intp)

    if not aux_incl:
        keep = ~batch["auxiliary"].to_numpy(dtype=bool)
        row_idx = row_idx[keep]
        sensor_ids = sensor_ids[keep]

    # Paired fancy indexing sets (event row, sensor column) for each pulse.
    hits = np.zeros((len(event_ids), n_sensors), dtype=np.int64)
    hits[row_idx, sensor_ids] = 1

    return pd.DataFrame(hits, index=event_ids, columns=np.arange(n_sensors))

In [None]:
### MODEL TRAINING AND TESTING ### 

# Now we load our data and create various linear models 

In [5]:
# Read the sensor geometry CSV and the batch_10 training parquet from the parent directory.
sensor_geom = pd.read_csv('../sensor_geometry.csv')
batch10 = pd.read_parquet('../batches_train/batch_10.parquet') # NOTE(review): path already points at batch_10

In [38]:
# Sorted array of the distinct index values of batch10 (presumably one per event -- confirm index is event_id)
event_ids = np.unique(batch10.index)

In [39]:
# Show the unique ids as the cell output
event_ids

array([29296372, 29296374, 29296414, ..., 32567659, 32567680, 32567683],
      dtype=int64)

In [6]:
# Train/test split of batch10 using k-fold cross-validation.
# This cell follows the Erdos lecture notes on k-fold cross-validation, with k = 5.
# The random seed for all splits is random_seed = 134.