In [1]:
import os
# Switch to the project working directory before anything else so the relative
# 'data/...' and 'img/...' paths used later in this notebook resolve.
# NOTE(review): hard-coded absolute path; presumably this is also what makes
# `import megatron` resolvable from a local checkout — confirm, and consider
# making the location configurable.
os.chdir('/home/megatron/work')

import megatron
import pandas as pd
import numpy as np
import sqlite3

In [2]:
# Pandas variant: load the Batting CSV into a DataFrame and use that frame
# both as the data source and as the template for the pipeline's input nodes.
generator = False
lahman_file = 'data/lahman_csv/core/Batting.csv'
# Identifier columns that are kept out of the feature set.
exclude = ['playerID','yearID','stint','teamID','lgID']
lahman = pd.read_csv(lahman_file)

if not generator:
    # Whole-frame dataset wrapper.
    lahman_data = megatron.io.dataset.PandasData(lahman, exclude_cols=exclude)
else:
    # Generator wrapper; 1000 is presumably the batch size — TODO confirm.
    lahman_generator = megatron.io.generator.PandasGenerator(
        lahman, 1000, exclude_cols=exclude)

inputs = megatron.nodes.from_dataframe(lahman, exclude_cols=exclude)

In [3]:
# CSV variant: megatron reads the Batting file directly from disk, so no
# DataFrame is materialised in this cell.
generator = True
lahman_file = 'data/lahman_csv/core/Batting.csv'
# Identifier columns that are kept out of the feature set.
exclude = ['playerID','yearID','stint','teamID','lgID']

if not generator:
    # Whole-file dataset wrapper.
    lahman_data = megatron.io.dataset.CSVData(lahman_file, exclude_cols=exclude)
else:
    # Generator wrapper; 1000 is presumably the batch size — TODO confirm.
    lahman_generator = megatron.io.generator.CSVGenerator(
        lahman_file, 1000, exclude_cols=exclude)

inputs = megatron.nodes.from_csv(lahman_file, exclude_cols=exclude, eager=True)

In [None]:
# example using sql database

# Build (or rebuild) the SQLite database 'lahman' from the Batting CSV.
# A dedicated connection is used for loading and is always closed afterwards,
# so the handle is not leaked when `conn` is opened for the pipeline below.
lahman_df = pd.read_csv('data/lahman_csv/core/Batting.csv')
setup_conn = sqlite3.connect('lahman')
try:
    # Drop any table left by a previous run so to_sql does not fail on an
    # already-existing 'batting' table.
    setup_conn.execute('DROP TABLE IF EXISTS batting')
    lahman_df.to_sql('batting', setup_conn, index=False)
    # Commit explicitly so the rows are persisted before the handle is closed.
    setup_conn.commit()
finally:
    setup_conn.close()

generator = True
# Fresh connection that megatron reads from.
conn = sqlite3.connect('lahman')
# NOTE(review): SELECT * also returns the identifier columns (playerID, yearID,
# stint, teamID, lgID) that the pandas and CSV examples in this notebook drop
# via exclude_cols — confirm that is intended for the SQL path.
query = 'SELECT * FROM batting'

if generator:
    # 1000 is presumably the batch size — TODO confirm.
    lahman_generator = megatron.io.generator.SQLGenerator(conn, query, 1000)
else:
    lahman_data = megatron.io.dataset.SQLData(conn, query)

inputs = megatron.nodes.from_sql(conn, query, eager=True)

In [5]:
# Replace missing values (NaN) with 0, then cast every input to integer.
fillna = megatron.layers.Impute({np.nan: 0})
inputs = fillna(inputs)
# Use the builtin `int`: `np.int` was only an alias for it, deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
inputs = megatron.layers.Cast(int)(inputs)

# helpers
def single_fn(h, d, t, hr):
    """Return singles: hits minus doubles, triples and home runs.

    Args:
        h: total hits.
        d: doubles.
        t: triples.
        hr: home runs.

    Returns:
        ``h - d - t - hr``, evaluated left to right.
    """
    remaining = h - d
    remaining = remaining - t
    return remaining - hr
# Singles are derived from total hits via single_fn.
singles_layer = megatron.layers.Lambda(single_fn)
singles = singles_layer(inputs[['H','2B','3B','HR']], 'Singles')

# Stack the hit types in (1B, 2B, 3B, HR) order so they line up with the
# weight vector [1, 2, 3, 4] used for total bases below.
concat_layer = megatron.layers.Concatenate()
hit_types = concat_layer([singles]+inputs[['2B','3B','HR']], 'hit_types')

base_weights = np.array([1,2,3,4])
tb_layer = megatron.layers.Dot(W=base_weights)
TB = tb_layer(hit_types, 'TB')

# basics
# Plate appearances: the Add layer applied to AB, BB, HBP, SH and SF.
pa_layer = megatron.layers.Add()
PA = pa_layer(inputs[['AB', 'BB', 'HBP', 'SH', 'SF']], 'PA')

# Walk rate and strikeout rate, each taken per plate appearance.
bb_rate_layer = megatron.layers.Divide()
BBp = bb_rate_layer([inputs['BB'], PA], 'BBpct')
k_rate_layer = megatron.layers.Divide()
Kp = k_rate_layer([inputs['SO'], PA], 'Kpct')
def obp(h, bb, hbp, ab, sf):
    """On-base percentage: (H + BB + HBP) / (AB + BB + HBP + SF).

    The division is delegated to ``megatron.helpers.safe_divide``
    (presumably to guard against zero denominators — confirm in megatron).
    """
    times_on_base = h + bb + hbp
    opportunities = ab + bb + hbp + sf
    return megatron.helpers.safe_divide(times_on_base, opportunities)
# On-base percentage from the obp helper.
obp_layer = megatron.layers.Lambda(obp)
OBP = obp_layer(inputs[['H','BB','HBP','AB','SF']], 'OBP')

# Slugging is total bases per at-bat; batting average is hits per at-bat.
slg_layer = megatron.layers.Divide()
SLG = slg_layer([TB, inputs['AB']], 'SLG')
avg_layer = megatron.layers.Divide()
AVG = avg_layer(inputs[['H', 'AB']], 'AVG')

# Isolated power is slugging minus batting average.
iso_layer = megatron.layers.Subtract()
ISO = iso_layer([SLG, AVG], 'ISO')
def babip(h, hr, ab, k, sf):
    """Batting average on balls in play: (H - HR) / (AB - K - HR + SF).

    The division is delegated to ``megatron.helpers.safe_divide``
    (presumably to guard against zero denominators — confirm in megatron).
    """
    hits_in_play = h - hr
    balls_in_play = ab - k - hr + sf
    return megatron.helpers.safe_divide(hits_in_play, balls_in_play)
babip_layer = megatron.layers.Lambda(babip)
BABIP = babip_layer(inputs[['H','HR','AB','SO','SF']], 'BABIP')

# Output selection. The pipeline currently passes the imputed and cast inputs
# straight through; the list of derived statistics is left disabled.
# NOTE(review): the statistics built in this cell are not pipeline outputs as
# written — confirm that this is intentional.
#outputs = [PA, BBp, Kp, OBP, SLG, AVG, ISO, BABIP]
outputs = inputs

# Optional post-processing, left disabled:
#outputs = megatron.nodes.FeatureSet(outputs)
#outputs = megatron.layers.Lambda(np.round, decimals=2)(outputs)

P = megatron.Pipeline(
    inputs,
    outputs,
)

In [6]:
# Fit the pipeline and transform the data using whichever source the most
# recently run example cell set up (selected by the `generator` flag).
if not generator:
    P.fit(lahman_data)
    out = P.transform(lahman_data)
else:
    P.fit_generator(lahman_generator)
    # Only the generator path passes out_type; the non-generator path relies
    # on transform's default.
    out = P.transform_generator(lahman_generator, out_type='array')

In [7]:
# Read back from the pipeline's storage the entry whose lookup value is '0'.
# NOTE(review): presumably '0' is the row index/key of the first observation —
# confirm against megatron's storage API.
stored = P.storage.read(lookup_vals='0')

In [4]:
# Delete the table behind the pipeline's storage.
# NOTE(review): presumably done so a re-run starts from an empty store —
# confirm. `stored` was read before this call and is still used below.
P.storage.delete_table()

In [9]:
# Shape of the single lookup result read from storage (rows, columns).
stored.shape

(1, 17)

In [None]:
# Save an image of pipeline P to img/sabermetrics.png, then show it in the
# notebook (per the function names; rendering details live in megatron.visuals).
megatron.visuals.pipeline_imsave(P, 'img/sabermetrics.png')
megatron.visuals.pipeline_imshow(P)