In [2]:
from datetime import date
from pyspark.sql import Row


# get the files with the following sequence of unix commands:

# $ for yyyy in `seq 1910 10 2010`; do echo getting $yyyy; wget http://www.retrosheet.org/events/${yyyy}seve.zip; done
# $ for yyyy in `seq 1910 10 2010`; do mkdir ${yyyy}seve; done
# $ for yyyy in `seq 1910 10 2010`; do unzip -d ${yyyy}seve  ${yyyy}seve.zip; done





In [21]:
len("SFN200904070")

12

In [28]:

# The following are types for each record, 
# for each we explicitly cast to force the types in the Row                                                                                                                     

# sample line: id,SFN200904070         
def rowFromId(record):
    game_id = record[0]
    home_team = game_id[:3]
    yearStr = game_id[3:7]
    monthStr = game_id[7:9]
    dayStr = game_id[9:11]
    game_date = date(int(yearStr), int(monthStr), int(dayStr))
    game_date_seq = int(game_id[11:])
    return Row(
        game_id=game_id,
        home=home_team,
        game_date=game_date,
        game_date_seq=game_date_seq       
    )


# sample line: start,howar001,"Ryan Howard",0,4,3               
# sample line: sub,waldj001,"Jordan Walden",0,0,1
def rowFromStart(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2]  
    return Row(
        game_id = game_id,
        seq = seq,
        player_id = fields[1],
        # skip denormalized name in fields[2]                                                                                                                    
        home = ("1" == fields[3]),
        battingOrder = fields[4],
        position = fields[5]
    )    

# sample line: play,6,1,bondb001,02,CFX,HR/9.3-H;2-H;1-H                                                                                                         
def rowFromPlay(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2]
    countBalls = None
    countStrikes = None
    if (fields[4].isdigit()):
        countBalls = int(fields[4][0])
        countStrikes = int(fields[4][1])

    return Row(
        game_id = game_id,
        seq = seq,
        inning = int(fields[1]),
        topOfInning = ("0" == fields[2]),
        player_id = fields[3],
        countBalls = countBalls,
        countStrikes = countStrikes,
        pitch_seq = list(fields[5])
    )


# sample line: com,"$Career homer 587 to pass Frank Robinson for 4th all-time"
# or sample multi-line comment:
# 
# com,"$Hall caught in rundown while Winn advanced to 3B; both players"
# com,"ended up on 3B and Winn is tagged out; Hall thought he was the one"
# com,"who was out and stepped off the bag and is tagged out"
def rowFromCom(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2] 
    return Row(
        game_id = game_id,
        seq = seq,
        comment = fields[1],
    ) 

# sample line: data,er,fyhrm001,0
def rowFromData(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2] 
    return Row(
        game_id = game_id,
        seq = seq,
        type = fields[1],
        key = fields[2],
        value = fields[3]
    ) 

# sample line: info,site,SFO03
def rowFromInfo(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2] 
    return Row(
        game_id = game_id,
        seq = seq,
        key = fields[1],
        value = fields[2],
    ) 


# sample line: badj,bonib001,R
# sample line: padj,harrg001,L
# sample line: ladj,0,9
def rowFromAdj(record):
    game_id = record[0]
    seq = record[1]
    fields = record[2] 
    return Row(
        game_id = game_id,
        seq = seq,
        adj = fields[0],
        key = fields[1],
        value = fields[2]
    ) 

## TODO consider making this a class, a pain to load in spark tho
PARSERS_FOR_TYPES = {
    'id': rowFromId,
    'start': rowFromStart,
    'sub': rowFromStart,
    'play': rowFromPlay,
    'com': rowFromCom,
    'data': rowFromData,
    'info': rowFromInfo,
    'padj': rowFromAdj,
    'badj': rowFromAdj,
    'ladj': rowFromAdj,
}

PARSERS_FOR_TYPES = {
    'info': rowFromInfo,
}


    
def getSupportedTypes():
    return PARSERS_FOR_TYPES.keys()
    
def wrappedParser(parser, record):
    returnVal = None
    try:
        returnVal = parser(record)
    except:
        print("WARN: ", parser, record) # TODO use logging.warn
        raise 
    return returnVal
    
def rowParserForType(rowType):    
    return lambda record: wrappedParser(PARSERS_FOR_TYPES[rowType], record)


getSupportedTypes()

['info']

In [29]:
retroFilePath = "retrosheet-data/2010seve/*.EV[AN]"

retrosheet = sc.newAPIHadoopFile( 
    retroFilePath, 
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat', 
    'org.apache.hadoop.io.LongWritable', 
    'org.apache.hadoop.io.Text', 
    conf={
        'textinputformat.record.delimiter':'\nid,'
    }
)

def processRecord( (k,v) ):
    events = v.splitlines()
    if (events[0].startswith("id")):
        game_id = events[0].split(",")[1]  
    else:
        game_id = events[0]   ## TODO figure out what's going on with these lines wo id
    return ((
        game_id,
        idx,
        record.split(",")
    ) for idx, record in enumerate(events))

# first do a flatmap to get the events between "id" records and to 
# process them into triples of game_id, event sequence, and the event fields
combinedFlattenedEvents = retrosheet.flatMap(processRecord)

# next create a dataframe for each row parser that we know about

dataFrameByType = {}
for recType in getSupportedTypes():
    curRdd = combinedFlattenedEvents.filter(
        lambda rec: rec[2][0] == recType
    ).map(rowParserForType(recType))
    dataFrameByType[recType] = sqlContext.createDataFrame(curRdd)
    # print(recType,dataFrameByType[recType].count())
    dataFrameByType[recType].printSchema()
    dataFrameByType[recType].show(10)

print("done")

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 94.0 failed 1 times, most recent failure: Lost task 0.0 in stage 94.0 (TID 3225, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-28-c8dfa07fc631>", line 152, in <lambda>
  File "<ipython-input-28-c8dfa07fc631>", line 145, in wrappedParser
  File "<ipython-input-28-c8dfa07fc631>", line 100, in rowFromInfo
IndexError: list index out of range

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor92.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/pyspark/rdd.py", line 1293, in takeUpToNumLeft
    yield next(iterator)
  File "<ipython-input-28-c8dfa07fc631>", line 152, in <lambda>
  File "<ipython-input-28-c8dfa07fc631>", line 145, in wrappedParser
  File "<ipython-input-28-c8dfa07fc631>", line 100, in rowFromInfo
IndexError: list index out of range

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
