In [27]:
# File System Manipulation Imports
import os
import sys
import fileinput
import re
import time

# Data Analysis Imports
import pandas as pd
import numpy as np

# Suppress Warnings
import warnings
warnings.filterwarnings("ignore")

In [28]:
def cleanEventCodesData(filename):
    input_file= open(filename, 'r')
    output_file_name = filename.split("/", 1)[-1]
    output_file = open(output_file_name, 'w')
    
    while True:
        line = input_file.readline()
        if not line: break
        line = re.sub('"', ',', line).rstrip() # Replace quotation marks with comma
        line = ",".join(line.split("\t", 1)) # Replace first instance of tab with comma
        line = "".join(line.split()) # Remove all whitespace

        # Replace multiple comma instances with single comma
        pattern = re.compile(r'(,){2,}')
        line = re.sub(pattern, ',', line)
        line = re.sub(',', ' ', line).strip()
        output_file.write(line + '\n')

event_codes_file = 'Original-Renamed/event-codes.txt'
lineup_file = 'Original-Renamed/lineup-sample.txt'
play_file = 'Original-Renamed/play-by-play-sample.txt'

cleanEventCodesData(event_codes_file)
event_codes_DF = pd.read_csv('event-codes.txt', sep=' ', engine='python')
event_codes_DF = event_codes_DF.sort_values(['Event_Msg_Type'])
event_codes_DF = event_codes_DF.set_index(['Event_Msg_Type', 'Action_Type'])
event_codes_DF.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Event_Msg_Type_Description,Action_Type_Description
Event_Msg_Type,Action_Type,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,MadeShot,NoShot
1,91,MadeShot,PutbackReverseDunkShot
1,92,MadeShot,PutbackSlamDunkShot
1,93,MadeShot,DrivingBankHookShot
1,94,MadeShot,JumpBankHookShot


In [42]:
# Free Throw Actions
sample = event_codes_DF.xs(0).sort_index()
sample

KeyError: 0

In [30]:
lineup_DF = pd.read_csv(lineup_file, sep=r'\s*', engine='python')
lineup_DF_rows = list(range(lineup_DF.shape[0]))
lineup_DF.reindex(lineup_DF_rows)
lineup_DF = lineup_DF.set_index(['Game_id', 'Period'])

print(lineup_DF.head().to_string())

# Check to see how many players registered per period for each game.
# There is consistently 10 players per period/quarter. This indicates the players starting on the court that quarter.
num_players = set()
for game_ids in lineup_DF.index.get_level_values('Game_id').unique():
    for period in range(4):
        num_players.add(lineup_DF.loc[game_ids, period+1].shape[0])
print(num_players)

                                                                Person_id                           Team_id status
Game_id                          Period                                                                           
021fd159b55773fba8157e2090fe0fe2 1       881f83d2dee3f18c7d1751659406144e  012059d397c0b7e5a30a5bb89c0b075e      A
                                 1       27ea17a8685c4919f157e83fe9cb2d9e  cff694c8186a4bd377de400e4f60fe47      A
                                 1       57bbd7e30bc694aeee9ee40c583e6811  cff694c8186a4bd377de400e4f60fe47      A
                                 1       cec898a1d355dbfbad8c760615fde1af  012059d397c0b7e5a30a5bb89c0b075e      A
                                 1       33963fe856a1523ff46438ba07d1d99f  cff694c8186a4bd377de400e4f60fe47      A
{10}


In [31]:
play_by_play_DF = pd.read_csv(play_file, sep=r'\s*', engine='python')
play_by_play_DF = play_by_play_DF.set_index(['Game_id', 'Period'])
play_by_play_DF.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Event_Num,Event_Msg_Type,WC_Time,PC_Time,Action_Type,Option1,Option2,Option3,Team_id,Person1,Person2,Team_id_type
Game_id,Period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
021fd159b55773fba8157e2090fe0fe2,1,0,12,546427,7200,0,0,0,0,1473d70e5646a26de3c52aa1abd85b1f,6bcf6c1f8c373d25fca1579bc4464a91,6bcf6c1f8c373d25fca1579bc4464a91,0
021fd159b55773fba8157e2090fe0fe2,1,1,10,546495,7200,0,0,0,0,012059d397c0b7e5a30a5bb89c0b075e,89706b99ddd00dc05d37ef5cafc04276,307beab25b1021a548b4a47550bc4b25,2
021fd159b55773fba8157e2090fe0fe2,1,2,2,546665,7050,1,3,0,0,012059d397c0b7e5a30a5bb89c0b075e,cec898a1d355dbfbad8c760615fde1af,6bcf6c1f8c373d25fca1579bc4464a91,2
021fd159b55773fba8157e2090fe0fe2,1,3,4,546714,6960,0,0,0,0,012059d397c0b7e5a30a5bb89c0b075e,307beab25b1021a548b4a47550bc4b25,6bcf6c1f8c373d25fca1579bc4464a91,2
021fd159b55773fba8157e2090fe0fe2,1,6,6,546886,6920,4,0,0,0,cff694c8186a4bd377de400e4f60fe47,c00264c3114d23bac482e9de50fb7d28,89706b99ddd00dc05d37ef5cafc04276,3


In [32]:
# Helper Methods

# Retrieves player given ID and dataframe
def getPlayer(player_id, plus_minus_DF):
    return plus_minus_DF.loc[plus_minus_DF['Person_id'] == player_id]

# Creates and adds new row containing player data to provided dataframe
def createPlayer(game_id, player_id, team_id, plus_minus_DF):
    player_row = pd.DataFrame({"Game_id": game_id, "Person_id": player_id, "Plus/Minus": 0, "Team_id": team_id, "Status": 'A', "Points": 0, "Rebounds": 0, "Assists": 0, "Turnovers": 0, "FGs Made": 0, "FGs Total": 0, "Fouls": 0, "FTs Made": 0, "FTs Total": 0}, index=[0])
    plus_minus_DF = plus_minus_DF.append(player_row)
    return plus_minus_DF

def printPlusMinus(plus_minus_DF):
    plus_minus_DF.drop('Team_id', axis=1, inplace=True)
    plus_minus_DF.drop('Game_id', axis=1, inplace=True)
    plus_minus_DF = plus_minus_DF.set_index(['Person_id'])
    print(plus_minus_DF.to_string())

def printResultDFStats(resultDF):
    stats = ["Plus/Minus", "Points", "Rebounds", "Assists", "Fouls", "Turnovers", "FTs Made", "FTs Total"]
    for stat in stats:
        print("Max ", stat, ": ", max(resultDF[stat]), "| Min ", stat, ": ", min(resultDF[stat]))

def checkSubFTEdgeCase(player_id, plus_minus_DF, row_number, miniFrame):
    print("To do")

In [53]:
# Evalutes individual game event and applies changes to +/- Dataframe
# @param game_play: Panda Series object containing the data concerning an individual play
# @param plus_minus_DF: +/- DataFrame to track player activity and data
def evaluateEvent(game_play, row_number, game_id, plus_minus_DF, miniFrame):
    event_type = game_play['Event_Msg_Type']
    action_type = game_play['Action_Type']
    player1_id = game_play['Person1']
    player2_id = game_play['Person2']
    team_id = game_play['Team_id']

    # Unrecognized event type
    if (event_type < 1 or event_type > 13):
        return plus_minus_DF
    event_code = event_codes_DF.loc[event_type, action_type]
    
    wantLog = False
    if (event_type == 1):
        if wantLog: print("Made Shot")
            
        # Iterate through all players registered in game
        for index, player in plus_minus_DF.iterrows():
            # If Team_id same as scorer and is active, +2, else if just active, -2
            if (player['Team_id'] == team_id) and (player['Status'] == 'A'):
                player['Plus/Minus'] += 2
            elif (player["Status"] == 'A'):
                player['Plus/Minus'] -= 2
            
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'Points'] += 2
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'FGs Made'] += 1
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'FGs Total'] += 1
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player2_id, 'Assists'] += 1
        
    elif (event_type == 2):
        if wantLog: print("Missed Shot")
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'FGs Total'] += 1
    elif (event_type == 3):
        if wantLog: print("Free Throw")
        
        for index, player in plus_minus_DF.iterrows():
            # If Team_id same as scorer and is active, +2, else if just active, -2
            if (player['Team_id'] == team_id) and (player['Status'] == 'A'):
                player['Plus/Minus'] += 1
            elif (player["Status"] == 'A'):
                player['Plus/Minus'] -= 1
        
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'Points'] += 1
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'FTs Made'] += 1
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'FTs Total'] += 1
            
    elif (event_type == 4):
        if wantLog: print("Rebound")
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'Rebounds'] += 1
    elif (event_type == 5):
        if wantLog: print("Turnover")
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'Turnovers'] += 1
    elif (event_type == 6):
        if wantLog: print("Foul")
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, 'Fouls'] += 1
    elif (event_type == 7):
        if wantLog: print("Violation")
    elif (event_type == 8):
        if wantLog: print("Substitution")
            
        # Set player 1 status to inactive
        plus_minus_DF.loc[plus_minus_DF['Person_id'] == player1_id, "Status"] = 'I'
        
        # If player 2 is registered, change status. Otherwise, create new row for that player
        if (player2_id in list(plus_minus_DF["Person_id"].values)):
            # Registered Player
            plus_minus_DF.loc[plus_minus_DF['Person_id'] == player2_id, "Status"] = 'A'
        else:
            # New Player
            plus_minus_DF = createPlayer(game_id, player2_id, team_id, plus_minus_DF)
            
    elif (event_type == 9):
        if wantLog: print("Timeout")
    elif (event_type == 10):
        if wantLog: print("Jump Ball")
    elif (event_type == 11):
        if wantLog: print("Ejection")
    elif (event_type == 12):
        if wantLog: print("Start Period")
    elif (event_type == 13):
        if wantLog: print("End Period")
    else:
        if wantLog: print("Event number not recognized: ", event_type)
    
    return plus_minus_DF

In [55]:
resultDF = pd.DataFrame(columns=["Game_id", "Person_id", "Plus/Minus", "Team_id", "Status", "Points", "Rebounds", "Assists", "Turnovers", "FGs Made", "FGs Total", "Fouls", "FTs Made", "FTs Total"])

test_count = 0
start = time.time()

for game_id in play_by_play_DF.index.get_level_values('Game_id').unique():
    # Track Player +/- DF throughout this specific game.
    plus_minus_DF = pd.DataFrame(columns=["Game_id", "Person_id", "Plus/Minus", "Team_id", "Status", "Points", "Rebounds", "Assists", "Turnovers", "FGs Made", "FGs Total", "Fouls", "FTs Made", "FTs Total"])
    
    for period in range(4):
        # Grab period/quarter-specific game log & organize data by time
        miniFrame = play_by_play_DF.loc[game_id, period+1]
        miniFrame = miniFrame.sort_values(['PC_Time'], ascending=False)
        row_number = 0
        
        # Populate Player +/- DF with starting line up if beginning of first period
        if (period == 0):
            starting_lineup = lineup_DF.loc[game_id, period+1]
            for index, player in starting_lineup.iterrows():
                plus_minus_DF = createPlayer(game_id, player['Person_id'], player['Team_id'], plus_minus_DF)
        
        for index, row in miniFrame.iterrows():
            plus_minus_DF = evaluateEvent(row, row_number, game_id, plus_minus_DF, miniFrame)
    
    # End of Game Box Score
    resultDF = resultDF.append(plus_minus_DF)
    
    test_count = test_count + 1
    
end = time.time()
print("Duration: ", (end - start), " seconds.")

printResultDFStats(resultDF)
print(resultDF.shape)
resultDF.to_csv("JohnYang_Q1_BBALL")
resultDF.head()

Game  0 :  021fd159b55773fba8157e2090fe0fe2
Game  1 :  03a31e84b194d6c8a2eab5d70ba67acf
Game  2 :  06bb1d31c63891e2580ff12e4e6505b4
Game  3 :  07e76f7482773e81e2351d1692e9e5bb
Game  4 :  0868dee930f69a54541d4ae88b841a37
Game  5 :  09d46e3d7a8253b7209100650b5afaeb
Game  6 :  13ced855d491384876c6ab807bd1d3db
Game  7 :  15d76177caa6022156e83774c2e054d3
Game  8 :  1eab6189ad9ab246c197575a8c4eebe5
Game  9 :  1f9e3cb05c031986cf8bc7c0a84cc517
Game  10 :  2bf4ac0ed9ac1aee8767134d62b34dfe
Game  11 :  2faabfa663f4dfb9ed83a1482088e092
Game  12 :  3152e9c330ce200cc189ae64ebdf41fc
Game  13 :  33f631fec90cc1f08bb16cff5ed52f9b
Game  14 :  345f285f7c654bf7a03e940a9bc923c3
Game  15 :  3b753670a0d0df2c35c7ce2e0bc94e6c
Game  16 :  56ccfaf0adead6f4c7236a01ca0cfbdc
Game  17 :  598a55c8bf052d039d0fb4ff1a62f98a
Game  18 :  7fad2269ee0d11ae5069ff23ecb25913
Game  19 :  87d1574f478f37d13789284b96b4f6cb
Game  20 :  88012a99d7fd1c169e2360aa5cdf8bfa
Game  21 :  896bcac9b0f35b250c9fb7a6325d8599
Game  22 :  8fd29433