# Creating database and query sample

In [None]:
import pandas as pd
import numpy as np
import sqlite3

# Create an in-memory SQLite database and a cursor on it. The database lives
# only as long as this connection, so nothing is written to disk.
conn = sqlite3.connect(":memory:")
c = conn.cursor()

# One CREATE TABLE statement per table: person, venue, event, play.
# In play, PitchSpeed is declared real and PitchTypeCode text because the rows
# inserted below carry fractional speeds (e.g. 89.29) and string codes
# (e.g. "FF"). Declared as integer, SQLite's type affinity would store a
# speed such as 64.0 as the integer 64.
sql_create_tables = [
    """
    CREATE TABLE IF NOT EXISTS person (
    PersonId integer PRIMARY KEY,
    FullName text,
    BatSide text,
    PitchHand text
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS venue (
    VenueId integer PRIMARY KEY,
    Name text
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS event (
    EventId integer PRIMARY KEY,
    VenueId integer
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS play (
    WorkoutPitchId integer PRIMARY KEY,
    EventId integer,
    IsPitch integer,
    PersonId integer,
    BatterId integer,
    PitchSpeed real,
    PitchTypeCode text
    );
    """
]
for create_statement in sql_create_tables:
    c.execute(create_statement)

# Parameterized INSERT statements, in the same table order as above:
# index 0 = person, 1 = venue, 2 = event, 3 = play.
sql_populate_tables = [
    """
    INSERT INTO person (PersonId,FullName,BatSide,PitchHand) VALUES (?,?,?,?);
    """,
    """
    INSERT INTO venue (VenueId, Name) VALUES (?, ?);
    """,
    """
    INSERT INTO event (EventId, VenueId) VALUES (?, ?)
    ;
    """,
    """
    INSERT INTO play (WorkoutPitchId, EventId, IsPitch, PersonId,
    BatterId, PitchSpeed, PitchTypeCode) VALUES (?,?,?,?,?,?,?)
    ;
    """]

# Rows: [PersonId, FullName, BatSide, PitchHand]
persons = [
    [115614, "Gary H", "S", "R"],
    [641302, "Steve O", "L", "L"],
    [440981, "Scott L", "R", "R"],
    [443207, "Brian O", "R", "R"],
    [641548, "Leroy W", "R", "R"]
]

# Rows: [VenueId, Name]
venues = [
    [23, "Milwaukee County Stadium"],
    [2724, "Louisville Slugger Field"],
    [2761, "Wake Forest Baseball Park"],
    [2798, "Lake Olmstead Stadium"],
    [2835, "Valley Strong Ballpark"]
]

# Rows: [EventId, VenueId]
# Only VenueIds 23 and 2835 have a matching row in venues; events pointing at
# the other ids are dropped by any inner join against venue.
events = [
    [656271, 2739],
    [668273, 3054],
    [675413, 23],
    [676569, 401],
    [631787, 2835]
]

# Rows: [WorkoutPitchId, EventId, IsPitch, PersonId, BatterId, PitchSpeed,
#        PitchTypeCode]
plays = [
    [23, 675413, 1, 641302, 607609, 89.29, "FF"],
    [97, 675413, 1, 641302, 607609, 64.0, "SL"],
    [134, 675413, 1, 641302, 607609, 66.0, "SL"],
    [171, 631787, 1, 641548, 607609, 79.86, "FF"],
    [208, 631787, 1, 641548, 607609, 52.0, "CU"],
    [209, 631787, 1, 641548, 607609, 55.0, "CU"],
]

# Load each table with a single executemany call: the statement is bound once
# per row from the matching list.
c.executemany(sql_populate_tables[0], persons)
c.executemany(sql_populate_tables[1], venues)
c.executemany(sql_populate_tables[2], events)
c.executemany(sql_populate_tables[3], plays)

# End the transaction opened by the INSERTs so the rows are committed rather
# than left pending on the connection.
conn.commit()

In [None]:
# Query the database for the average pitch speed per person and pitch type.
# play is joined to person (for the name) and, through event, to venue.
# The WHERE clause keeps only plays at venues whose name contains "Stadium",
# and the result has one row per (FullName, PitchTypeCode) pair.
query = """
    SELECT
            person.FullName, play.PitchTypeCode, AVG(play.PitchSpeed)
          FROM play
              INNER JOIN person ON play.PersonId = person.PersonId
              INNER JOIN event ON play.EventId = event.EventId
              INNER JOIN venue ON event.VenueId = venue.VenueId
          WHERE
              venue.Name LIKE '%Stadium%'
          GROUP BY
              person.FullName, play.PitchTypeCode
    ;    
"""
# Run the query on the open connection; the result comes back as a DataFrame.
pd.read_sql(query, con= conn)

## Generating fake data to demonstrate calculating player speed

In [None]:
# Hand-made tracking rows used to exercise the speed calculation.
# Each row is [playid, playerid, X_ft, Y_ft, time_sec].
data = [
    ['a', 0, 75, 57, 2],
    ['a', 0, 76, 57, 3],
    ['a', 0, 77, 57, 4],
    ['a', 0, 78, 57, 5],
    ['b', 1, 72.4, 57.39, 0.043],
    ['b', 1, 75.6, 58.12, 0.09],
    ['b', 1, 78.75, 59.1, 0.134],
]

# Alternative sample with finer-grained values for play 'a'
# (kept for reference, not executed):
# data = [
#     ['a', 0, 75.9444, 57.4922, 0.032],
#     ['a', 0, 75.96, 57.512, 0.04],
#     ['a', 0, 75.975, 57.521, 0.052],
#     ['a', 0, 75.99, 57.43, 0.066],
#     ['b', 1, 72.4, 57.39, 0.043],
#     ['b', 1, 75.6, 58.12, 0.09],
#     ['b', 1, 78.75, 59.1, 0.134]
# ]

# Wrap the rows in a DataFrame with named columns.
df = pd.DataFrame(
    data=data,
    columns=['playid', 'playerid', 'X_ft', 'Y_ft', 'time_sec'],
)


In [None]:
import math
import pandas as pd

# Reading a csv file - name is unknown
#df = pd.read_csv('filename.csv')   # file name unknown


# Path-length helper: adds up the straight-line hops between consecutive points.
def distance(coordinates):
    """
    Return the total length of the path through ``coordinates``.

    ``coordinates`` is a sequence of points; the first two items of each
    point are read as x and y. Each point is joined to the next one by a
    straight segment and the segment lengths are added together.

    With fewer than two points there is no segment to measure, so 0 is
    returned.
    """
    # Guard clause: nothing to measure without at least one pair of points.
    if len(coordinates) < 2:
        return 0

    path_length = 0
    # Pair every point with its successor and accumulate the segment lengths.
    for start, end in zip(coordinates, coordinates[1:]):
        path_length += math.dist((start[0], start[1]), (end[0], end[1]))

    return path_length



# Collects one [playerid, playid, distance, avg speed] record per player/play
# pair; converted into the final DataFrame at the bottom.
stats = []

# Every distinct player in the data.
players = df['playerid'].unique()

# Loop through the players to run the calculations.
for player in players:
    # All rows belonging to this player. This frame must stay intact for the
    # whole inner loop, so the per-play subset below gets its own name:
    # overwriting the player-level frame with the first play's rows would
    # leave later plays selecting from an already-filtered frame, yielding an
    # empty selection and an IndexError on the time lookup.
    player_rows = df[df['playerid'] == player]

    # Each distinct play this player appears in.
    for play in player_rows['playid'].unique():
        # Rows for this single player on this single play.
        play_rows = player_rows[player_rows['playid'] == play]

        # (x, y) positions in row order. Columns are addressed by position:
        # 2 and 3 are X_ft and Y_ft in the frame built from the sample data.
        path = list(zip(play_rows.iloc[:, 2], play_rows.iloc[:, 3]))

        # Total distance traveled, using the custom distance function.
        dist = distance(path)

        # Last recorded time for the player on this play (column 4, time_sec
        # in the sample frame), used as the play's total duration.
        # NOTE(review): assumes rows are already in time order -- confirm.
        play_time = play_rows.iloc[-1, 4]

        # Average speed for the play from distance and total time.
        avg_speed = dist / play_time

        stats.append([player, play, dist, avg_speed])


# One row per player and play, showing the total distance and the average
# speed for that player on that play.
stats_df = pd.DataFrame(stats, columns=['playerid', 'playid', 'distance_ft', 'avg_ft_per_sec'])

stats_df