In [1]:
import pyspark # 3.2.0
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc, col, \
session_window, when
from pyspark.sql.types import StringType, TimestampType, \
StructType, StructField

import psutil # 5.6.3
import platform
import os
import multiprocessing

from time import asctime

import statsapi 

In [2]:
MAX_MEMORY = '15G'
sesh_duration = "20 minutes"

print(f"approximate start time : {asctime()}")

approximate start time : Thu May 12 22:03:35 2022


In [3]:
conf = pyspark.SparkConf().setMaster("local[*]") \
        .set('spark.executor.heartbeatInterval', 10000) \
        .set('spark.network.timeout', 10000) \
        .set("spark.core.connection.ack.wait.timeout", "3600") \
        .set("spark.executor.memory", MAX_MEMORY) \
        .set("spark.driver.memory", MAX_MEMORY)

In [4]:
%%time
sc = SparkContext(conf=conf)
sc.setLogLevel('WARN')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/12 22:03:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


CPU times: user 28.6 ms, sys: 23.3 ms, total: 52 ms
Wall time: 4.83 s


In [5]:
print("="*40, "System Information", "="*40)
uname = platform.uname()
print(f"System: {uname.system}")
print(f"Release: {uname.release}")
print(f"Version: {uname.version}")
print(f"Machine: {uname.machine}")
print(f"Processor: {uname.processor}")
print(f"Platform: {platform.platform()}")
print(f"RAM: {str(round(psutil.virtual_memory().total / (1024.0 ** 3)))} GB")
print(f"Cores: {multiprocessing.cpu_count()}")

print("="*40, "Java, Python, Spark Versions", "="*30)

# java version
!java -version

# python version
print("-"*5)
!python --version

# spark version
print("-"*5)
print("spark version {}".format(sc.version))

System: Darwin
Release: 21.4.0
Version: Darwin Kernel Version 21.4.0: Fri Mar 18 00:45:05 PDT 2022; root:xnu-8020.101.4~15/RELEASE_X86_64
Machine: x86_64
Processor: i386
Platform: Darwin-21.4.0-x86_64-i386-64bit
RAM: 16 GB
Cores: 4
java version "1.8.0_311"
Java(TM) SE Runtime Environment (build 1.8.0_311-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)
-----
Python 3.7.4
-----
spark version 3.2.0


In [6]:
%%time
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("baseball_is_not_boring") \
        .config(conf=conf) \
        .getOrCreate()
    return spark

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs


In [7]:
%%time
schemer = StructType([StructField('name', StringType(), True),
                     StructField('name', TimestampType(), True)
                     ])

CPU times: user 69 µs, sys: 58 µs, total: 127 µs
Wall time: 136 µs


In [8]:
len(statsapi.schedule())

9

In [9]:
statsapi.schedule()[0]

{'game_id': 662548,
 'game_datetime': '2022-05-12T17:05:00Z',
 'game_date': '2022-05-12',
 'game_type': 'R',
 'status': 'Final',
 'away_name': 'New York Mets',
 'home_name': 'Washington Nationals',
 'away_id': 121,
 'home_id': 120,
 'doubleheader': 'N',
 'game_num': 1,
 'home_probable_pitcher': 'Joan Adon',
 'away_probable_pitcher': 'Taijuan Walker',
 'home_pitcher_note': '',
 'away_pitcher_note': '',
 'away_score': 4,
 'home_score': 1,
 'current_inning': 9,
 'inning_state': 'Bottom',
 'venue_id': 3309,
 'venue_name': 'Nationals Park',
 'winning_team': 'New York Mets',
 'losing_team': 'Washington Nationals',
 'winning_pitcher': 'Taijuan Walker',
 'losing_pitcher': 'Joan Adon',
 'save_pitcher': None,
 'summary': '2022-05-12 - New York Mets (4) @ Washington Nationals (1) (Final)'}

In [10]:
statsapi.schedule()[-1]

{'game_id': 662706,
 'game_datetime': '2022-05-13T02:10:00Z',
 'game_date': '2022-05-12',
 'game_type': 'R',
 'status': 'In Progress',
 'away_name': 'Philadelphia Phillies',
 'home_name': 'Los Angeles Dodgers',
 'away_id': 143,
 'home_id': 119,
 'doubleheader': 'N',
 'game_num': 1,
 'home_probable_pitcher': 'Tyler Anderson',
 'away_probable_pitcher': 'Zack Wheeler',
 'home_pitcher_note': '',
 'away_pitcher_note': '',
 'away_score': 4,
 'home_score': 1,
 'current_inning': 4,
 'inning_state': 'Top',
 'venue_id': 22,
 'venue_name': 'Dodger Stadium',
 'summary': '2022-05-12 - Philadelphia Phillies (4) @ Los Angeles Dodgers (1) (Top of the 4th)'}

In [11]:
player = 'Anthony Gose'

In [12]:
data = statsapi.lookup_player(player)

In [13]:
data[0]

{'id': 543238,
 'fullName': 'Anthony Gose',
 'firstName': 'Anthony',
 'lastName': 'Gose',
 'primaryNumber': '26',
 'currentTeam': {'id': 114},
 'primaryPosition': {'code': '1', 'abbreviation': 'P'},
 'useName': 'Anthony',
 'boxscoreName': 'Gose',
 'mlbDebutDate': '2012-07-17',
 'nameFirstLast': 'Anthony Gose',
 'firstLastName': 'Anthony Gose',
 'lastFirstName': 'Gose, Anthony',
 'lastInitName': 'Gose, A',
 'initLastName': 'A Gose',
 'fullFMLName': 'Anthony Robert Gose',
 'fullLFMName': 'Gose, Anthony Robert'}

In [14]:
pid = statsapi.lookup_player(player)[0]['id']
statsapi.player_stat_data(pid)

{'id': 543238,
 'first_name': 'Anthony',
 'last_name': 'Gose',
 'active': True,
 'current_team': 'Cleveland Guardians',
 'position': 'P',
 'nickname': None,
 'last_played': None,
 'mlb_debut': '2012-07-17',
 'bat_side': 'Left',
 'pitch_hand': 'Left',
 'stats': [{'type': 'season',
   'group': 'fielding',
   'season': '2022',
   'stats': {'assists': 0,
    'putOuts': 1,
    'errors': 0,
    'chances': 1,
    'fielding': '1.000',
    'position': {'code': '1',
     'name': 'Pitcher',
     'type': 'Pitcher',
     'abbreviation': 'P'},
    'rangeFactorPerGame': '0.10',
    'rangeFactorPer9Inn': '1.00',
    'innings': '9.1',
    'games': 10,
    'gamesStarted': 0,
    'doublePlays': 0,
    'triplePlays': 0,
    'throwingErrors': 0}},
  {'type': 'season',
   'group': 'pitching',
   'season': '2022',
   'stats': {'gamesPlayed': 10,
    'gamesStarted': 0,
    'groundOuts': 5,
    'airOuts': 9,
    'runs': 7,
    'doubles': 4,
    'triples': 0,
    'homeRuns': 2,
    'strikeOuts': 13,
    'baseOn

In [15]:
"""
what are the batting averages for some of the top players top player 
per team or top 3 players from dodgers, cubs, sox, angels, and a's
"""

"\nwhat are the batting averages for some of the top players top player \nper team or top 3 players from dodgers, cubs, sox, angels, and a's\n"

In [16]:
team_idxs = [] #wth
teams = []

for idx in range(0, 1000):
    try:
        idx = str(idx).zfill(3)
        data = statsapi.lookup_team(idx)[0]
        name = data['name'].lower()
        location = data['locationName'].lower()
        if 'dogder' in name or 'angel' in name or 'cub' in name or 'sox' in name or 'oakland' in location:
            teams.append(data)
        team_idxs.append(idx)
        if len(teams) == 6:
            break
    except Exception as exp:
        None

In [17]:
teams

[{'id': 108,
  'name': 'Los Angeles Angels',
  'teamCode': 'ana',
  'fileCode': 'ana',
  'teamName': 'Angels',
  'locationName': 'Anaheim',
  'shortName': 'LA Angels'},
 {'id': 111,
  'name': 'Boston Red Sox',
  'teamCode': 'bos',
  'fileCode': 'bos',
  'teamName': 'Red Sox',
  'locationName': 'Boston',
  'shortName': 'Boston'},
 {'id': 112,
  'name': 'Chicago Cubs',
  'teamCode': 'chn',
  'fileCode': 'chc',
  'teamName': 'Cubs',
  'locationName': 'Chicago',
  'shortName': 'Chi Cubs'},
 {'id': 119,
  'name': 'Los Angeles Dodgers',
  'teamCode': 'lan',
  'fileCode': 'la',
  'teamName': 'Dodgers',
  'locationName': 'Los Angeles',
  'shortName': 'LA Dodgers'},
 {'id': 133,
  'name': 'Oakland Athletics',
  'teamCode': 'oak',
  'fileCode': 'oak',
  'teamName': 'Athletics',
  'locationName': 'Oakland',
  'shortName': 'Oakland'},
 {'id': 145,
  'name': 'Chicago White Sox',
  'teamCode': 'cha',
  'fileCode': 'cws',
  'teamName': 'White Sox',
  'locationName': 'Chicago',
  'shortName': 'Chi Whi

In [18]:
statsapi.team_leaders(114, 'avg')

'Rank Name                 Value\n 1   Owen Miller          .333 \n 2   Steven Kwan          .298 \n 3   Jose Ramirez         .297 \n 4   Myles Straw          .248 \n 5   Amed Rosario         .240 \n 6   Franmil Reyes        .210 \n'

In [19]:
def get_tings(team_id, scategory):
    """
    it's ugly but it worx
    """
    leaders = statsapi.team_leaders(team_id, scategory)

    data = ["".join(item.split("\n")).strip(" ") for item in leaders.split("  ") if item != ""][2:]
    data

    idx = 0
    tings = {}
    while idx != len(data):
        tings[data[idx]] = data[idx + 1]
        idx += 2

    new_tings = {}

    for whatever in tings.items():
        new_tings[whatever[0]] = whatever[1].split(" ")[0]
        
    return new_tings

In [21]:
for team in teams:
    team_id = team['id']
    team['stolenBase_leaders'] = get_tings(team_id, 'stolenBases')
    team['avg_leaders'] = get_tings(team_id, 'avg')

IndexError: list index out of range

In [None]:
teams