In [23]:
import pyspark # 3.2.0
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc, col, \
session_window, when
from pyspark.sql.types import StringType, TimestampType, \
StructType, StructField

import psutil # 5.6.3
import platform
import os
import multiprocessing

from time import asctime

import statsapi 

import boto3

In [2]:
# Memory size string passed to both spark.executor.memory and spark.driver.memory.
MAX_MEMORY = "15G"
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# meant as the gap duration for session_window -- confirm or remove.
sesh_duration = '20 minutes'

# Record roughly when this run started.
print(f"approximate start time : {asctime()}")

approximate start time : Sat May 21 20:55:33 2022


In [3]:
# Spark configuration for a local master using every available core ("local[*]").
# MAX_MEMORY is applied to both the executor and the driver memory settings.
# NOTE(review): the timeout values below carry no unit suffix, so Spark will
# apply each key's default time unit -- confirm the intended durations.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    .set('spark.executor.heartbeatInterval', 10000)
    .set('spark.network.timeout', 10000)
    .set("spark.core.connection.ack.wait.timeout", "3600")
    .set("spark.executor.memory", MAX_MEMORY)
    .set("spark.driver.memory", MAX_MEMORY)
)

In [4]:
%%time
# getOrCreate() returns the already-running context when there is one, so
# re-running this cell no longer fails because a SparkContext is already
# active in the kernel; on a fresh kernel it creates the context from `conf`.
sc = SparkContext.getOrCreate(conf=conf)
# Keep Spark's console logging at WARN and above.
sc.setLogLevel('WARN')

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/21 20:55:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


CPU times: user 35.1 ms, sys: 31.4 ms, total: 66.5 ms
Wall time: 7.81 s


In [5]:
print("="*40, "System Information", "="*40)
uname = platform.uname()
print(f"System: {uname.system}")
print(f"Release: {uname.release}")
print(f"Version: {uname.version}")
print(f"Machine: {uname.machine}")
print(f"Processor: {uname.processor}")
print(f"Platform: {platform.platform()}")
print(f"RAM: {str(round(psutil.virtual_memory().total / (1024.0 ** 3)))} GB")
print(f"Cores: {multiprocessing.cpu_count()}")

print("="*40, "Java, Python, Spark Versions", "="*30)

# java version
!java -version

# python version
print("-"*5)
!python --version

# spark version
print("-"*5)
print("spark version {}".format(sc.version))

System: Darwin
Release: 21.4.0
Version: Darwin Kernel Version 21.4.0: Fri Mar 18 00:45:05 PDT 2022; root:xnu-8020.101.4~15/RELEASE_X86_64
Machine: x86_64
Processor: i386
Platform: Darwin-21.4.0-x86_64-i386-64bit
RAM: 16 GB
Cores: 4
java version "1.8.0_311"
Java(TM) SE Runtime Environment (build 1.8.0_311-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)
-----
Python 3.7.4
-----
spark version 3.2.0


In [6]:
%%time
def init_spark():
    """
    Build (or fetch the existing) SparkSession for this notebook.

    The session is named "baseball_is_not_boring" and is configured from the
    notebook-level ``conf`` object.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    builder = SparkSession.builder.appName("baseball_is_not_boring")
    return builder.config(conf=conf).getOrCreate()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 9.06 µs


In [7]:
%%time
# Two-column schema: a nullable string and a nullable timestamp.
# Both fields were originally called 'name'; a duplicated field name makes
# lookups by column name ambiguous, so the timestamp field gets its own name.
# NOTE(review): 'timestamp' is the reviewer's choice -- confirm the intended
# column name against whatever data this schema is meant to describe.
schemer = StructType([
    StructField('name', StringType(), True),
    StructField('timestamp', TimestampType(), True),
])

CPU times: user 57 µs, sys: 66 µs, total: 123 µs
Wall time: 129 µs


In [8]:
# How many game records statsapi.schedule() returns when called with no arguments.
len(statsapi.schedule())

16

In [9]:
# Peek at the first game record to see which fields a schedule entry carries.
statsapi.schedule()[0]

{'game_id': 661289,
 'game_datetime': '2022-05-21T17:05:00Z',
 'game_date': '2022-05-21',
 'game_type': 'R',
 'status': 'Final',
 'away_name': 'Chicago White Sox',
 'home_name': 'New York Yankees',
 'away_id': 145,
 'home_id': 147,
 'doubleheader': 'N',
 'game_num': 1,
 'home_probable_pitcher': 'Nestor Cortes',
 'away_probable_pitcher': 'Dallas Keuchel',
 'home_pitcher_note': '',
 'away_pitcher_note': '',
 'away_score': 5,
 'home_score': 7,
 'current_inning': 9,
 'inning_state': 'Top',
 'venue_id': 3313,
 'venue_name': 'Yankee Stadium',
 'winning_team': 'New York Yankees',
 'losing_team': 'Chicago White Sox',
 'winning_pitcher': 'Nestor Cortes',
 'losing_pitcher': 'Dallas Keuchel',
 'save_pitcher': 'Clay Holmes',
 'summary': '2022-05-21 - Chicago White Sox (5) @ New York Yankees (7) (Final)'}

In [10]:
# Peek at the last game record. In the captured output this is a postponed
# game: its scores are the string '0' (not ints) and it has no winner/loser keys.
statsapi.schedule()[-1]

{'game_id': 663123,
 'game_datetime': '2022-05-21T22:10:00Z',
 'game_date': '2022-05-21',
 'game_type': 'R',
 'status': 'Postponed',
 'away_name': 'Detroit Tigers',
 'home_name': 'Cleveland Guardians',
 'away_id': 116,
 'home_id': 114,
 'doubleheader': 'N',
 'game_num': 1,
 'home_probable_pitcher': 'Shane Bieber',
 'away_probable_pitcher': 'Alex Faedo',
 'home_pitcher_note': '',
 'away_pitcher_note': '',
 'away_score': '0',
 'home_score': '0',
 'current_inning': '',
 'inning_state': '',
 'venue_id': 5,
 'venue_name': 'Progressive Field',
 'summary': '2022-05-21 - Detroit Tigers @ Cleveland Guardians (Postponed)'}

In [11]:
# Player name used as the search term for the lookup_player cells that follow.
player = 'Anthony Gose'

In [12]:
# Search for the player by name; the result is indexed like a list of match dicts.
data = statsapi.lookup_player(player)

In [13]:
# Show the first match returned by the player lookup.
data[0]

{'id': 543238,
 'fullName': 'Anthony Gose',
 'firstName': 'Anthony',
 'lastName': 'Gose',
 'primaryNumber': '26',
 'currentTeam': {'id': 114},
 'primaryPosition': {'code': '1', 'abbreviation': 'P'},
 'useName': 'Anthony',
 'boxscoreName': 'Gose',
 'mlbDebutDate': '2012-07-17',
 'nameFirstLast': 'Anthony Gose',
 'firstLastName': 'Anthony Gose',
 'lastFirstName': 'Gose, Anthony',
 'lastInitName': 'Gose, A',
 'initLastName': 'A Gose',
 'fullFMLName': 'Anthony Robert Gose',
 'fullLFMName': 'Gose, Anthony Robert'}

In [14]:
# Take the 'id' of the first lookup match, then fetch that player's stat data.
# NOTE(review): this repeats the lookup_player(player) call made in an earlier cell.
pid = statsapi.lookup_player(player)[0]['id']
statsapi.player_stat_data(pid)

{'id': 543238,
 'first_name': 'Anthony',
 'last_name': 'Gose',
 'active': True,
 'current_team': 'Cleveland Guardians',
 'position': 'P',
 'nickname': None,
 'last_played': None,
 'mlb_debut': '2012-07-17',
 'bat_side': 'Left',
 'pitch_hand': 'Left',
 'stats': [{'type': 'season',
   'group': 'pitching',
   'season': '2022',
   'stats': {'gamesPlayed': 11,
    'gamesStarted': 0,
    'groundOuts': 5,
    'airOuts': 11,
    'runs': 7,
    'doubles': 4,
    'triples': 0,
    'homeRuns': 2,
    'strikeOuts': 14,
    'baseOnBalls': 6,
    'intentionalWalks': 0,
    'hits': 8,
    'hitByPitch': 1,
    'avg': '.216',
    'atBats': 37,
    'obp': '.333',
    'slg': '.486',
    'ops': '.819',
    'caughtStealing': 0,
    'stolenBases': 0,
    'stolenBasePercentage': '.---',
    'groundIntoDoublePlay': 1,
    'numberOfPitches': 188,
    'era': '5.23',
    'inningsPitched': '10.1',
    'wins': 1,
    'losses': 0,
    'saves': 0,
    'saveOpportunities': 1,
    'holds': 0,
    'blownSaves': 1,
    

In [15]:
"""
What are the batting averages for some of the top players: the top player
per team, or the top 3 players from the Dodgers, Cubs, Sox, Angels, and A's?
"""

"\nwhat are the batting averages for some of the top players top player \nper team or top 3 players from dodgers, cubs, sox, angels, and a's\n"

In [16]:
# Probe statsapi.lookup_team with the zero-padded values "000".."999" and collect
# the teams of interest (Dodgers, Angels, Cubs, both Sox, Oakland).
team_idxs = []  # every lookup value that resolved to a usable team record
teams = []      # the matching team records this notebook goes on to analyse

for number in range(0, 1000):
    idx = str(number).zfill(3)
    try:
        data = statsapi.lookup_team(idx)[0]
        name = data['name'].lower()
        location = data['locationName'].lower()
    except (IndexError, KeyError):
        # Nothing matched this value, or the record lacks the fields we
        # filter on: expected for most values, so skip quietly.
        continue
    except Exception as exp:
        # Anything else is unexpected; report it instead of hiding it,
        # but keep scanning (best effort).
        print(f"lookup_team({idx!r}) failed: {exp!r}")
        continue

    # 'dodger' was previously misspelled 'dogder'; the Dodgers still matched
    # only because 'angel' is a substring of "los angeles dodgers".
    if 'dodger' in name or 'angel' in name or 'cub' in name or 'sox' in name or 'oakland' in location:
        teams.append(data)
    team_idxs.append(idx)
    # Stop as soon as the six expected teams have been found.
    if len(teams) == 6:
        break

In [17]:
# Inspect the team records selected by the search loop.
teams

[{'id': 108,
  'name': 'Los Angeles Angels',
  'teamCode': 'ana',
  'fileCode': 'ana',
  'teamName': 'Angels',
  'locationName': 'Anaheim',
  'shortName': 'LA Angels'},
 {'id': 111,
  'name': 'Boston Red Sox',
  'teamCode': 'bos',
  'fileCode': 'bos',
  'teamName': 'Red Sox',
  'locationName': 'Boston',
  'shortName': 'Boston'},
 {'id': 112,
  'name': 'Chicago Cubs',
  'teamCode': 'chn',
  'fileCode': 'chc',
  'teamName': 'Cubs',
  'locationName': 'Chicago',
  'shortName': 'Chi Cubs'},
 {'id': 119,
  'name': 'Los Angeles Dodgers',
  'teamCode': 'lan',
  'fileCode': 'la',
  'teamName': 'Dodgers',
  'locationName': 'Los Angeles',
  'shortName': 'LA Dodgers'},
 {'id': 133,
  'name': 'Oakland Athletics',
  'teamCode': 'oak',
  'fileCode': 'oak',
  'teamName': 'Athletics',
  'locationName': 'Oakland',
  'shortName': 'Oakland'},
 {'id': 145,
  'name': 'Chicago White Sox',
  'teamCode': 'cha',
  'fileCode': 'cws',
  'teamName': 'White Sox',
  'locationName': 'Chicago',
  'shortName': 'Chi Whi

In [18]:
# Raw team_leaders output for team 114 and the 'avg' category: a single string
# holding a fixed-width "Rank / Name / Value" table, one player per line.
statsapi.team_leaders(114, 'avg')

'Rank Name                 Value\n 1   Jose Ramirez         .287 \n 2   Owen Miller          .286 \n 3   Steven Kwan          .265 \n 4   Myles Straw          .254 \n 5   Amed Rosario         .242 \n 6   Franmil Reyes        .205 \n'

In [19]:
def get_tings(team_id, scategory):
    """
    Return a team's leaders for one stat category as ``{player name: value}``.

    ``statsapi.team_leaders`` returns one text table: a "Rank Name Value"
    header followed by one line per player, e.g. `` 1   Jose Ramirez   .287``.
    Each line is parsed on its own as ``<rank> <name words...> <value>``, so
    the result does not depend on column widths (which vary with the stat
    category and with the length of the player's name).

    Args:
        team_id: team id passed through to ``statsapi.team_leaders``.
        scategory: leader category passed through, e.g. ``'avg'``.

    Returns:
        dict mapping player name to the value exactly as printed in the table
        (a string such as ``'.287'``). Empty when the table has no player rows.
    """
    leaders = statsapi.team_leaders(team_id, scategory)

    new_tings = {}
    for line in leaders.splitlines():
        tokens = line.split()
        # A player row has at least rank, one name word and a value, and
        # starts with a numeric rank; this skips the header and blank lines.
        if len(tokens) < 3 or not tokens[0].isdigit():
            continue
        new_tings[" ".join(tokens[1:-1])] = tokens[-1]

    return new_tings

In [79]:
def intellect(team_id):
    """
    Return a team's stolen-base leaders as ``{player name: stolen bases}``.

    Calls ``statsapi.team_leaders(team_id, 'stolenBases')``, which returns one
    text table: a "Rank Name Value" header followed by one line per player.
    Each line is parsed on its own as ``<rank> <name words...> <value>``.

    Args:
        team_id: team id passed through to ``statsapi.team_leaders``.

    Returns:
        dict mapping player name to the value exactly as printed in the table
        (a string such as ``'6'``). Empty when the table lists no players.
    """
    results = statsapi.team_leaders(team_id, 'stolenBases')

    sB_leaders = {}
    for line in results.splitlines():
        tokens = line.split()
        # A player row has at least rank, one name word and a value, and
        # starts with a numeric rank; this skips the header and blank lines.
        if len(tokens) < 3 or not tokens[0].isdigit():
            continue
        sB_leaders[" ".join(tokens[1:-1])] = tokens[-1]
    return sB_leaders
    

In [80]:
# For every selected team, fetch two leaderboards and attach them to the team.
# This mutates the dicts inside `teams` in place, adding the keys
# 'stolenBase_leaders' and 'avg_leaders' (re-running overwrites both keys).
for team in teams:
    team_id = team['id']
    team['stolenBase_leaders'] = intellect(team_id)
    team['avg_leaders'] = get_tings(team_id, 'avg')

In [81]:
# Inspect the team records again, now carrying the two leaderboard dicts.
teams

[{'id': 108,
  'name': 'Los Angeles Angels',
  'teamCode': 'ana',
  'fileCode': 'ana',
  'teamName': 'Angels',
  'locationName': 'Anaheim',
  'shortName': 'LA Angels',
  'avg_leaders': {'Taylor Ward': '.370',
   'Mike Trout': '.313',
   'Brandon Marsh': '.270',
   'Shohei Ohtani': '.252',
   'Anthony Rendon': '.241',
   'Jared Walsh': '.239'},
  'stolenBase_leaders': {'Shohei Ohtani': '6',
   'Andrew Velazquez': '6',
   'Brandon Marsh': '3',
   'Tyler Wade': '3',
   'Anthony Rendon': '2',
   'Jo Adell': '1',
   'Jack Mayfield': '1',
   'Taylor Ward': '1'}},
 {'id': 111,
  'name': 'Boston Red Sox',
  'teamCode': 'bos',
  'fileCode': 'bos',
  'teamName': 'Red Sox',
  'locationName': 'Boston',
  'shortName': 'Boston',
  'avg_leaders': {'J.D. Martinez': '.349',
   'Rafael Devers': '.337',
   'Xander Bogaerts': '.331',
   'Trevor Story': '.226',
   'Alex Verdugo': '.214',
   'Jackie Bradley Jr.': '.193',
   'Enrique Hernandez': '.188'},
  'stolenBase_leaders': {'Trevor Story': '5',
   'Xand

In [26]:
# S3 resource handle; the next cell calls create_bucket and Bucket on it.
s3 = boto3.resource('s3')

In [28]:
# Create the bucket in us-west-1. Re-running this cell once the bucket exists
# would otherwise raise, so "already owned by you" is treated as success.
try:
    s3.create_bucket(Bucket='the-bucket-with-the-things', CreateBucketConfiguration={
        'LocationConstraint': 'us-west-1'})
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # The bucket is already ours; nothing to create.
    pass

# Display a handle to the bucket as the cell's output.
s3.Bucket(name='the-bucket-with-the-things')

In [34]:
# NOTE(review): this rebinds `s3` from the boto3 resource created earlier to a
# client object; the upload cell further down rebinds it to a resource again.
s3 = boto3.client('s3')
# Keep the list_buckets() response for the cell that prints the bucket names.
response = s3.list_buckets()

In [35]:
# Print the 'Name' of every entry in response['Buckets'], one per line,
# indented under a header line.
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  logs.radchicago.com
  radchicago.com
  the-bucket-with-the-things
  www.radchicago.com


In [91]:
# Upload a text snapshot of `teams` to S3 under a timestamped key,
# using an S3 resource created from a boto3 Session.

session = boto3.Session()

s3 = session.resource('s3')

# asctime() with the spaces removed, e.g. "SatMay2120:55:332022".
ts = asctime().replace(" ", "")

file_name = f"stats_{ts}.txt"

# Named `s3_object` rather than `object` so the builtin `object` is not
# shadowed for the rest of the kernel session.
s3_object = s3.Object('the-bucket-with-the-things', file_name)

# NOTE(review): str(teams) stores the Python repr of the list, not JSON --
# confirm that is the format readers of this file expect.
result = s3_object.put(Body=str(teams))

# Default to an empty dict so a missing 'ResponseMetadata' reports
# "not uploaded" instead of raising AttributeError on None.
res = result.get('ResponseMetadata', {})

if res.get('HTTPStatusCode') == 200:
    print('File Uploaded Successfully')
else:
    print('File Not Uploaded')

File Uploaded Successfully
