In [1]:
import mysql.connector
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

pd.set_option('precision', 2)

In [2]:
def connect_to_SQL():
    load_dotenv()
    username, password = os.getenv("USERNAME"), os.getenv("PASSWORD")
    conn = mysql.connector.connect(user=os.getenv("USERNAME"), password=os.getenv("PASSWORD"),
                                   host='127.0.0.1')
    engine = create_engine(f'mysql+pymysql://{username}:{password}@localhost/baseballStats_db')
    cursor = conn.cursor()
    return cursor, conn, engine

In [3]:
cursor, conn, engine = connect_to_SQL()
connection = engine.connect()

In [12]:
career_pitching_df = pd.read_sql('CareerPitchingStats', engine, index_col = ['PlayerID'])
player_bios = pd.read_sql('PlayerBios', engine, index_col='PlayerID')
player_names = pd.read_sql('PlayerNames', engine, index_col='PlayerID')
hall_of_fame = pd.read_sql('HallOfFame', engine, index_col = 'PlayerID')

In [16]:
career_pitching_df = career_pitching_stats.join(player_bios, on = ['PlayerID'])
career_pitching_df = career_pitching_df.join(player_names, on = ['PlayerID'])
career_pitching_df = career_pitching_df.join(hall_of_fame, on = ['PlayerID'])

# fill in NaN values for HOF
career_pitching_df = career_pitching_df.fillna(0)
career_pitching_df

Unnamed: 0_level_0,Games,GamesStarted,CompleteGames,Shutouts,GamesFinished,Saves,InningsPitched,Hits,BFP,Homeruns,...,debutDate,finalGameDate,bats,throws,CareerLength_Years,MonthsExtra,birthState,birthCountry,PlayerName,YearOfInduction
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aardd001,331,0,0,0,141,69,337,296,1475,41,...,2004-04-06,2015-08-23,R,R,12,4,Colorado,USA,David Aardsma,0.0
aased001,448,91,22,5,235,82,1109,1085,4730,89,...,1977-07-26,1990-10-03,R,R,14,2,California,USA,Don Aase,0.0
abadf001,384,6,0,0,97,2,330,309,1399,42,...,2010-07-28,2019-09-28,L,L,10,2,La Romana,Dominican Republic,Fernando Abad,0.0
abbog001,248,206,37,5,13,0,1286,1405,5508,162,...,1973-07-29,1984-08-08,R,R,11,0,Arkansas,USA,Glenn Abbott,0.0
abboj001,263,254,31,6,5,0,1674,1779,7211,154,...,1989-04-08,1999-07-21,L,L,11,3,Michigan,USA,Jim Abbott,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmk001,31,1,0,0,7,0,41,42,193,2,...,2019-03-31,2020-09-22,R,R,2,5,California,USA,Kyle Zimmer,0.0
zitob001,433,421,12,5,9,0,2576,2381,11001,282,...,2000-07-22,2015-09-30,L,L,16,2,Nevada,USA,Barry Zito,0.0
zumaj001,171,0,0,0,35,5,209,169,911,18,...,2006-04-03,2010-06-28,R,R,5,2,California,USA,Joel Zumaya,0.0
zuveg101,265,31,9,2,139,40,642,660,2746,56,...,1951-04-21,1959-06-15,R,R,9,1,Michigan,USA,George Zuverink,0.0


#### Average Playing Career Length For Pitchers

In [19]:
print('Average Career Length for Pitchers in our database was {:.2f} years'.format(career_pitching_df['CareerLength_Years'].mean()))

Average Career Length for Pitchers in our database was 6.79 years


#### Difference in dominant throwing hand between pitchers

In [29]:
dominant_hand_stats = dict(career_pitching_df['throws'].value_counts())

dominant_hand_dict = {'R': 'Righty', 'L': 'Lefty', 'B': 'Both'}

for key, value in dominant_hand_stats.items():
    print(str(dominant_hand_dict[key]) +':' ,value)

Righty: 3325
Lefty: 1321
Both: 1


In [49]:
def getDecade(series):
    debut_date, final_game = series['debutDate'], series['finalGameDate']
    avg_year = (final_game.year + debut_date.year)/2
    
    decade = round(avg_year/10)*10
    
    
    
    
    return decade


In [50]:
career_pitching_df_copy = career_pitching_df.copy()
career_pitching_df_copy['Decade Played In'] = career_pitching_df_copy.apply(getDecade, axis =1)
career_pitching_df_copy



Unnamed: 0_level_0,Games,GamesStarted,CompleteGames,Shutouts,GamesFinished,Saves,InningsPitched,Hits,BFP,Homeruns,...,finalGameDate,bats,throws,CareerLength_Years,MonthsExtra,birthState,birthCountry,PlayerName,YearOfInduction,Decade Played In
PlayerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aardd001,331,0,0,0,141,69,337,296,1475,41,...,2015-08-23,R,R,12,4,Colorado,USA,David Aardsma,0.0,2010
aased001,448,91,22,5,235,82,1109,1085,4730,89,...,1990-10-03,R,R,14,2,California,USA,Don Aase,0.0,1980
abadf001,384,6,0,0,97,2,330,309,1399,42,...,2019-09-28,L,L,10,2,La Romana,Dominican Republic,Fernando Abad,0.0,2010
abbog001,248,206,37,5,13,0,1286,1405,5508,162,...,1984-08-08,R,R,11,0,Arkansas,USA,Glenn Abbott,0.0,1980
abboj001,263,254,31,6,5,0,1674,1779,7211,154,...,1999-07-21,L,L,11,3,Michigan,USA,Jim Abbott,0.0,1990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zimmk001,31,1,0,0,7,0,41,42,193,2,...,2020-09-22,R,R,2,5,California,USA,Kyle Zimmer,0.0,2020
zitob001,433,421,12,5,9,0,2576,2381,11001,282,...,2015-09-30,L,L,16,2,Nevada,USA,Barry Zito,0.0,2010
zumaj001,171,0,0,0,35,5,209,169,911,18,...,2010-06-28,R,R,5,2,California,USA,Joel Zumaya,0.0,2010
zuveg101,265,31,9,2,139,40,642,660,2746,56,...,1959-06-15,R,R,9,1,Michigan,USA,George Zuverink,0.0,1960


In [52]:
decade_group_by = career_pitching_df_copy.groupby('Decade Played In')
decade_group_by.mean()

Unnamed: 0_level_0,Games,GamesStarted,CompleteGames,Shutouts,GamesFinished,Saves,InningsPitched,Hits,BFP,Homeruns,...,Wins,Losses,ERA,RunSupport,PW,AVGGamesPerYear,careerLength,CareerLength_Years,MonthsExtra,YearOfInduction
Decade Played In,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1950,216.32,115.34,45.57,8.05,54.57,11.8,973.49,957.71,4176.72,80.28,...,56.78,53.21,4.41,4.23,2.25,20.88,9.49,9.49,4.78,53.37
1960,178.61,71.5,19.64,4.31,53.43,13.1,643.9,624.01,2744.3,62.88,...,35.66,36.44,4.51,3.75,0.25,20.87,7.12,7.12,4.14,43.92
1970,196.99,78.08,19.85,4.91,58.09,15.0,699.63,657.72,2960.53,59.75,...,39.59,38.62,4.04,3.31,0.37,23.11,7.38,7.38,4.38,27.99
1980,207.88,84.79,17.57,4.07,62.19,16.16,745.99,734.34,3176.27,63.31,...,41.43,41.61,4.28,3.71,0.56,22.95,7.69,7.69,4.09,36.21
1990,201.62,64.38,7.79,2.17,62.31,19.47,584.52,576.5,2497.49,55.85,...,32.95,32.79,4.59,3.25,0.79,22.92,7.52,7.52,4.91,11.4
2000,201.14,55.2,2.91,0.94,53.56,14.18,495.5,510.42,2154.07,58.47,...,27.66,28.06,5.23,3.32,0.2,23.15,7.32,7.32,4.47,16.06
2010,188.22,51.52,1.58,0.68,44.62,11.23,447.01,440.81,1914.14,51.29,...,25.45,24.94,4.87,2.75,0.38,23.88,6.81,6.81,4.64,2.14
2020,104.61,24.9,0.32,0.17,24.19,6.42,221.66,211.61,945.72,28.21,...,12.41,12.34,4.83,2.68,0.34,21.32,4.38,4.38,3.65,0.0
