# Yankees All-Star Team

## Import packages and create SQL connection

In [51]:
import pandas as pd
from pandas.io import sql
import MySQLdb
import numpy as np
from sqlalchemy import create_engine

In [52]:
db = MySQLdb.connect(host="localhost",    
                     user="root",         
                     passwd="Commercial549!",  
                     db="lahman2016") 

# Create dataframes from SQL tables

### Only keep players that played at least half the season. Would have evaluated players using WAR, but unfortunately, I do not have the required data.

### Batting

In [53]:
query = """
SELECT * 
FROM 
    lahman2016.batting 
WHERE 
    teamid = 'NYA' OR teamid = 'BLA'
HAVING 
g >= 81
"""

batting = sql.read_sql(query, con=db)

In [54]:
#some values being read through as blank character, not NaN

bat_cols = batting.columns[5:]

for i in bat_cols:
    batting[i] = batting[i].apply(lambda x: str(x).strip()).replace('', np.nan)
    batting[i].fillna(0, inplace=True)
    batting[i] = batting[i].astype(int)

### I 'scored' players based on their OBP, RBI's vs the max RBI's a Yankee has ever hit, and Runs scored vs the max Runs a Yankee has ever scored. It's not perfect, but I think it will give us a good understanding on the most productive Yankees

In [55]:
batting['OBP'] = (batting['H']+batting['HBP']+batting['BB']+batting['IBB'])/\
                 (batting['AB']+batting['HBP']+batting['BB']+batting['IBB']+batting['SF'])
    
batting['Score'] = (batting['OBP']*100)+((batting['RBI']/batting['RBI'].max())*100)+\
                   ((batting['R']/batting['R'].max())*100)

### Fielding

In [56]:
query = """
SELECT * 
FROM 
    lahman2016.fielding 
WHERE 
    teamid = 'NYA' OR teamid = 'BLA'
HAVING
    g >= 81
"""

fielding = sql.read_sql(query, con=db)

In [57]:
field_cols = fielding.columns[6:]

for j in field_cols:
    fielding[j] = fielding[j].apply(lambda x: str(x).strip()).replace('', np.nan)
    fielding[j].fillna(0, inplace=True)
    fielding[j] = fielding[j].astype(int)

In [58]:
fielding['fielding_percentage'] = (fielding['PO']+fielding['A'])/(fielding['PO']+fielding['A']+fielding['E'])

In [59]:
#group fielders so they are not duplicated
fielding = fielding.groupby(['playerID', 'yearID', 'POS']).sum().reset_index()

In [60]:
yankees = pd.merge(fielding, batting, how='left', on=['playerID','yearID'], suffixes=('_fielding','_batting'))

#join tables and fill null values
yankees = yankees[['playerID', 'yearID', 'POS', 'Score', 'fielding_percentage']].fillna(0)

## Create table in SQL. Doing this so I can query my results as I would in SQL

In [11]:
engine = create_engine('mysql://root:Commercial549!@localhost:3306/lahman2016')
yankees.to_sql(name='yankees_scores', con=engine, if_exists='fail', index=False, chunksize=1000)

## Lets select our team

#### First Baseman

In [61]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = '1B' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,1;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,gehrilo01,1927,1B,226.70328,0.991501


#### Second Baseman

In [62]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = '2B' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,1;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,canoro01,2010,2B,156.726995,0.996134


#### Third Baseman

In [63]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = '3B' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,1;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,rodrial01,2008,3B,154.867993,0.97006


#### Shortstop

In [46]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = 'SS' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,1;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,jeterde01,2006,SS,161.37844,0.97541


#### Three Outfielders - Had to use top 7 to get 3 unique players

In [49]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = 'OF' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,7;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,ruthba01,1928,OF,215.566084,0.975078
1,ruthba01,1926,OF,211.610445,0.978528
2,ruthba01,1923,OF,210.96041,0.973105
3,ruthba01,1929,OF,195.088593,0.983936
4,mantlmi01,1956,OF,192.108202,0.989583
5,mantlmi01,1961,OF,189.707238,0.983471
6,dimagjo01,1948,OF,185.997383,0.971861


#### Catcher - This result surprised me. I would have guessed Yogi Berra

In [47]:
query = """
SELECT 
    *
FROM
    lahman2016.yankees_scores
WHERE
    pos = 'C' AND fielding_percentage > '0.97'
ORDER BY score DESC , fielding_percentage DESC limit 1,1;"""

sql.read_sql(query, con=db)

Unnamed: 0,playerID,yearID,POS,Score,fielding_percentage
0,dickebi01,1937,C,163.115546,0.991014
