In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
import numpy as np
from numpy import float32
from sklearn.metrics import precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

## Loading the table

In [2]:
# Load the cleaned player table. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('Data/Players/cleaned_data_ver2.csv')

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call -- TODO confirm).
# errors='ignore' makes the cell idempotent: re-running it after the column
# has already been dropped no longer raises.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [4]:
# (rows, columns) of the table at this point in the notebook.
df.shape

(255997, 99)

In [5]:
# Frequency of each 'Rookie?' label; dropna=False also counts missing labels.
df['Rookie?'].value_counts(dropna=False)  # class imbalance

0    252834
1      3163
Name: Rookie?, dtype: int64

In [6]:
# List every column label; the positional slices used later when splitting
# pitchers and batters index into this ordering.
df.columns

Index([u'Rookie?', u'player_id', u'Name', u'Height', u'Weight', u'Drafted',
       u'Catcher', u'Centerfielder', u'Designated', u'First', u'Hitter',
       u'Leftfielder', u'N/A', u'Outfielder', u'Pitcher', u'Rightfielder',
       u'Second', u'Shortstop', u'Third', u'Bats_both', u'Bats_left',
       u'Bats_Right', u'Bats_unknown', u'Throws_left', u'Throws_right',
       u'Throws_unknown', u'Year', u'Age_batting', u'AgeDif_batting', u'Tm',
       u'Lg_batting', u'Lev', u'Aff_batting', u'G_batting', u'PA', u'AB',
       u'R_batting', u'H_batting', u'2B', u'3B', u'HR_batting', u'RBI',
       u'SB_batting', u'CS_batting', u'BB_batting', u'SO_batting', u'BA',
       u'OBP', u'SLG', u'OPS', u'TB', u'GDP', u'HBP_batting', u'SH', u'SF',
       u'IBB_batting', u'Age_pitching', u'AgeDif_pitching', u'Lg_pitching',
       u'Aff_pitching', u'W', u'L', u'ERA', u'RAvg', u'G_pitching', u'GS',
       u'GF', u'CG', u'SHO', u'SV', u'IP', u'H_pitching', u'R_pitching', u'ER',
       u'HR_pitching', u'BB_pi

## Splitting batters and pitchers

In [7]:
# Print the positional index of the columns that mark where each group of
# stats begins; the column slices used to split pitchers and batters are
# built from these positions.
# Every landmark has its own label. Previously both 'Lev' and 'Age_pitching'
# printed 'pitching stats at: ', so the two output lines could not be told
# apart.
landmark_labels = {
    'Age_batting': 'batting starts at: ',
    'Lev': 'Lev column has index of: ',
    'Age_pitching': 'pitching starts at: ',
    'Fielding_position': 'fielding starts at: ',
    'Tm': 'Tm column has index of: ',
}
for i, v in enumerate(df.columns):
    if v in landmark_labels:
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(landmark_labels[v] + str(i))

batting starts at: 27
Tm column has index of: 29
pitching stats at: 31
pitching stats at: 56
fielding starts at: 88


In [8]:
# Column labels kept for pitchers: every column except those at positions 27
# ('Age_batting'), 30 ('Lg_batting') and 32-55 (the columns between 'Lev' and
# 'Age_pitching').
# Index.difference is the documented replacement for the deprecated
# `Index - Index` set difference; newer pandas versions treat `-` on an Index
# as element-wise arithmetic. difference() sorts the resulting labels by
# default, so the result is not in file order.
# NOTE(review): position 28 ('AgeDif_batting') is NOT excluded -- confirm this
# is intentional rather than an off-by-one in the 27:28 slice.
pit_columns = (df.columns
               .difference(df.columns[27:28])
               .difference(df.columns[30:31])
               .difference(df.columns[32:56]))  # these are the pitchers

  if __name__ == '__main__':


In [9]:
# Column labels kept for batters: every column except positions 57-87, i.e.
# the columns after 'Age_pitching' (56) and before 'Fielding_position' (88).
# Index.difference is the documented replacement for the deprecated
# `Index - Index` set difference; it sorts the resulting labels by default.
# NOTE(review): position 56 ('Age_pitching') is kept for batters -- confirm
# this is intentional.
bat_columns = df.columns.difference(df.columns[57:88])  # these are the batters (positional players)

  if __name__ == '__main__':


In [10]:
# Rows whose 'Pitcher' flag is 1, restricted to the pitcher column set.
is_pitcher = df['Pitcher'] == 1
pit = df.loc[is_pitcher, pit_columns]

In [11]:
# Rows whose 'Pitcher' flag is 0, restricted to the batter column set.
is_batter = df['Pitcher'] == 0
bat = df.loc[is_batter, bat_columns]

In [12]:
# (rows, columns) of the pitcher subset.
pit.shape

(130153, 73)

In [13]:
# (rows, columns) of the batter subset.
bat.shape

(125844, 68)

### Saving to two different CSV files

In [16]:
# Write the batter subset to CSV. The path is relative, like the read_csv call
# that loads the source table, instead of an absolute path under one user's
# home directory, so the notebook is not tied to a single machine.
# NOTE(review): assumes the working directory is the repository root that
# contains 'Data/Players/' -- confirm.
# With to_csv defaults the DataFrame index is written as the first, unnamed
# column of the file.
bat.to_csv('Data/Players/batters.csv')

In [17]:
# Write the pitcher subset to CSV. The path is relative, like the read_csv call
# that loads the source table, instead of an absolute path under one user's
# home directory, so the notebook is not tied to a single machine.
# NOTE(review): assumes the working directory is the repository root that
# contains 'Data/Players/' -- confirm.
# With to_csv defaults the DataFrame index is written as the first, unnamed
# column of the file.
pit.to_csv('Data/Players/pitchers.csv')

------------------------------

In [14]:
# Shape of the table once rows with any of these six 'Lev' codes are removed.
excluded_levels = ['Fgn', 'Ind', 'FgW', 'Fal', 'FRk', 'WRk']
df[~df['Lev'].isin(excluded_levels)].shape

(193193, 99)

In [385]:
# Distinct values found in the 'Lev' column, in order of first appearance.
df['Lev'].unique()

array(['Fgn', 'Rk', 'A', 'A+', 'FgW', 'Ind', 'AA', 'AAA', 'A-', 'Fal',
       'FRk', 'WRk', 'MLB'], dtype=object)

In [384]:
# Preview the identifying columns. Only the first ten rows are shown rather
# than the whole frame, which keeps the rendered output small.
df[['Name', 'Year', 'Tm', 'Lev']].head(10)

Unnamed: 0,Name,Year,Tm,Lev
0,Frank Del Valle,2007,Industriales,Fgn
1,Frank Del Valle,2008,Metropolitanos,Fgn
2,Frank Del Valle,2011,Cubs,Rk
3,Frank Del Valle,2011,Peoria,A
4,Frank Del Valle,2011,Daytona,A+
5,Frank Del Valle,2012,Daytona,A+
6,Frank Del Valle,2013,Daytona,A+
7,Frank Del Valle,2013,Margarita,FgW
8,Frank Del Valle,2014,Wichita,Ind
9,Frank Del Valle,2014,Margarita,FgW


In [574]:
# Shape of the batter rows whose 'A' value is missing.
# NOTE(review): the recorded output reports 67 columns while bat.shape earlier
# in the notebook reports 68 -- the output looks stale; re-run to confirm.
missing_a = bat['A'].isnull()
bat[missing_a].shape

(2441, 67)

In [27]:
# Pitcher rows where 'SV' equals 1, showing the level and workload columns.
sv_is_one = pit['SV'] == 1
pit.loc[sv_is_one, ['Lev', 'W', 'SV', 'G_pitching', 'GS', 'IP']]

Unnamed: 0,Lev,W,SV,G_pitching,GS,IP
33,FRk,1.0,1.0,11.0,1.0,26.2
67,AAA,2.0,1.0,38.0,0.0,44.2
120,Rk,1.0,1.0,14.0,8.0,51.2
121,A+,3.0,1.0,26.0,10.0,79.2
161,A-,4.0,1.0,14.0,11.0,61.1
162,A,6.0,1.0,32.0,15.0,117.2
168,Rk,1.0,1.0,18.0,0.0,24.2
182,Ind,0.0,1.0,10.0,0.0,9.0
254,A+,9.0,1.0,29.0,7.0,94.0
259,AA,0.0,1.0,7.0,4.0,30.1
