# Intro to pandas DataFrame iteration

### Iterating with .iterrows()

In [1]:
import pandas as pd

baseball_df = pd.read_csv("../datasets/baseball_stats.csv")
pit_df = baseball_df.loc[baseball_df.Team == 'PIT']

In [2]:
# Iterate over pit_df and print each index variable and then each row
for i,row in pit_df.iterrows():
    print(i)
    print(row)
    print(type(row))

21
Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object
<class 'pandas.core.series.Series'>
51
Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
Name: 51, dtype: object
<class 'pandas.core.series.Series'>
81
Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
Playoffs     

In [3]:
# Print the row and type of each row
for row_tuple in pit_df.iterrows():
    print(row_tuple)
    print(type(row_tuple))

(21, Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object)
<class 'tuple'>
(51, Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
Name: 51, dtype: object)
<class 'tuple'>
(81, Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
Playoffs            0
RankSeason        NaN
R

### Run differentials with .iterrows()

In [4]:
giants_df = baseball_df[baseball_df['Team'] == 'SFG']

In [5]:
def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

In [6]:
# Create an empty list to store run differentials
run_diffs = []

# Write a for loop and collect runs allowed and runs scored for each row
for i,row in giants_df.iterrows():
    runs_scored = row['RS']
    runs_allowed = row['RA']
    
    # Use the provided function to calculate run_diff for each row
    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    # Append each run differential to the output list
    run_diffs.append(run_diff)

giants_df['RD'] = run_diffs
print(giants_df)

     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
24    SFG     NL  2012  718  649   94  0.327  0.397  0.269         1   
54    SFG     NL  2011  570  578   86  0.303  0.368  0.242         0   
84    SFG     NL  2010  697  583   92  0.321  0.408  0.257         1   
114   SFG     NL  2009  657  611   88  0.309  0.389  0.257         0   
144   SFG     NL  2008  640  759   72  0.321  0.382  0.262         0   
174   SFG     NL  2007  683  720   71  0.322  0.387  0.254         0   
204   SFG     NL  2006  746  790   76  0.324  0.422  0.259         0   
234   SFG     NL  2005  649  745   75  0.319  0.396  0.261         0   
265   SFG     NL  2004  850  770   91  0.357  0.438  0.270         0   
295   SFG     NL  2003  755  638  100  0.338  0.425  0.264         1   
325   SFG     NL  2002  783  616   95  0.344  0.442  0.267         1   
355   SFG     NL  2001  799  748   90  0.342  0.460  0.266         0   
385   SFG     NL  2000  925  747   97  0.362  0.472  0.278      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  giants_df['RD'] = run_diffs


# Another iterator method: .itertuples()

### Iterating with .itertuples()

In [7]:
# Import the dataset
rangers_df = baseball_df[baseball_df['Team'] == 'TEX']

In [8]:
# Loop over the DataFrame and print each row's Index, Year and Wins (W)
for row in rangers_df.itertuples():
  i = row.Index
  year = row.Year
  wins = row.W
  
  # Check if rangers made Playoffs (1 means yes; 0 means no)
  if row.Playoffs == 1:
    print(i, year, wins)

27 2012 93
57 2011 96
87 2010 90
418 1999 95
448 1998 88
504 1996 90


### Run differentials with .itertuples()

In [9]:
yankees_df = baseball_df[baseball_df['Team'] == 'NYY']

In [10]:
def calc_run_diff(runs_scored, runs_allowed):
    run_diff = runs_scored - runs_allowed
    return run_diff

In [11]:
run_diffs = []

# Loop over the DataFrame and calculate each row's run differential
for row in yankees_df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA

    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    run_diffs.append(run_diff)

# Append new column
yankees_df['RD'] = run_diffs
print(yankees_df)

     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
18    NYY     AL  2012  804  668   95  0.337  0.453  0.265         1   
48    NYY     AL  2011  867  657   97  0.343  0.444  0.263         1   
78    NYY     AL  2010  859  693   95  0.350  0.436  0.267         1   
108   NYY     AL  2009  915  753  103  0.362  0.478  0.283         1   
138   NYY     AL  2008  789  727   89  0.342  0.427  0.271         0   
168   NYY     AL  2007  968  777   94  0.366  0.463  0.290         1   
198   NYY     AL  2006  930  767   97  0.363  0.461  0.285         1   
228   NYY     AL  2005  886  789   95  0.355  0.450  0.276         1   
259   NYY     AL  2004  897  808  101  0.353  0.458  0.268         1   
289   NYY     AL  2003  877  716  101  0.356  0.453  0.271         1   
319   NYY     AL  2002  897  697  103  0.354  0.455  0.275         1   
349   NYY     AL  2001  804  713   95  0.334  0.435  0.267         1   
379   NYY     AL  2000  871  814   87  0.354  0.450  0.277      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yankees_df['RD'] = run_diffs


In [12]:
# to find the year where the Yankees Run Differnetial was the highest.
yankees_df.loc[yankees_df['RD'].idxmax()]

Team              NYY
League             AL
Year             1998
RS                965
RA                656
W                 114
OBP             0.364
SLG              0.46
BA              0.288
Playoffs            1
RankSeason        1.0
RankPlayoffs      1.0
G                 162
OOBP              NaN
OSLG              NaN
RD                309
Name: 439, dtype: object

# pandas alternative to looping

### Analyzing baseball stats with .apply()

In [13]:
rays_df = baseball_df[baseball_df['Team'] == 'TBR']
rays_df.set_index('Year', inplace=True)
rays_df.drop(['Team', 'League'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
def text_playoffs(num_playoffs): 
    if num_playoffs == 1:
        return 'Yes'
    else:
        return 'No' 

In [15]:
# Gather sum of all columns
stat_totals = rays_df.apply(sum, axis=0)
print(stat_totals)

RS              3783.000
RA              3265.000
W                458.000
OBP                1.655
SLG                2.060
BA                 1.254
Playoffs           3.000
RankSeason           NaN
RankPlayoffs         NaN
G                810.000
OOBP               1.543
OSLG               1.956
dtype: float64


In [16]:
# Gather total runs scored in all games per year
total_runs_scored = rays_df[['RS', 'RA']].apply(sum, axis=1)
print(total_runs_scored)

Year
2012    1274
2011    1321
2010    1451
2009    1557
2008    1445
dtype: int64


In [17]:
# Convert numeric playoffs to text by applying text_playoffs()
textual_playoffs = rays_df.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
print(textual_playoffs)

Year
2012     No
2011    Yes
2010    Yes
2009     No
2008    Yes
dtype: object


### Settle a debate with .apply()

In [18]:
dbacks_df = baseball_df[baseball_df['Team'] == 'ARI']

In [19]:
import numpy as np

def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

In [20]:
# Display the first five rows of the DataFrame
print(dbacks_df.head())

# Create a win percentage Series 
win_percs = dbacks_df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)
print(win_percs, '\n')

# Append a new column to dbacks_df
dbacks_df['WP'] = win_percs
print(dbacks_df, '\n')

# Display dbacks_df where WP is greater than 0.50
print(dbacks_df[dbacks_df['WP'] >= 0.50])

    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
0    ARI     NL  2012  734  688  81  0.328  0.418  0.259         0   
30   ARI     NL  2011  731  662  94  0.322  0.413  0.250         1   
60   ARI     NL  2010  713  836  65  0.325  0.416  0.250         0   
90   ARI     NL  2009  720  782  70  0.324  0.418  0.253         0   
120  ARI     NL  2008  720  706  82  0.327  0.415  0.251         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG  
0           NaN           NaN  162  0.317  0.415  
30          5.0           4.0  162  0.316  0.409  
60          NaN           NaN  162  0.340  0.448  
90          NaN           NaN  162  0.330  0.419  
120         NaN           NaN  162  0.318  0.398  
0      0.50
30     0.58
60     0.40
90     0.43
120    0.51
150    0.56
180    0.47
210    0.48
241    0.31
271    0.52
301    0.60
331    0.57
361    0.52
391    0.62
421    0.40
dtype: float64 

    Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs  \
0    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dbacks_df['WP'] = win_percs


In [21]:
# to find the year where the Diamondbacks Win Percentage was greater than 0.50.
dbacks_df.loc[dbacks_df['WP'] > 0.5]

Unnamed: 0,Team,League,Year,RS,RA,W,OBP,SLG,BA,Playoffs,RankSeason,RankPlayoffs,G,OOBP,OSLG,WP
30,ARI,NL,2011,731,662,94,0.322,0.413,0.25,1,5.0,4.0,162,0.316,0.409,0.58
120,ARI,NL,2008,720,706,82,0.327,0.415,0.251,0,,,162,0.318,0.398,0.51
150,ARI,NL,2007,712,732,90,0.321,0.413,0.25,1,3.0,3.0,162,0.334,0.42,0.56
271,ARI,NL,2003,717,685,84,0.33,0.417,0.263,0,,,162,0.322,0.388,0.52
301,ARI,NL,2002,819,674,98,0.346,0.423,0.267,1,4.0,4.0,162,0.305,0.397,0.6
331,ARI,NL,2001,818,677,92,0.341,0.442,0.267,1,5.0,1.0,162,0.311,0.404,0.57
361,ARI,NL,2000,792,754,85,0.333,0.429,0.265,0,,,162,0.326,0.424,0.52
391,ARI,NL,1999,908,676,100,0.347,0.459,0.277,1,2.0,4.0,162,0.32,0.402,0.62


# Optimal pandas iterating

### Replacing .iloc with underlying arrays

In [25]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

win_percs_list = []

for i in range(len(baseball_df)):
    row = baseball_df.iloc[i]

    wins = row['W']
    games_played = row['G']

    win_perc = calc_win_perc(wins, games_played)

    win_percs_list.append(win_perc)

baseball_df['WP'] = win_percs_list

In [26]:
# Use the W array and G array to calculate win percentages
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)

# Append a new column to baseball_df that stores all win percentages
baseball_df['WP'] = win_percs_np

print(baseball_df.head())

  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG    WP  
0           NaN  162  0.317  0.415  0.50  
1           5.0  162  0.306  0.378  0.58  
2           4.0  162  0.315  0.403  0.57  
3           NaN  162  0.331  0.428  0.43  
4           NaN  162  0.335  0.424  0.38  


In [35]:
# time the old method that uses .iloc vs the np array.
def iloc_method():
    for i in range(len(baseball_df)):
        row = baseball_df.iloc[i]
        wins = row['W']
        games_played = row['G']
        win_perc = calc_win_perc(wins, games_played)
        win_percs_list.append(win_perc)

print("Time pandas iloc method")
%timeit iloc_method()
print("Time np array method")
%timeit win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)

Time iloc method
143 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Time np array method
17.3 µs ± 1.01 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### Bringing it all together: Predict win percentage

In [32]:
def predict_win_perc(RS, RA):
    prediction = RS ** 2 / (RS ** 2 + RA ** 2)
    return np.round(prediction, 2)

In [33]:
win_perc_preds_loop = []

# Use a loop and .itertuples() to collect each row's predicted win percentage
for row in baseball_df.itertuples():
    runs_scored = row.RS
    runs_allowed = row.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = baseball_df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(baseball_df['RS'].values, baseball_df['RA'].values)
baseball_df['WP_preds'] = win_perc_preds_np
print(baseball_df.head())

  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG    WP  WP_preds  
0           NaN  162  0.317  0.415  0.50      0.53  
1           5.0  162  0.306  0.378  0.58      0.58  
2           4.0  162  0.315  0.403  0.57      0.50  
3           NaN  162  0.331  0.428  0.43      0.45  
4           NaN  162  0.335  0.424  0.38      0.39  


In [34]:
# timing all three solutions.

# move the loop into a function
def win_perc_preds_iter():
    for row in baseball_df.itertuples():
        runs_scored = row.RS
        runs_allowed = row.RA
        win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
        win_perc_preds_loop.append(win_perc_pred)

# Time each one.
print("Time pandas itertuples method")
%timeit win_perc_preds_iter()
print("Time pandas apply method")
%timeit win_perc_preds_apply
print("Time numpy array method")
%timeit win_perc_preds_np

Timing itertuples method
14.8 ms ± 152 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Timing apply method
16.9 ns ± 0.163 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
Timing numpy array method
17.6 ns ± 0.707 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
