In [4]:
import babypandas as bpd
import numpy as np

In [37]:
df = bpd.read_csv('data/player_data.csv')
df

Unnamed: 0,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
0,James Harden,25,HOU,81,459,565,154,60,321,2217
1,Chris Paul,29,LAC,82,376,838,156,15,190,1564
2,Stephen Curry,26,GSW,80,341,619,163,16,249,1900
3,Anthony Davis,21,NOP,68,696,149,100,200,95,1656
4,DeAndre Jordan,26,LAC,82,1226,61,81,183,109,946
...,...,...,...,...,...,...,...,...,...,...
487,Adreian Payne,23,TOT,32,162,30,19,9,44,213
488,Ricky Ledo,22,TOT,17,36,19,6,1,26,90
489,Gary Harris,20,DEN,55,64,29,39,7,38,188
490,Zach LaVine,19,MIN,77,214,276,54,10,193,778


---

# Top Ten Table Patterns

## And some variations

Let's look at the most common patterns we have been using on tables. They are quite simple when you have a computer. 

However, for the exam, you really need to get familiar with them.

Best way to study: Study by writing code with pen and paper. Learn to check your code for logical and syntax errors, without the help of Python!

# 0) Get and Drop Columns

**Pattern**: `df.get(column_name)`

**Pattern**: `df.drop(columns = column_name)`

Where column_name is a string

In [29]:
df.get('Points')

0      2217
1      1564
2      1900
3      1656
4       946
       ... 
487     213
488      90
489     188
490     778
491     501
Name: Points, Length: 492, dtype: int64

In [30]:
df.get('Age')

0      25
1      29
2      26
3      21
4      26
       ..
487    23
488    22
489    20
490    19
491    24
Name: Age, Length: 492, dtype: int64

### Drop column "Age"

In [31]:
df_modified = df.drop("Age")

TypeError: drop() takes 1 positional argument but 2 were given

In [38]:
df_modified = df.drop(columns = "Age")
df_modified

Unnamed: 0,Name,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
0,James Harden,HOU,81,459,565,154,60,321,2217
1,Chris Paul,LAC,82,376,838,156,15,190,1564
2,Stephen Curry,GSW,80,341,619,163,16,249,1900
3,Anthony Davis,NOP,68,696,149,100,200,95,1656
4,DeAndre Jordan,LAC,82,1226,61,81,183,109,946
...,...,...,...,...,...,...,...,...,...
487,Adreian Payne,TOT,32,162,30,19,9,44,213
488,Ricky Ledo,TOT,17,36,19,6,1,26,90
489,Gary Harris,DEN,55,64,29,39,7,38,188
490,Zach LaVine,MIN,77,214,276,54,10,193,778


In [39]:
df_modified = df.drop(columns = ["Age", "Team", "Games"])
df_modified

Unnamed: 0,Name,Rebounds,Assists,Steals,Blocks,Turnovers,Points
0,James Harden,459,565,154,60,321,2217
1,Chris Paul,376,838,156,15,190,1564
2,Stephen Curry,341,619,163,16,249,1900
3,Anthony Davis,696,149,100,200,95,1656
4,DeAndre Jordan,1226,61,81,183,109,946
...,...,...,...,...,...,...,...
487,Adreian Payne,162,30,19,9,44,213
488,Ricky Ledo,36,19,6,1,26,90
489,Gary Harris,64,29,39,7,38,188
490,Zach LaVine,214,276,54,10,193,778


# 1) Get something by its label & index

**Example**: how many points did LeBron James have?

**Pattern**: `df.get(column_name).loc[row_label].`

### Getting data by its label

In [40]:
df = df.set_index('Name')

### Get the points of the player: LeBron James.

In [41]:
df.get('Points').loc['LeBron James']

1743

### Get the Games of the player: Chris Paul.

In [42]:
df.get('Games').loc['Chris Paul']

82

### Getting Multiple datapoints by their labels

Get the points of players James Harden, Stephen Curry, Adreian Payne

In [43]:
query_players = ["James Harden", "Stephen Curry", "Adreian Payne"]
df.get('Points').loc[query_players]

Name
James Harden     2217
Stephen Curry    1900
Adreian Payne     213
Name: Points, dtype: int64

# 2) Find the label with the largest/smallest value.

**Example**: Player with the most points?

**Pattern**: `df.sort_values(by = "Points").iloc[-1]`

**Pattern**: `df.sort_values(by = "Points", ascending = False).iloc[0]`

### According to score, get the point and name of best player

In [44]:
df.get("Points").sort_values()

Name
Malcolm Lee             0
Jerrelle Benimon        0
Ronny Turiaf            0
David Wear              0
Kalin Lucas             0
                     ... 
Damian Lillard       1720
LeBron James         1743
Russell Westbrook    1886
Stephen Curry        1900
James Harden         2217
Name: Points, Length: 492, dtype: int64

In [45]:
df = df.sort_values()

TypeError: sort_values() missing 1 required positional argument: 'by'

In [46]:
df = df.sort_values(by = "Points") # ascending

In [47]:
# Wrong
df.get('Points').iloc[0]

0

In [48]:
# Correct
df.get('Points').iloc[-1]

2217

In [49]:
df.index[-1]

'Malcolm Lee'

### Get the name and age of 5th oldest player

In [58]:
df

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Dante Exum,19,UTA,82,131,198,41,14,118,393
Bruno Caboclo,19,TOR,8,2,0,0,1,4,10
Jabari Parker,19,MIL,25,138,42,31,5,47,308
Zach LaVine,19,MIN,77,214,276,54,10,193,778
Noah Vonleh,19,CHO,25,86,4,4,9,11,83
...,...,...,...,...,...,...,...,...,...
Kenyon Martin,37,MIL,11,19,5,5,6,3,20
Andre Miller,38,TOT,81,153,284,32,6,104,355
Vince Carter,38,MEM,66,133,79,43,14,43,384
Tim Duncan,38,SAS,77,704,230,63,151,131,1070


In [59]:
df = df.sort_values(by = "Age") # ascending
df.get("Age").iloc[-5]

37

In [60]:
df.index[-5]

'Nazr Mohammed'

### Get the names & ages of the oldest 5 players

In [260]:
query_range = np.arange(0, 4+1)

(df
    .get('Age')
    .sort_values(ascending = False)
    .iloc[query_range]
)

Name
Kevin Garnett    38
Vince Carter     38
Andre Miller     38
Tim Duncan       38
Jason Terry      37
Name: Age, dtype: int64

### Get the name of the oldest player, who also has the most number of points.

In [61]:
df = df.sort_values(by = ["Age", "Points"]) 

In [62]:
df.index[-1]

'Tim Duncan'

### Get the age and points of this player

In [63]:
df.get("Age").iloc[-1]

38

In [64]:
df.get("Points").iloc[-1]

1070

---

# 3) Compute a statistic for a subset. Filter to get the subset.

**Example**: Players info for players with age >= 30

**Pattern**:

`bool_mask = df.get('Age') >= 30
df[bool_mask]
`

### Return a table containing entries for players with age >= 30

In [65]:
bool_mask = df.get('Age') >= 30
df[bool_mask]

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Lester Hudson,30,LAC,5,8,5,6,1,3,18
Cartier Martin,30,DET,23,20,11,3,1,5,36
Raymond Felton,30,DAL,29,26,41,11,4,18,108
Ryan Hollins,30,SAC,46,103,14,6,19,25,137
Nate Robinson,30,TOT,42,50,97,20,3,30,238
...,...,...,...,...,...,...,...,...,...
Paul Pierce,37,WAS,73,294,144,46,24,92,868
Kevin Garnett,38,TOT,47,311,77,46,17,46,323
Andre Miller,38,TOT,81,153,284,32,6,104,355
Vince Carter,38,MEM,66,133,79,43,14,43,384


### Get the mean points for players with age >= 30

In [263]:
bool_mask = df.get('Age') >= 30
df[bool_mask].get('Points').mean()

487.4642857142857

### Calculate the mean of points. Get the mean age of the players, who have scored higher than or equal to the mean points.

In [66]:
mean_points = df.get('Points').mean()
bool_mask = df.get('Points') >= mean_points
df[bool_mask].get('Age').mean()

26.854460093896712

# 4) Combining Conditions, Filtering and Getting Statistics

**Example**: Players with more than 600 assists and 100 steals

**Pattern**:

`mask1 = df.get('Assists') > 600
mask2 = df.get('Steals') > 100
bool_mask = mask1 & mask2
df[bool_mask]
`

**Pattern**: Don't forget the parantheses if you write it like below:

`
df[(...) & (...) & (...)]
df[(df.get('Assists') > 600) & (df.get('Steals') > 100)]
`

### Filter the table, players who have more than 600 assists and more than 100 steals

In [67]:
mask1 = df.get('Assists') > 600
mask2 = df.get('Steals') > 100
bool_mask = mask1 & mask2
df[bool_mask]

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
John Wall,24,WAS,79,366,792,138,45,304,1387
Stephen Curry,26,GSW,80,341,619,163,16,249,1900
Chris Paul,29,LAC,82,376,838,156,15,190,1564


### How many players have more than 300 rebounds and more than 20 blocks?

In [75]:
mask1 = df.get('Rebounds') > 1000
mask2 = df.get('Blocks') > 100
bool_mask = mask1 & mask2
df[bool_mask].shape[0]

2

In [76]:
### Who are these players?

In [77]:
bool_mask = mask1 & mask2
df[bool_mask].index

Index(['Andre Drummond', 'DeAndre Jordan'], dtype='object', name='Name')

### Calculate the median age of a subset of the players.

This subset includes, the players who scored higher than the mean points, and played more than 40 games.

In [78]:
mean_points = df.get('Points').mean()
mask1 = df.get('Points') >= mean_points
mask2 = df.get('Games') > 40
bool_mask = mask1 & mask2
df[bool_mask].get('Age').median()

27.0

# 5) Compute statistics for a group. 

**Pattern**:

`df.groupby(column_name).func()
`
Where func is the aggrageting function

### Get the total score sum for each team.

In [80]:
df.groupby('Team').sum()

Unnamed: 0_level_0,Age,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ATL,383,857,3308,2103,739,378,1098,8378
BOS,243,625,2666,1393,484,229,796,5815
BRK,410,803,3005,1600,495,316,1004,7335
CHI,396,821,3751,1781,514,476,1080,8265
CHO,359,768,3435,1406,464,440,784,6766
...,...,...,...,...,...,...,...,...
SAS,434,931,3511,1991,649,439,1084,8350
TOR,389,840,3407,1701,615,357,1000,8527
TOT,2001,3939,13093,8268,2691,1270,5017,32637
UTA,357,747,3166,1577,578,470,1052,6923


In [81]:
df.groupby('Team').sum().get(["Points"])

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
ATL,8378
BOS,5815
BRK,7335
CHI,8265
CHO,6766
...,...
SAS,8350
TOR,8527
TOT,32637
UTA,6923


### How many players does each team have?

In [82]:
df.groupby('Team').count().get(["Points"])

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
ATL,14
BOS,10
BRK,15
CHI,14
CHO,14
...,...
SAS,15
TOR,15
TOT,76
UTA,15


### Get the mean number of games & mean points for each team

In [290]:
df.groupby('Team').mean().get(["Games", "Points"])

Unnamed: 0_level_0,Games,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
ATL,61.214286,598.428571
BOS,62.500000,581.500000
BRK,53.533333,489.000000
CHI,58.642857,590.357143
CHO,54.857143,483.285714
...,...,...
SAS,62.066667,556.666667
TOR,56.000000,568.466667
TOT,51.828947,429.434211
UTA,49.800000,461.533333


### Get the sum of Points per Game for each team

In [90]:
new_col = df.get("Points") / df.get("Games")
df_new = (df
    .assign(Points_Per_Game = new_col)
    .sort_values(by = "Points_Per_Game",ascending = False)
)
df_new

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points,Points_Per_Game
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Russell Westbrook,26,OKC,67,488,574,140,14,293,1886,28.149254
James Harden,25,HOU,81,459,565,154,60,321,2217,27.370370
Kevin Durant,26,OKC,27,178,110,24,25,74,686,25.407407
LeBron James,30,CLE,69,416,511,109,49,272,1743,25.260870
Anthony Davis,21,NOP,68,696,149,100,200,95,1656,24.352941
...,...,...,...,...,...,...,...,...,...,...
Jerrelle Benimon,23,UTA,2,3,0,0,0,1,0,0.000000
Kalin Lucas,25,MEM,1,0,0,1,0,0,0,0.000000
Malcolm Lee,24,PHI,1,0,0,0,0,0,0,0.000000
Seth Curry,24,PHO,2,2,1,0,0,0,0,0.000000


In [91]:
df_new.groupby("Team").sum().get("Points_Per_Game")

Team
ATL    125.177634
BOS     83.481282
BRK    111.453547
CHI    124.249599
CHO    112.837391
          ...    
SAS    123.804924
TOR    120.076444
TOT    535.714733
UTA    112.119284
WAS    116.147623
Name: Points_Per_Game, Length: 31, dtype: float64

# 6) Apply function & Conditionals

**Pattern**: `df.get(a_column).apply(a_function)`

### Given a full name, write a function that finds how many words it has

In [97]:
def find_name_len(string):
    """ Finds how many words the name contains """
    return len(string.split())

In [98]:
find_name_len("Arda Cankat Bati")

3

In [99]:
find_name_len("Tony Montana")

2

In [100]:
df.reset_index()

Unnamed: 0,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
0,Bruno Caboclo,19,TOR,8,2,0,0,1,4,10
1,Noah Vonleh,19,CHO,25,86,4,4,9,11,83
2,James Young,19,BOS,31,42,13,8,2,5,105
3,Aaron Gordon,19,ORL,47,169,33,21,22,38,243
4,Jabari Parker,19,MIL,25,138,42,31,5,47,308
...,...,...,...,...,...,...,...,...,...,...
487,Paul Pierce,37,WAS,73,294,144,46,24,92,868
488,Kevin Garnett,38,TOT,47,311,77,46,17,46,323
489,Andre Miller,38,TOT,81,153,284,32,6,104,355
490,Vince Carter,38,MEM,66,133,79,43,14,43,384


In [101]:
### Apply your function to the Name column

In [96]:
df.reset_index().get("Name").apply(find_name_len)

0      2
1      2
2      2
3      2
4      2
      ..
487    2
488    2
489    2
490    2
491    2
Name: Name, Length: 492, dtype: int64

### Find the longest full name (by number of words in it)

In [103]:
( df
 .reset_index()
 .get("Name")
 .apply(find_name_len)
 .max()
)

4

### Write a function that assigns an age group to a given age
* age less than 20 --> "young"
* age less than 30 --> "mid"
* age more than or equal 30 --> "old"

I am not saying 30 is old! :)

In [105]:
def assign_age_group(age):
    if age <= 20:
        return "young"
    elif age <= 30:
        return "mid"
    else:
        return "old"

In [106]:
assign_age_group(19)

'young'

In [107]:
assign_age_group(35)

'old'

### Add a new column to the table, which shows the age group of each player

In [109]:
new_col = df.get("Age").apply(assign_age_group)
df_new = df.assign(Age_Group = new_col)
df_new

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points,Age_Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bruno Caboclo,19,TOR,8,2,0,0,1,4,10,young
Noah Vonleh,19,CHO,25,86,4,4,9,11,83,young
James Young,19,BOS,31,42,13,8,2,5,105,young
Aaron Gordon,19,ORL,47,169,33,21,22,38,243,young
Jabari Parker,19,MIL,25,138,42,31,5,47,308,young
...,...,...,...,...,...,...,...,...,...,...
Paul Pierce,37,WAS,73,294,144,46,24,92,868,old
Kevin Garnett,38,TOT,47,311,77,46,17,46,323,old
Andre Miller,38,TOT,81,153,284,32,6,104,355,old
Vince Carter,38,MEM,66,133,79,43,14,43,384,old


# 7) Groupby Multiple Columns and look at statistics


**Pattern**:

`df.groupby([column_name1, column_name2]).func()
`
Where func is the aggrageting function

* There should always be an aggregating function. Otherwise we just get a groupby object.
* Don't forget to use reset_index() after grouping

### Get the number of players in each team and in each age group

In [115]:
df_new.groupby(["Team", "Age_Group"])

<babypandas.bpd.DataFrameGroupBy at 0x7efc87185080>

In [116]:
(df_new
 .groupby(["Team", "Age_Group"])
 .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Team,Age_Group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ATL,mid,11,11,11,11,11,11,11,11
ATL,old,3,3,3,3,3,3,3,3
BOS,mid,7,7,7,7,7,7,7,7
BOS,old,1,1,1,1,1,1,1,1
BOS,young,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...
TOT,young,1,1,1,1,1,1,1,1
UTA,mid,14,14,14,14,14,14,14,14
UTA,young,1,1,1,1,1,1,1,1
WAS,mid,10,10,10,10,10,10,10,10


In [117]:
df_new

Unnamed: 0_level_0,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points,Age_Group
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bruno Caboclo,19,TOR,8,2,0,0,1,4,10,young
Noah Vonleh,19,CHO,25,86,4,4,9,11,83,young
James Young,19,BOS,31,42,13,8,2,5,105,young
Aaron Gordon,19,ORL,47,169,33,21,22,38,243,young
Jabari Parker,19,MIL,25,138,42,31,5,47,308,young
...,...,...,...,...,...,...,...,...,...,...
Paul Pierce,37,WAS,73,294,144,46,24,92,868,old
Kevin Garnett,38,TOT,47,311,77,46,17,46,323,old
Andre Miller,38,TOT,81,153,284,32,6,104,355,old
Vince Carter,38,MEM,66,133,79,43,14,43,384,old


In [123]:
df_groups = (df_new
 .groupby(["Team", "Age_Group"]) # When we groupby o=some columns, those columns become the index
 .count()
 .get(["Team", "Age_Group", "Games"])
)
df_groups

KeyError: "['Team' 'Age_Group'] not found in columns"

In [124]:
df_groups = (df_new
 .groupby(["Team", "Age_Group"])
 .count()
 .reset_index()
 .get(["Team", "Age_Group", "Games"])
)
df_groups

Unnamed: 0,Team,Age_Group,Games
0,ATL,mid,11
1,ATL,old,3
2,BOS,mid,7
3,BOS,old,1
4,BOS,young,2
...,...,...,...
70,TOT,young,1
71,UTA,mid,14
72,UTA,young,1
73,WAS,mid,10


# 8) Rename a column

**Pattern**: Store the column to be renamed, assign it to with a new column name, drop the column with the old name.

`new_col = df.get(old_column_name)
df = (df.assign(new_col_name = new_col).drop(columns = old_column_name))`

In [296]:
df_groups

Unnamed: 0,Team,Age_Group,Games
0,ATL,mid,11
1,ATL,old,3
2,BOS,mid,7
3,BOS,old,1
4,BOS,young,2
...,...,...,...
70,TOT,young,1
71,UTA,mid,14
72,UTA,young,1
73,WAS,mid,10


### Rename the Games column to Player_Count

In [125]:
new_col = df_groups.get("Games")
df_groups = (df_groups
             .assign(Player_Count = new_col)
             .drop(columns = "Games")
            )
df_groups

Unnamed: 0,Team,Age_Group,Player_Count
0,ATL,mid,11
1,ATL,old,3
2,BOS,mid,7
3,BOS,old,1
4,BOS,young,2
...,...,...,...
70,TOT,young,1
71,UTA,mid,14
72,UTA,young,1
73,WAS,mid,10


# 9) Get all rows containing a string.

**Example** Find all players who have the word "James" somewhere in their full name.

**Pattern**

`bool_mask = df.get(column_of_strings).str.contains('James')
df[bool_mask]
`

### Filter the table so that only players with the substring "James" in their full name remain.

In [127]:
bool_mask = df.get("Name").str.contains("James")
df[bool_mask]

Unnamed: 0,index,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
2,2,James Young,19,BOS,31,42,13,8,2,5,105
49,49,James McAdoo,22,GSW,15,37,2,5,9,6,62
161,161,James Ennis,24,MIA,62,176,48,25,17,39,312
226,226,James Harden,25,HOU,81,459,565,154,60,321,2217
291,291,James Johnson,27,TOR,70,257,95,54,70,79,554
349,349,Bernard James,29,DAL,16,39,4,2,15,6,44
403,403,LeBron James,30,CLE,69,416,511,109,49,272,1743
457,457,James Jones,34,CLE,57,62,24,13,8,13,250


### Filter the table so that only players with the substring "gg" in their full name remain.

In [128]:
bool_mask = df.get("Name").str.contains("gg")
df[bool_mask]

Unnamed: 0,index,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
7,7,Andrew Wiggins,19,MIN,82,374,170,86,50,177,1387
92,92,Reggie Bullock,23,TOT,36,50,8,10,4,8,69
179,179,Reggie Jackson,24,TOT,77,327,462,62,9,186,1117
311,311,Reggie Williams,28,SAS,20,17,10,1,0,2,37
456,456,Reggie Evans,34,SAC,47,299,32,22,5,45,176


### Only players with the substring "Reg" and substring "ie" in their full name remain.

In [324]:
mask1 = df.get("Name").str.contains("ie")
mask2 = df.get("Name").str.contains("Reg")
df[mask1 & mask2]

Unnamed: 0,index,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
79,79,Reggie Jackson,24,TOT,77,327,462,62,9,186,1117
294,294,Reggie Evans,34,SAC,47,299,32,22,5,45,176
401,401,Reggie Williams,28,SAS,20,17,10,1,0,2,37
403,403,Reggie Bullock,23,TOT,36,50,8,10,4,8,69


### Why did we have just 9 points instead of 10?

### Because we had 0 indexing :)

# Top 8 Possible Pitfalls & Things to Keep in Mind

## 0) Difference between & and "and"

Always use "and" with conditionals, always use & with boolean arrays.

In [129]:
True and False

False

In [130]:
np.array([True, False, True]) & np.array([False, False, False])

array([False, False, False])

## 1) Parentheses when combining conditionals:

In [131]:
df[df.get("Age") >= 25 & df.get("Points") >= 2000]

TypeError: unsupported operand type(s) for &: 'int' and 'Series'

In [132]:
df[(df.get("Age") >= 25) & (df.get("Points") >= 1800)]

Unnamed: 0,index,Name,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
226,226,James Harden,25,HOU,81,459,565,154,60,321,2217
269,269,Russell Westbrook,26,OKC,67,488,574,140,14,293,1886
270,270,Stephen Curry,26,GSW,80,341,619,163,16,249,1900


## 2) Column names are meaningless after a `groupby` and count!

In [133]:
df.groupby("Team").count()

Unnamed: 0_level_0,index,Name,Age,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ATL,14,14,14,14,14,14,14,14,14,14
BOS,10,10,10,10,10,10,10,10,10,10
BRK,15,15,15,15,15,15,15,15,15,15
CHI,14,14,14,14,14,14,14,14,14,14
CHO,14,14,14,14,14,14,14,14,14,14
...,...,...,...,...,...,...,...,...,...,...
SAS,15,15,15,15,15,15,15,15,15,15
TOR,15,15,15,15,15,15,15,15,15,15
TOT,76,76,76,76,76,76,76,76,76,76
UTA,15,15,15,15,15,15,15,15,15,15


In [134]:
# Has no relation to the actual "Steals" and "Blocks" columns
df.groupby("Team").count().get(["Steals", "Blocks"])

Unnamed: 0_level_0,Steals,Blocks
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
ATL,14,14
BOS,10,10
BRK,15,15
CHI,14,14
CHO,14,14
...,...,...
SAS,15,15
TOR,15,15
TOT,76,76
UTA,15,15


## 3) Reset index, especially after grouping with multiple columns.

In [135]:
(df_new
 .groupby(["Team", "Age_Group"])
 .count()
 .reset_index()
)

Unnamed: 0,Team,Age_Group,Age,Games,Rebounds,Assists,Steals,Blocks,Turnovers,Points
0,ATL,mid,11,11,11,11,11,11,11,11
1,ATL,old,3,3,3,3,3,3,3,3
2,BOS,mid,7,7,7,7,7,7,7,7
3,BOS,old,1,1,1,1,1,1,1,1
4,BOS,young,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...
70,TOT,young,1,1,1,1,1,1,1,1
71,UTA,mid,14,14,14,14,14,14,14,14
72,UTA,young,1,1,1,1,1,1,1,1
73,WAS,mid,10,10,10,10,10,10,10,10


## 4) `iloc[]` vs `loc[]` vs array indexing`[]`

In [136]:
# Before using loc, make sure of what type of index you have:
df.index

RangeIndex(start=0, stop=492, step=1)

In [137]:
df = df.set_index("Name")

In [138]:
df.get("Age").loc["Stephen Curry"]

26

In [139]:
df.get("Age").iloc[2]

19

In [140]:
df.index[2]

'James Young'

## 5) Not specifying column while sorting table

Wrong: `df = df.sort_values(ascending = False)` 

Correct: `df = df.sort_values(by = column_name, ascending = False)` 

## 6) Trying to get the index using .get() instead of .index

In [141]:
# df.get("Name")
df.index

Index(['Bruno Caboclo', 'Noah Vonleh', 'James Young', 'Aaron Gordon',
       'Jabari Parker', 'Dante Exum', 'Zach LaVine', 'Andrew Wiggins',
       'Julius Randle', 'Clint Capela',
       ...
       'Kenyon Martin', 'Nazr Mohammed', 'Pablo Prigioni', 'Jason Terry',
       'Manu Ginobili', 'Paul Pierce', 'Kevin Garnett', 'Andre Miller',
       'Vince Carter', 'Tim Duncan'],
      dtype='object', name='Name', length=492)

# 7) Using df.drop with missing argument

`df.drop(columns = column_name)` without columns, for example `df.drop(column_name)` is wrong.

In [142]:
# df.drop("Points")
df.drop(columns = "Points")

Unnamed: 0_level_0,index,Age,Team,Games,Rebounds,Assists,Steals,Blocks,Turnovers
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bruno Caboclo,0,19,TOR,8,2,0,0,1,4
Noah Vonleh,1,19,CHO,25,86,4,4,9,11
James Young,2,19,BOS,31,42,13,8,2,5
Aaron Gordon,3,19,ORL,47,169,33,21,22,38
Jabari Parker,4,19,MIL,25,138,42,31,5,47
...,...,...,...,...,...,...,...,...,...
Paul Pierce,487,37,WAS,73,294,144,46,24,92
Kevin Garnett,488,38,TOT,47,311,77,46,17,46
Andre Miller,489,38,TOT,81,153,284,32,6,104
Vince Carter,490,38,MEM,66,133,79,43,14,43
