In [None]:
import babypandas as bpd
import numpy as np

In [None]:
df = bpd.read_csv('data/player_data.csv')
df

---

# Top Ten Table Patterns

## And some variations

Let's look at the most common patterns we have been using on tables. They are quite simple when you have a computer. 

However, for the exam, you really need to get familiar with them.

Best way to study: Study by writing code with pen and paper. Learn to check your code for logical and syntax errors, without the help of Python!

# 0) Get and Drop Columns

**Pattern**: `df.get(column_name)`

**Pattern**: `df.drop(columns = column_name)`

Where column_name is a string

In [None]:
df.get('Points')

In [None]:
df.get('Age')

### Drop column "Age"

In [None]:
# Wrong: `columns =` is missing (see pitfall 7 at the end of this notebook).
# The corrected call goes in the next cell.
df_modified = df.drop("Age")

In [None]:
df_modified = ...
df_modified

### drop columns Age, Team and Games

In [None]:
df_modified = ...
df_modified

# 1) Get something by its label & index

**Example**: how many points did LeBron James have?

**Pattern**: `df.get(column_name).loc[row_label]`

### Getting data by its label

In [None]:
df = df.set_index('Name')

### Get the points of the player: LeBron James.

In [None]:
...

### Get the Games of the player: Chris Paul.

In [None]:
...

### Getting Multiple datapoints by their labels

Get the points of players James Harden, Stephen Curry, Adreian Payne

In [None]:
query_players = ["James Harden", "Stephen Curry", "Adreian Payne"]
...

# 2) Find the label with the largest/smallest value.

**Example**: Player with the most points?

**Pattern**: `df.sort_values(by = "Points").iloc[-1]`

**Pattern**: `df.sort_values(by = "Points", ascending = False).iloc[0]`

### Get the points and the name of the best player (the one with the most points)

In [None]:
df.get("Points").sort_values()

In [None]:
# Wrong: no column is given to sort by (see pitfall 5 below).
# A table needs `by = column_name`.
df = df.sort_values()

In [None]:
df = df.sort_values(by = "Points") # ascending

In [None]:
# Wrong: the table was sorted by "Points" in ascending order,
# so position 0 holds the smallest value, not the largest.
df.get('Points').iloc[0]

In [None]:
# Correct: after an ascending sort, the largest value sits at the last position.
df.get('Points').iloc[-1]

In [None]:
df.index[-1]

### Get the name and age of 5th oldest player

In [None]:
... # sorting

In [None]:
... # get the age

In [None]:
... # get the name

### Get the names & ages of the oldest 5 players

In [None]:
query_range = np.arange(0, 4+1)

(df
    .get('Age')
    .sort_values(ascending = False)
    .iloc[query_range]
)

### Get the name of the oldest player who also has the most points.

In [None]:
...

In [None]:
df.index[-1]

### Get the age and points of this player

In [None]:
df.get("Age").iloc[-1]

In [None]:
df.get("Points").iloc[-1]

---

# 3) Compute a statistic for a subset. Filter to get the subset.

**Example**: Information about the players with age >= 30, for example their mean points.

**Pattern**:

`bool_mask = df.get('Age') >= 30
df[bool_mask].get("Points").mean()
`

### Return a table containing entries for players with age >= 30

In [None]:
bool_mask = df.get('Age') >= 30
df[bool_mask]

### Get the mean points for players with age >= 30

In [None]:
bool_mask = df.get('Age') >= 30
df[bool_mask].get('Points').mean()

### Calculate the mean of the points. Then get the mean age of the players who scored at least the mean number of points.

In [None]:
...

# 4) Combining Conditions, Filtering and Getting Statistics

**Example**: Players with more than 600 assists and more than 100 steals

**Pattern**:

`mask1 = df.get('Assists') > 600
mask2 = df.get('Steals') > 100
bool_mask = mask1 & mask2
df[bool_mask]
`

**Pattern**: Don't forget the parentheses if you write it like below:

`
df[(...) & (...) & (...)]
df[(df.get('Assists') > 600) & (df.get('Steals') > 100)]
`

### Filter the table, players who have more than 600 assists and more than 100 steals

In [None]:
...
df[bool_mask]

### How many players have more than 1000 rebounds and more than 100 blocks?

In [None]:
mask1 = df.get('Rebounds') > 1000
mask2 = df.get('Blocks') > 100
bool_mask = mask1 & mask2
...

In [None]:
### Who are these players?

In [None]:
bool_mask = mask1 & mask2
df[bool_mask].index

### Calculate the median age of a subset of the players.

This subset includes the players who scored more than the mean number of points and played more than 40 games.

In [None]:
mean_points = ...
mask1 = ...
mask2 = ...
bool_mask = mask1 & mask2
df[bool_mask] #...

# 5) Compute statistics for a group. 

**Pattern**:

`df.groupby(column_name).func()
`
Where func is the aggregating function

### Get the total points sum for each team.

In [None]:
...

### How many players does each team have?

In [None]:
df.groupby('Team').count().get(["Points"])

### Get the mean number of games & mean points for each team

In [None]:
...

### Get the sum of Points per Game for each team

In [None]:
new_col = ... # Points per game for each player
df_new = (df
    .assign(Points_Per_Game = new_col)
    .sort_values(by = "Points_Per_Game",ascending = False)
)
df_new

In [None]:
# sum points per game for each team
...

# 6) Apply function & Conditionals

**Pattern**: `df.get(a_column).apply(a_function)`

### Given a full name, write a function that finds how many words it has

In [None]:
def find_name_len(string):
    """Find how many words the full name `string` contains.

    Exercise stub: the body is still the `...` placeholder and has to be
    filled in. Judging by the calls in the cells below, "Arda Cankat Bati"
    should presumably give 3 and "Tony Montana" should give 2.
    """
    ...

In [None]:
find_name_len("Arda Cankat Bati")

In [None]:
find_name_len("Tony Montana")

### Apply your function to the Name column

In [None]:
df_temp = df.reset_index()
df_temp ...

### Find the longest full name (by number of words in it)

In [None]:
( df
 .reset_index()
 .get("Name")
 .apply(find_name_len)
 .max()
)

### Write a function that assigns an age group to a given age
* age less than 20 --> "young"
* age less than 30 --> "mid"
* age more than or equal 30 --> "old"

I am not saying 30 is old! :)

In [None]:
def assign_age_group(age):
    """Assign an age group label to `age`.

    Exercise stub: the body is still the `...` placeholder and has to be
    filled in. Intended mapping, as listed in the exercise statement above
    (rules checked in this order):
      * age less than 20             --> "young"
      * age less than 30             --> "mid"
      * age more than or equal to 30 --> "old"
    """
    ...

In [None]:
assign_age_group(19)

In [None]:
assign_age_group(35)

### Add a new column to the table, which shows the age group of each player

In [None]:
new_col = ...
df_new = df.assign(Age_Group = new_col)
df_new

# 7) Groupby Multiple Columns and look at statistics


**Pattern**:

`df.groupby([column_name1, column_name2]).func()
`
Where func is the aggregating function

* There should always be an aggregating function. Otherwise we just get a groupby object.
* Don't forget to use reset_index() after grouping

### Get the number of players in each team and in each age group

In [None]:
(df_new # has the additional Age_Group column
 ...
 ...
)

In [None]:
# Get only the Team, Age_Group and Games columns of the result

In [None]:
# Wrong: after grouping by two columns, "Team" and "Age_Group" make up the
# index and are no longer columns, so asking .get() for them fails.
# The next cell fixes this by calling reset_index() before .get().
df_groups = (df_new
 .groupby(["Team", "Age_Group"])
 .count()
 .get(["Team", "Age_Group", "Games"])
)
df_groups

In [None]:
df_groups = (df_new
 .groupby(["Team", "Age_Group"])
 .count()
 .reset_index()
 .get(["Team", "Age_Group", "Games"])
)
df_groups

# 8) Rename a column

**Pattern**: Store the column to be renamed, assign it under the new column name, then drop the column with the old name.

`new_col = df.get(old_column_name)
df = (df.assign(new_col_name = new_col).drop(columns = old_column_name))`

In [None]:
df_groups

### Rename the Games column to Player_Count

In [None]:
save_col = ...
df_groups = ...

# 9) Get all rows containing a string.

**Example** Find all players who have the word "James" somewhere in their full name.

**Pattern**

`bool_mask = df.get(column_of_strings).str.contains('James')
df[bool_mask]
`

### Filter the table so that only players with the substring "James" in their full name remain.

In [None]:
...

### Filter the table so that only players with the substring "gg" in their full name remain.

In [None]:
...

### Only players with the substring "Reg" and substring "ie" in their full name remain.

In [None]:
...

### Why did we have just 9 points instead of 10?

### Because we had 0 indexing :)

# Top 8 Possible Pitfalls & Things to Keep in Mind

## 0) Difference between & and "and"

Always use "and" with conditionals, always use & with boolean arrays.

In [None]:
True and False

In [None]:
np.array([True, False, True]) & np.array([False, False, False])

## 1) Parentheses when combining conditionals:

In [None]:
# Wrong: `&` binds more tightly than `>=`, so without parentheses Python
# evaluates `25 & df.get("Points")` first instead of combining the two comparisons.
df[df.get("Age") >= 25 & df.get("Points") >= 2000]

In [None]:
# Correct: each comparison is wrapped in its own parentheses before combining with `&`.
df[(df.get("Age") >= 25) & (df.get("Points") >= 1800)]

## 2) Column names are meaningless after a `groupby` and count!

In [None]:
df.groupby("Team").count()

In [None]:
# Has no relation to the actual "Steals" and "Blocks" columns
df.groupby("Team").count().get(["Steals", "Blocks"])

## 3) Reset index, especially after grouping with multiple columns.

In [None]:
(df_new
 .groupby(["Team", "Age_Group"])
 .count()
 .reset_index()
)

## 4) `iloc[]` vs `loc[]` vs array indexing `[]`

In [None]:
# Before using loc, make sure of what type of index you have:
df.index

In [None]:
df = df.set_index("Name")

In [None]:
df.get("Age").loc["Stephen Curry"]

In [None]:
df.get("Age").iloc[2]

In [None]:
df.index[2]

## 5) Not specifying column while sorting table

Wrong: `df = df.sort_values(ascending = False)` 

Correct: `df = df.sort_values(by = column_name, ascending = False)` 

## 6) Trying to get the index using .get() instead of .index

In [None]:
# df.get("Name")
df.index

## 7) Using df.drop with missing argument

Correct: `df.drop(columns = column_name)`. Leaving out `columns =`, as in `df.drop(column_name)`, is wrong.

In [None]:
# df.drop("Points")
df.drop(columns = "Points")