In [None]:
import babypandas as bpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from notebook.services.config import ConfigManager

# Store slideshow display settings under the "livereveal" config section
# (presumably read by the RISE slideshow extension — TODO confirm).
cm = ConfigManager()
cm.update(
    "livereveal", {
        'width': 1500,
        'height': 700,
        "scroll": True,  # NOTE(review): looks like this lets tall slides scroll — confirm
})

# DSC 10 Discussion Week 4
---

This notebook is quite long, so we might have to go over some parts quickly. 

The full solutions will be uploaded, so if you miss something, you can always go back and check.

# Practice With Merge

Let's create two tables to practice merge on

In [None]:
# Small practice table: one row per person, with columns name, age and city.
people = (
    bpd.DataFrame()
    .assign(name=["kyle", "jill", "cole", "alex"])
    .assign(age=[24, 22, 21, 24])
    .assign(city=["San Diego", "LA", "San Francisco", "Irvine"])
)
people

In [None]:
# Small practice table: one row per city, with the city's name and a popular food.
# Note that its key column is called `name`, the same label `people` uses for
# a person's name, so the two `name` columns mean different things.
cities = bpd.DataFrame().assign(
    name = ["San Diego", "LA", "San Francisco","Denver","New York"],
    popular_food  = ["California Burrito", "Tacos", "Sourdough", "Denver Omelette", "Cheesecake"]
    )
cities

## How to merge people with cities? How many rows will there be?

In [None]:
people

In [None]:
cities

Let's merge the tables using the column *name* on each side. How many rows will there be?

In [None]:
people.merge(...)

How many rows will there be? How will the common name columns be handled?

Let's merge the tables using the column *city* on the left, and *name* on the right.

In [None]:
people.merge(...)

Dropping the duplicate column

In [None]:
# Match each person's city to the city's name, then drop `name_y`: after the
# merge it holds the same values as `city`, so it is redundant.
(
    people
    .merge(cities, left_on="city", right_on="name")
    .drop(columns="name_y")
)

## Let's add in another table and try the merge with it

In [None]:
people

In [None]:
# Practice table keyed by age; age 21 deliberately appears in two rows.
birthdays = (
    bpd.DataFrame()
    .assign(age=[21, 21, 22, 24])
    .assign(importance=["Legal Drinking Age", "Officially an Adult", "Taylor Swift Song", "Kyle's Age"])
)
birthdays

In [None]:
# Both tables call the key column `age`, so a single `on=` names it for both sides.
people.merge(birthdays, on="age")

# Groupby & conditionals, loops on dataset

# Olympic Athletes
---

From kaggle user Randi H Griffin:
>This is a historical dataset on the modern Olympic Games, including all the Games from Athens 1896 to Rio 2016. I scraped this data from www.sports-reference.com in May 2018. The R code I used to scrape and wrangle the data is on GitHub. I recommend checking my kernel before starting your own analysis.
>
>Note that the Winter and Summer Games were held in the same year up until 1992. After that, they staggered them such that Winter Games occur on a four year cycle starting with 1994, then Summer in 1996, then Winter in 1998, and so on. A common mistake people make when analyzing this data is to assume that the Summer and Winter Games have always been staggered.
Content
>
>The file athlete_events.csv contains 271116 rows and 15 columns. Each row corresponds to an individual athlete competing in an individual Olympic event (athlete-events). The columns are:
>
>1. ID - Unique number for each athlete  
>2. Name - Athlete's name  
>3. Sex - M or F  
>4. Age - Integer  
>5. Height - In centimeters  
>6. Weight - In kilograms  
>7. Team - Team name  
>8. NOC - National Olympic Committee 3-letter code  
>9. Games - Year and season  
>10. Year - Integer  
>11. Season - Summer or Winter  
>12. City - Host city  
>13. Sport - Sport  
>14. Event - Event  
>15. Medal - Gold, Silver, Bronze, or NA  


In [None]:
# Load the Olympic athlete-events table from the local data folder.
data = bpd.read_csv("data/athlete_events.csv")
# Preview only the first three rows rather than rendering the whole table.
data.take(np.arange(3))
# Uncomment to inspect the column labels instead:
# data.columns

# Something familiar, let's assign points to each country
---

Let's say we're assigning points to each country based on the number of Golds, Silvers, and Bronzes they've won.

Each medal is worth the following number of points:

<pre>
  Gold    +5 pts
  Silver  +3 pts
  Bronze  +2 pts
  nan     0 pts
</pre>

Let's do it differently than we did in Homework 3.

Create a function medal_to_points(medal), that returns an integer point for a given medal string.

In [None]:
def medal_to_points(medal):
    """Return the integer point value for a medal string (exercise stub).

    Intended mapping: "Gold" -> 5, "Silver" -> 3, "Bronze" -> 2, and any other
    value -> 0. The body is left blank to be filled in.
    """
    ...

In [None]:
# medal_to_points("Gold")   returns 5
# medal_to_points("Bronze") returns 2
# medal_to_points("Arda")   returns 0

In [None]:
# Exercise skeleton: replace each `...` with a condition or a return value.
# Intended mapping: "Gold" -> 5, "Silver" -> 3, "Bronze" -> 2, anything else -> 0.
def medal_to_points(medal):
    if ...
        return ...
    elif ...
        return ...
    elif ...
        return 2  # Bronze
    else:
        return 0  # any other value, including a missing medal, scores nothing

In [None]:
print(medal_to_points("Gold"))
print(medal_to_points("Bronze"))
print(medal_to_points("Arda"))

In the design above, we would need an if statement for every type of medal we have. 

We can do a better version, with fewer if statements. (The function will be correct either way).

In [None]:
# Parallel lists: points[i] is the value awarded for medals[i].
medals = ["Gold", "Silver", "Bronze"]
points = [5, 3, 2]
# Pair the two lists up into a medal -> points lookup.
medal_dict = {medal_name: value for medal_name, value in zip(medals, points)}
medal_dict

In [None]:
def medal_to_points_v2(medal):
    """Dictionary-based variant of medal_to_points (exercise skeleton).

    Builds a medal -> points mapping and returns 0 for any medal that is not
    one of its keys. The `...` is the lookup for a known medal, left to fill in.
    """
    medals = ["Gold", "Silver", "Bronze"]
    points = [5, 3, 2]
    medal_dict = dict(zip(medals, points))
    
    # One membership test replaces the separate branch per medal type.
    if medal in medal_dict:
        return ...
    else:
        return 0

In [None]:
print(medal_to_points_v2("Gold"))
print(medal_to_points_v2("Bronze"))
print(medal_to_points_v2("Arda"))

Okay, now we need to apply this function to our table.

What does `apply` return again?  And how will we use what it returns?

In [None]:
# Keep only rows from 2010 onward so the table stays manageable
data_after = data[data.get("Year") >= 2010]

In [None]:
# Convert every entry of the Medal column to its point value.
points_column = data_after.get("Medal").apply(medal_to_points)
# Attach the result as a new Points column (data_after itself is not modified).
data_with_points = data_after.assign(Points = points_column)
# Preview rows 10 through 14.
data_with_points.take(np.arange(10,15))

We don't need all the data in the table, to find the countries with the highest points.

In [None]:
# Keep only the columns needed for scoring (NOC, Points), highest point values first
country_points = data_with_points.get(["NOC", "Points"]).sort_values(by = "Points", ascending = False)
country_points

Easy question: how do we find the total amount of points scored by each country?

In [None]:
# Group by country (NOC) and total each country's points: fill in the `...`
scores = ...
# Show the highest-scoring countries first
scores.sort_values(by = "Points", ascending=False)

# Something familiar, top 5 countries entries count?
---

Let's choose 5 countries and only work with their data. We should use the NOC column. We are looking to see how many entries these 5 countries have in total.

In [None]:
included_countries = ["USA", "CHN", "RUS", "GBR", "GER"]

We have already added points to the entire dataset based on the Medal placement, so let's just get our countries from that `data_with_points` table.

## Solution #1 with apply

In [None]:
def in_included(country): # function to use in apply()
    # Exercise blank: should tell whether `country` is one of `included_countries`.
    return ...
# Quick check: "USA" is in included_countries, "CAN" is not.
print(in_included("USA"))
print(in_included("CAN"))

In [None]:
# Solution #1
countries = data_with_points[...]
countries.take(np.arange(5))

Now that we have only the relevant countries, let's get the sum of their points

In [None]:
# Total the points for each country code, largest total first.
(
    countries
    .groupby("NOC")
    .sum()
    .get("Points")
    .sort_values(ascending=False)
)

## Solution #2 with merge

Let's create a one-column table containing only the country codes we want

In [None]:
inc_countries = bpd.DataFrame().assign(NOC = included_countries)
inc_countries

In [None]:
countries = data_with_points.merge(...)
countries.take(np.arange(5))

In [None]:
# Total the points for each country code, largest total first.
(
    countries
    .groupby("NOC")
    .sum()
    .get("Points")
    .sort_values(ascending=False)
)

## Groupby with multiple columns

For each country (among all countries), for each sport played by that country, count how many data points we have.

|NOC|Sport|# Entries|
|--------|--------|--------|
|USA|Basketball |10 |
|USA|Swimming |5 | 
|USA|Curling |0 |
|CAN|Swimming |0 | 
|CAN|Curling |6|
|...|... |...|

etc.

In [None]:
# Exercise skeleton: count the rows for every (NOC, Sport) pair.
# Replace each `...` with one chained method call.
( 
    data_with_points ...
                  ...
                  ...
                  .reset_index()  # presumably moves the group keys out of the index into columns
                  ... # NOC, Sport, ID
)

# Probability and Experiments

## Prob. Problem 1

Easy question, probability-wise: we throw a fair die (meaning all outcomes are equally likely). What is the probability that the resulting number is divisible by 3?

In [None]:
# probability = (# required outcomes) / (# all outcomes)
# required outcomes (faces divisible by 3): 3, 6
# all outcomes (faces of a six-sided die): 1, 2, 3, 4, 5, 6
...

How about a d20 (a 20-sided die, with sides ranging from 1 to 20)? Let's count it with Python:

In [None]:
# Faces of a d20: the integers 1 through 20 (the stop value of range is exclusive).
die_values = range(1, 20 + 1)
# Materialize the range as a list so every face is displayed.
list(die_values)

In [None]:
# Exercise skeleton: print and count the die faces that are divisible by 3.
# Fill in the loop header and the condition.
count = 0
for ...
    if ...
        print(side, end=', ')  # end=', ' keeps all matches on one output line
        count += 1
count

Let's find the probability according to our counts

In [None]:
# Number of equally likely outcomes (one per die face).
total_count = len(die_values)
# Exercise blank: turn the count of matching faces into a probability.
div3_probability = ...
div3_probability

For many problems we face, we can't plausibly count all the outcomes this way. 

In such cases we have to run experiments to estimate empirical probability values.

 Let's try to find the result we just obtained experimentally, without counting every possible outcome.

In [None]:
import random  # NOTE(review): not used in this cell; the sampling below goes through np.random

die_values = range(1,20 + 1)
def div_by3_experiment(runs):
        """Simulate die rolls and count the outcomes divisible by 3 (exercise skeleton).

        `runs` is presumably the number of simulated rolls: callers divide the
        returned count by `runs` to get an empirical probability. The two `...`
        blanks are the sampling arguments and the divisibility check.
        """
        samples = np.random.choice(...)
        relevant_outcomes = ...
        return sum(relevant_outcomes) 

In [None]:
div_by3_experiment(1)

In [None]:
div_by3_experiment(10)

In [None]:
div_by3_experiment(100)

In [None]:
# Empirical probability = matching outcomes / number of runs.
runs = 10_000
empirical_prob = div_by3_experiment(runs) / runs
empirical_prob

In [None]:
# Same experiment with 100x more runs, to compare against the 10_000-run estimate.
runs = 1_000_000
empirical_prob = div_by3_experiment(runs) / runs
empirical_prob

As you can see, we are getting closer and closer to the correct value of 0.3.

(In this case, we know the correct value by our precise counting from earlier).

## Prob. Problem 2

Different problem, we are rolling *two* D6 dice (each has 6 sides). 

What is the probability that the sum of the numbers will be even?

Let's first think mathematically and count the results. 

To have an even number, we should either add two even numbers, or two odd numbers.

Two possibilities:
* Both are even: 3 * 3 = 9 possibilities (each die shows 2, 4, or 6)
* Both are odd: 3 * 3 = 9 possibilities (each die shows 1, 3, or 5)


9 + 9 = 18 possibilities (half of the total 36)

Let's use python to confirm by counting

In [None]:
# Faces of a six-sided die: 1 through 6.
die_values = range(1,6+1)

def even_count(die_values):
    """Count ordered (die1, die2) pairs drawn from die_values (exercise skeleton).

    Intended to count the pairs whose sum is even. The two `...` blanks are
    the sum of the pair and the amount to add to the running count.
    """
    count = 0
    # The nested loops visit every ordered pair of faces exactly once.
    for die1 in die_values:
        for die2 in die_values:
            dice_sum = ...
            count += ...
    return count

even_count(die_values)

In [None]:
# Exercise blank: the number of equally likely (die1, die2) outcomes.
total_outcomes = ...
probability = even_count(die_values) / total_outcomes
probability

What if both dice were D20 (20 sides):

We are looking for sums that are even and greater than or equal to 30

In [None]:
# Exercise skeleton: count ordered (die1, die2) pairs whose sum is even and
# at least `lower_lim`. The `if ...` blank is presumably the lower-limit check.
def even_count_modified(die_values, lower_lim):
    count = 0
    for die1 in die_values:
        for die2 in die_values:
            dice_sum = die1 + die2
            if ...
                count += dice_sum % 2 == 0  # True adds 1, False adds 0
    return count

die_values = range(1, 20+1)  # faces of a d20: 1 through 20
lower_limit = 30
event_count = even_count_modified(die_values, lower_limit)
event_count

In [None]:
# Two dice, so the number of ordered outcomes is (number of faces) squared.
total_outcomes = len(die_values) ** 2
# Exercise blank: turn the event count into a probability.
probability = ...
probability

Let's try to get this result experimentally

The sum of the dice should be even, and larger than or equal to lower_lim

In [None]:
def even_exp(die_values, runs, lower_lim):
        """Simulate `runs` rolls of two dice and count the qualifying rolls (exercise skeleton).

        Each die is sampled independently, with replacement, from die_values.
        The `...` blank is intended to flag the rolls whose sum is even and at
        least `lower_lim`; the function returns how many rolls were flagged.
        """
        die1_samples = np.random.choice(die_values, runs, replace=True)
        die2_samples = np.random.choice(die_values, runs, replace=True)
        # Element-wise sum: entry i is the total of roll i.
        dice_sum = die1_samples + die2_samples
        relevant_outcomes = ...
        return sum(relevant_outcomes) 

In [None]:
# Empirical probability = qualifying rolls / total rolls.
runs = 1_000_000
e_count = even_exp(die_values, runs, lower_limit)
probability = e_count / runs
probability

That's pretty close!

# One last thing before we go...

Warning: Be careful of the difference between:
* logical: and, bitwise: &
* logical: or,  bitwise: |

Let's go over their difference

In [None]:
True and False

In [None]:
# `or` needs a single truth value for its left operand, and an array with more
# than one element cannot supply one, so this line raises ValueError.
# Catch the error and print its message so the demonstration is still visible
# without stopping a "Restart & Run All".
try:
    np.array([True, False, False]) or np.array([False, True, False])
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
np.array([True, False, False]) | np.array([False, True, False])

* Bitwise operators (`&`, `|`) compare two boolean arrays element by element
* Logical operators (`and`, `or`) compare two single boolean values (not arrays)