# Class08 Examples

# Review of Tricky Table Operations

In [None]:
from datascience import *
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

## Load a table from a file

In [None]:
file_path = "data/BenJerryData.csv"
# read_table builds a new Table from the file, so call it on the Table class
# itself instead of first constructing an empty Table() that is thrown away.
ice_cream = Table.read_table(file_path)
ice_cream

## Running a statistic on a table column
A table is comprised of columns. Each column has a label and then an array of data. We use the .column() method to extract the data from a table column.

### Task: Find the average protein content

First pull out the data array.

In [None]:
protein = ice_cream.column('protein (g)')
protein

Once you have the array, you can use any of the numpy functions.

In [None]:
avg_protein = np.average(protein)
max_protein = np.max(protein)
min_protein = np.min(protein)

print(f" The max protein is {max_protein}, the min is {min_protein}, the average is {avg_protein} grams")

## Returning the top value of a column
### Task: Print the flavor with the highest calorie content

In [None]:
# Step 1: Sort the table
ice_cream_sorted = ice_cream.sort('Calories (kcal)', descending=True)
ice_cream_sorted

In [None]:
# Step 2: Extract the array for the column of interest
sorted_flavors = ice_cream_sorted.column('Flavor')
sorted_flavors

In [None]:
# Step 3: Pull out the first element of this array.
sorted_flavors.item(0)

In [None]:
# Note: These steps can be combined into one operation
# But honestly, I wouldn't chain operations until you really understand each step
ice_cream.sort('Calories (kcal)', descending=True).column('Flavor').item(0)

## Filtering rows in a table based on a condition
### Task: Create a table with only the flavors of ice cream that have less than 25 grams of fat.

In [None]:
# Need to filter? Use .where with the appropriate predicate
ice_cream.where('fat (g)', are.below(25))

## Adding a column to a table based on a mathematical combination of other columns
### Task: Add a column that is the ratio of protein to carbohydrates (I'm not saying this is nutritionally useful).
The steps are:
- Extract the data arrays from the columns
- Do the math
- Put the result in a new column

In [None]:
# Extract the arrays
protein = ice_cream.column('protein (g)')
carbs = ice_cream.column('carbohydrates (g)')

# Do the math
ratio = protein / carbs

# Create the new column
ice_cream_with_ratio = ice_cream.with_columns('protein/carbs', ratio)
ice_cream_with_ratio

In [None]:
# Again, the operations can be combined into a single line.
# You don't need to do this, but you should understand it.
# The new column is labelled 'protein/carbs' (no stray closing parenthesis).
ice_cream_with_ratio2 = ice_cream.with_columns('protein/carbs', ice_cream.column('protein (g)') / ice_cream.column('carbohydrates (g)'))
ice_cream_with_ratio2

**Bonus question: What are the units of the protein/carbs ratio?**

# Functions
Thousands of functions are built into the Python language, and still others can be loaded with the Python `import` command. This is very powerful and gives the Python language almost limitless capability. However, there are many times when a custom function is needed, and writing one is a very powerful way to automate repetitive data handling and analysis tasks in a reproducible manner. Functions take arguments given in parentheses *()* directly following the name. For instance, below is the built-in Python print function:

## Adding a table column based on a calculation using other columns

### Task: Add a column to the ice_cream table 

In [None]:
def double(x):
    """Return two times ``x``."""
    doubled = 2 * x
    return doubled

In [None]:
help(double)

In [None]:
def triple(xtra):
    """Return three times ``xtra``."""
    tripled = 3 * xtra
    return tripled

In [None]:
x = double(4)*triple(4)
x

# Applying functions to Tables

Suppose we were given a table of x1, y1, x2, y2 coordinate pairs.

In [None]:
from datascience import *
import numpy as np
%matplotlib inline

x1 = make_array(1, 3, 6, 0)
x2 = make_array(2, 4, 1, 4)
y1 = make_array(3, 5, 1, 0)
y2 = make_array(3, 2, 4, 3)
coordinate_pairs = Table().with_columns("x1", x1, "y1", y1, "x2", x2, "y2", y2)
coordinate_pairs

Now suppose we want to add a column that contains the distance between the (x1, y1) and (x2, y2) pairs.

$$ distance = \sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2} $$

FIRST: Create a function to calculate distance.

In [None]:
def distance(x1, y1, x2, y2):
    """
    Return the Euclidean distance between the points (x1, y1) and (x2, y2).

    The result is the square root of the sum of the squared differences
    in x and in y (the Pythagorean theorem). The arithmetic is plain
    subtraction, powers and np.sqrt, so the arguments may be single
    numbers or numpy arrays of matching length.
    """
    squared_dx = (x1 - x2) ** 2
    squared_dy = (y1 - y2) ** 2
    return np.sqrt(squared_dx + squared_dy)

In [None]:
# Test our function with a 3, 4, 5 triangle
x1 = 0
x2 = 4
y1 = 0
y2 = 3
distance(x1, y1, x2, y2) # result should be 5

#### We could do this the way we've learned before:

In [None]:
# Extract the data array for each coordinate column
X1 = coordinate_pairs.column('x1')
X2 = coordinate_pairs.column('x2')
Y1 = coordinate_pairs.column('y1')
Y2 = coordinate_pairs.column('y2')
# distance is defined as distance(x1, y1, x2, y2), so the first point's
# coordinates must come first, followed by the second point's coordinates.
dist = distance(X1, Y1, X2, Y2) # calculate distance
new_table = coordinate_pairs.with_columns("distance", dist) # add distance to Table
new_table


#### But it is more effective to use "apply" to compute the new Table column

In [None]:
# Let the table call `distance` for us, handing it the values from the four
# coordinate columns in the order the function expects: x1, y1, x2, y2.
row_distances = coordinate_pairs.apply(distance, 'x1', 'y1', 'x2', 'y2')

# Store the result in a column labelled "distance"
coordinate_pairs = coordinate_pairs.with_columns("distance", row_distances)

coordinate_pairs

# Visualize Table Data

### Scatter depicts relationship between two variables

In [None]:
coordinate_pairs.scatter('x1', 'y1')

## Histograms

In [None]:
# Instead of using real names, we will use a Python package that generates fake names.
# This is an excellent way to anonymize personal data.

!pip install faker

In [None]:
from faker import Faker
fake = Faker()
fake.name()

In [None]:
# Build an array of 28 fake names; the loop index is not needed, so use `_`.
names = np.array([fake.name() for _ in range(28)])
names

In [None]:
scores = make_array(
    1.00,
    3.50,
    2.50,
    4.00,
    6.00,
    5.50,
    4.00,
    2.50,
    2.50,
    2.00,
    1.00,
    3.50,
    4.50,
    3.00,
    1.00,
    4.50,
    5.00,
    6.00,
    4.00,
    5.00,
    3.50,
    4.00,
    3.00,
    2.00,
    3.00,
    6.00,
    4.00,
    4.00
)

scores

In [None]:
quiz2 = Table().with_columns("Name", names, "Score", scores)
quiz2

In [None]:
help(quiz2.stats)

In [None]:
quiz2.stats((np.min, np.max, np.mean, np.median, np.std))

In [None]:
quiz2.hist('Score')

## To understand the units on the vertical scale
Read: https://www.stat.berkeley.edu/~stark/SticiGui/Text/histograms.htm

# Understanding function documentation
http://www.data8.org/datascience/_autosummary/datascience.tables.Table.hist.html

In [None]:
# Try one of the options shown in the documentation.
quiz2.hist('Score', bins = [0, 1, 2, 3, 4, 5, 6])

## Bar plot

In [None]:
quiz2.bar('Name','Score')

## When we need to customize the plot we have to reach into the underlying matplotlib plotting library.

In [None]:
quiz2.bar('Name','Score', width=20)
ax = plt.gca()
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha='right')

In [None]:
quiz2.bar('Name','Score', width=20)
ax = plt.gca()
ax.set_xticks(np.arange(len(quiz2.column('Name'))))
ax.set_xticklabels(names, rotation=90, ha='right');

In [None]:
quiz2_sorted = quiz2.sort('Score', descending=True)
quiz2_sorted

In [None]:
quiz2_sorted.bar('Name','Score', width=20)

In [None]:
quiz2_sorted.bar('Name','Score', width=20)
# Keep the sorted labels in their own variable so the original `names` array
# (which matches the row order of the unsorted quiz2 table) is not overwritten.
sorted_names = quiz2_sorted.column('Name')
ax = plt.gca()
# One tick per bar of the table being plotted
ax.set_xticks(np.arange(len(sorted_names)))
ax.set_xticklabels(sorted_names, rotation=90, ha='right');