# Week 05

## Setup

Include some helper functions and libraries

In [None]:
!wget -q https://github.com/DM-GY-9103-2024S-R/9103-utils/raw/main/src/io_utils.py

In [None]:
import matplotlib.pyplot as plt
import random

from io_utils import object_from_json_url

## Lists

### Create a list from sequence of numbers

<img src="./imgs/range.jpg" width="500px">

In [None]:
# Start from an empty list and add the numbers 0 through 9 one at a time.
# range(10) counts 0, 1, ..., 9 (the same as range(0, 10)); the stop value is not included.
list_from_sequence = []

for number in range(10):
  list_from_sequence.append(number)

print(list_from_sequence)

In [None]:
# Or, even better: hand the whole range to list() and skip the loop
first_ten = range(0, 10)
list_from_sequence = list(first_ten)
print(list_from_sequence)

### List functions

Members of each `list` object.

<img src="./imgs/lists_00.jpg" width="500px">

### Create a list of 10 random numbers between 0 and 2000

In [None]:
# Collect 10 random integers; randint() includes both endpoints (0 and 2000)
list_of_randoms = []

for _ in range(10):
  list_of_randoms.append(random.randint(0, 2000))

print(list_of_randoms, "length: ", len(list_of_randoms))

### Addition

Besides the `append()` function we can also add elements to a list by using the `+` operator to *concatenate* two lists.

And just like addition on numbers we have to assign the result to a variable in order to use it later:

In [None]:
# list + list builds a brand new list, so the result is stored back in the variable
one_more = [random.randint(0, 2000)]
list_of_randoms = list_of_randoms + one_more
print(list_of_randoms, "length: ", len(list_of_randoms))

# Or, with the += shortcut, which adds the new elements to the end of the list
two_more = [random.randint(0, 2000), random.randint(0, 2000)]
list_of_randoms += two_more
print(list_of_randoms, "length: ", len(list_of_randoms))

### Slicing

Python has a built-in mechanism for getting sub-sections of a list called *slicing*.

Instead of a single index, we specify two values in the square bracket, separated by a `:`, to specify where our slice starts and ends:

<img src="./imgs/slicing.jpg" width="700px">

One **VERY** important thing to remember is that the second index in the bracket is **NOT** included in the slice.

And, Python being Python, it tries to be smart and keep us from unnecessary typing:
- if the first index is blank, the slice will start at the first element 
- if the second index is blank, the slice will go until the end of the list

In [None]:
# The list
print(list_of_randoms)

# First 5 items (a blank start means "from the first element")
first_five = list_of_randoms[:5]
print(first_five)

# Last 5 items (a negative start counts from the end; a blank stop means "through the last element")
last_five = list_of_randoms[-5:]
print(last_five)

# The 5 items at indexes [3] - [7] (the stop index, 8, is not included)
print(list_of_randoms[3:8])

# The 5 items in the center: the middle index plus 2 items on each side
center_index = len(list_of_randoms) // 2
center_start = center_index - 2
center_stop = center_index + 3
print(list_of_randoms[center_start:center_stop])

### Functions on lists

These are functions that Python gives us to work on lists.

<img src="./imgs/lists_01.jpg" width="600px">

### There are functions for sorting, reversing and getting the length of a `list`:

In [None]:
# sorted() returns a new list in ascending order; list_of_randoms itself is left unchanged
my_sorted_list = sorted(list_of_randoms)
sorted_length = len(my_sorted_list)

print(my_sorted_list, "\nlength: ", sorted_length)

In [None]:
# Sort ascending first, then flip the order.
# reversed() gives back an iterator, so list() is used to turn it into a list.
ascending = sorted(list_of_randoms)
my_reversed_sorted_list = list(reversed(ascending))

print(my_reversed_sorted_list, "\nlength: ", len(my_reversed_sorted_list))

### Sort & Slice

With a sorted list we can use slice indexing to get its $5$ smallest and $5$ largest elements:

In [None]:
# In ascending order the smallest values sit at the front and the largest at the back
five_smallest = my_sorted_list[:5]
five_largest = my_sorted_list[-5:]

print(five_smallest)
print(five_largest)

## Objects/Dictionaries

### Creating objects

In [None]:
# A dictionary ("object") maps keys to values.
# The values here are a mix of types: a string, numbers, and lists.
my_info = {
  "name": "thiago",
  "id": 8114,
  "zip": 11001,
  "grades": [90, 80, 60],
  "attendance": [True, True, False, True, True],
  # keys are strings, so they may contain spaces
  "final grade": "A"
}

# A bare expression on the last line of a notebook cell displays its value
my_info

### Iterating over key/values

<img src="./imgs/objects.jpg" width="500px">

In [None]:
# .keys() yields each key of the dictionary, one per loop iteration
print("Iterate Keys:")
for name in my_info.keys():
  print("key:", name)

# .values() yields only the values, without their keys
print("\nIterate Values:")
for stored in my_info.values():
  print("value:", stored)

In [None]:
# .items() yields (key, value) pairs, unpacked here into two loop variables
print("Iterate Keys and Values:")
for name, stored in my_info.items():
  print("key", name, "has value", stored)

### Accessing values at specific keys

In [None]:
# Square brackets with a key look up the value stored under that key (here, the list of grades)
my_grades = my_info["grades"]
print(my_grades)

In [None]:
# Sum grades: sum() adds up every number in the list
sum_grades = sum(my_grades)
print(sum_grades)

### Average

<img src="./imgs/average_00.jpg" width="500px">

<img src="./imgs/average_01.jpg" width="500px">

In [None]:
# Average grade

my_grades = my_info["grades"]
print(my_grades)

# average = total of all the grades divided by how many grades there are
grades_total = sum(my_grades)
grades_count = len(my_grades)
my_grades_average = grades_total / grades_count
print(my_grades_average)

In [None]:
# Average attendance

my_attendance = my_info["attendance"]
print(my_attendance)

# sum() counts True as 1 and False as 0, so the average is the fraction of True values
true_count = sum(my_attendance)
my_attendance_average = true_count / len(my_attendance)
print(my_attendance_average)

### List of objects

Creating a list of 10 objects with random heights and Brooklyn zip codes.

In [None]:
# Build 10 objects, each with a random "height" and a random "zip"
my_data = []

for _ in range(10):
  # randint() includes both endpoints of each range
  my_data.append({
    "height": random.randint(60, 70),
    "zip": random.randint(11200, 11240)
  })

# Print one object per line
for entry in my_data:
  print(entry)

### Get list of all heights

In [None]:
# Pull the "height" value out of every object, keeping the original order.
# This one-line form is a list comprehension: it builds the list from a loop in a single expression.
heights = [entry["height"] for entry in my_data]

heights

### Sort by key values

For example, sort objects by zip code.

We could first get all the zip codes and then sort the new list:

In [None]:
# Collect just the zip codes, in the same order as the objects in my_data
zips = [entry["zip"] for entry in my_data]

print("original:")
print(zips)

# sorted() works directly here because zips is a plain list of numbers
by_zip = sorted(zips)

print("sorted:")
print(by_zip)

### But now we don't have the associated heights with each zip code.

We want to sort the list while keeping the objects together.

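It would be nice to be able to do something like this, just like with a `list` of numbers (but Python raises an error here, because it doesn't know how to compare two objects):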

In [None]:
# NOTE: my_data holds dictionaries, and Python does not define "<" between two dictionaries,
# so this call raises a TypeError instead of returning a sorted list.
by_zip = sorted(my_data)
print(by_zip)

### Sorting Objects

For lists of objects we have to tell python which values to compare to determine their order.

We do this by defining a key function.

<img src="./imgs/list_of_objects.jpg" width="620px">

In [None]:
def zipKey(A):
  """Key function for sorting/comparing objects: return the value stored under "zip" in A."""
  zip_code = A["zip"]
  return zip_code

# sorted() calls zipKey on every object and orders the objects by the values it returns,
# so each object stays whole (its height travels with its zip code)
by_zip = sorted(my_data, key=zipKey)

by_zip

In [None]:
# TODO: sort by height
def heightKey(A):
  """Key function for sorting/comparing objects: return the value stored under "height" in A."""
  height = A["height"]
  return height

# Same idea as sorting by zip code, but the key function now returns each object's height
by_height = sorted(my_data, key=heightKey)

by_height

### Min/Max functions also work with a key argument

In [None]:
# max() and min() accept the same kind of key function as sorted():
# the key function picks which value of each object gets compared.

# The object with the largest zip code
max_by_zip = max(my_data, key=zipKey)

print(max_by_zip)

# The object with the smallest height: compare by height (heightKey), not by zip code
min_by_height = min(my_data, key=heightKey)

print(min_by_height)

## Bigger Lists

In [None]:
# URL of the JSON file that holds the height data
HEIGHT_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024S-R/9103-utils/main/datasets/json/ansur_height.json"

# Use the object_from_json_url() function to get the contents
# from the json file into a Python object called "heights"
# NOTE: this replaces the "heights" list that was built from my_data earlier in the notebook

heights = object_from_json_url(HEIGHT_FILE)

# TODO: look at the data

# Answer the following:
#   - how many items ?
#   - what's the biggest height ?
#   - what's the smallest height ?
#   - what's the average height ?

In [None]:
# number of items (how many height values were loaded)
len(heights)

In [None]:
# smallest and largest, displayed together as a (smallest, largest) pair
min(heights), max(heights)

In [None]:
# average: the sum of all the values divided by how many values there are
sum(heights) / len(heights)

In [None]:
# Load the weight data from its JSON file
WEIGHT_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024S-R/9103-utils/main/datasets/json/ansur_weight.json"
weights = object_from_json_url(WEIGHT_FILE)

# TODO: look at the data

# Answer the following:
#   - how many items ?
#   - what's the biggest ?
#   - what's the smallest ?
#   - what's the average ?

# how many items ?
num_items = len(weights)

# what's the smallest / biggest ?
min_weight = min(weights)
max_weight = max(weights)

# what's the average ? (sum of all the values divided by how many values there are)
avg_weight = sum(weights) / num_items

# Display all four answers together
num_items, min_weight, max_weight, avg_weight

In [None]:
# Load the combined age/height/weight data from its JSON file
AHW_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024S-R/9103-utils/main/datasets/json/ansur_age_height_weight.json"
ahws = object_from_json_url(AHW_FILE)

# TODO: look at the data

# Answer the following:
#   - how many items ?
#   - how is this data different than the others ?
#   - what's the biggest age ?
#   - what's the smallest age ?
#   - what's the average age ?
#   - how do we access the height of a person ?
#   - how do we get all of the heights ?

# how many items ? (this reuses, and overwrites, the num_items variable from the weights cell)
num_items = len(ahws)

# age statistics
def ageKey(A):
  """Key function for sorting records: return the first item of A, which this cell treats as the age."""
  age = A[0]
  return age

# Sorting by age puts the youngest record first and the oldest record last
sorted_age = sorted(ahws, key=ageKey)

youngest_record = sorted_age[0]
oldest_record = sorted_age[-1]
min_age = youngest_record[0]
max_age = oldest_record[0]

# all ages: item [0] of every record
ages = [record[0] for record in ahws]

avg_age = sum(ages) / len(ages)

# all heights: item [1] of every record
# (this replaces the "heights" list loaded from the height-only file)
heights = [record[1] for record in ahws]

# Display: first 5 records, item count, age statistics, first person's height, first 3 heights
ahws[:5], num_items, min_age, max_age, avg_age, ahws[0][1], heights[:3]

## List of Lists

Just like we can put lists inside objects, we can also put lists inside lists.

If we want to get to a particular value we have to use $2$ indices instead of using just one:
`list[i][j]`

The first index tells Python which of the sub-lists we want, and the second specifies the item on that list.

<img src="./imgs/list_of_lists_00.jpg" width="700px">

<img src="./imgs/list_of_lists_01.jpg" width="700px">

Sometimes we'll refer to the first index as the row index and the second index as the column index.

That's because if we imagine our list of lists as a 2-dimensional matrix of numbers, the first index tells Python which row we want to access and the second tells which column:

<img src="./imgs/list_of_lists_02.jpg" width="700px">

<img src="./imgs/list_of_lists_03.jpg" width="700px">

### Datasets

We'll see this kind of structure a lot.

It's very common for datasets to be organized by rows/columns, where each column specifies a different *property* (or *feature*) and each row is a different *measurement* (or *record*) of those features.

In our example above, our dataset had $3$ *features* (age, height, weight), and one *record* per person.

<img src="./imgs/datasets_00.jpg" width="700px">

### JSON

It's also common to find datasets specified in the JSON format.

Instead of just being a list of lists with values, each *record* is an object that specifies the names and values of its *features*:

<img src="./imgs/datasets_01.jpg" width="700px">

There are advantages and disadvantages to each. We'll soon look at another way to organize datasets that will make it easier to go from one type to the other if we have to.

In [None]:
# Load the object (JSON-style) version of the age/height/weight data
AHW_OBJ_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024S-R/9103-utils/main/datasets/json/ansur_age_height_weight_object.json"
ahw_objs = object_from_json_url(AHW_OBJ_FILE)

# TODO: look at the data

# Answer the following:
#   - how is this data different than the others ?
#   - how many items ?
#   - how do we access the height of a person ?

# Display: the first 5 records, the number of records, and the first record's height.
# Here the height is looked up by key ("height") instead of by position.
ahw_objs[:5], len(ahw_objs), ahw_objs[0]["height"]

In [None]:
# more complete json version

ANSUR_OBJ_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024S-R/9103-utils/main/datasets/json/ansur.json"
ansur = object_from_json_url(ANSUR_OBJ_FILE)

# TODO: look at the data

# Answer:
#   - how many rows/records/items ?
#   - how many columns/features/properties ?
#   - what could be an interesting feature ?

# Peek at the first 3 records only
ansur[:3]

## Plots

We can use the [matplotlib](https://matplotlib.org/stable/api/pyplot_summary.html) library to visualize our data.

In [None]:
# With a single list, plot() uses each item's index as x and its value as y.
# 'bo' draws blue circle markers with no connecting line; markersize=2 keeps them small.
# NOTE: "heights" is whatever the most recently run cell assigned to it
# (the age/height/weight cell, if the notebook is run top to bottom).
plt.plot(heights, 'bo', markersize=2)
plt.show()

In [None]:
# TODO: plot weights

# Same plot style as the heights: one small blue circle per value, in list order
plt.plot(weights, 'bo', markersize=2)
plt.show()

In [None]:
# TODO: plot sorted heights

# Sorting first means the values are drawn from smallest to largest
sorted_heights = sorted(heights)
plt.plot(sorted_heights, 'bo', markersize=2)
plt.show()

## Histograms

In [None]:
# bins=20 splits the range of values into 20 equal-width intervals;
# each bar shows how many heights fall inside one interval
plt.hist(heights, bins=20)
plt.show()

## Correlation

Correlation is a measurement of how $2$ different variables (measurements) are related to each other.

<img src="./imgs/correlation.jpg" width="800px">

They can have *positive* or *direct* correlation, if an increase in one of the variables comes with an increase in the other.

They can have *negative* or *inverse* correlation if an increase in one of the variables is accompanied by a decrease in the other.

Or, there can be *weak* or *NO* correlation, if a change in one variable doesn't seem to be accompanied by a change in the other.

In [None]:
# TODO: get "columns" from the data above and plot scatter plot

# One list per feature ("column"), with values in the same order as the records
x = [person["height"] for person in ahw_objs]
y = [person["weight"] for person in ahw_objs]
z = [person["age"] for person in ahw_objs]

# Each entry describes one plot:
# (horizontal values, vertical values, horizontal label, vertical label, title)
scatter_plots = [
  (x, y, "height", "weight", "weight x height"),
  (z, x, "age", "height", "height x age"),
  (z, y, "age", "weight", "weight x age"),
]

for x_values, y_values, x_name, y_name, plot_title in scatter_plots:
  plt.scatter(x_values, y_values, marker='o')
  plt.xlabel(x_name)
  plt.ylabel(y_name)
  plt.title(plot_title)
  plt.show()