# 0. Load imports 

In [30]:
## imports
import pandas as pd
import numpy as np
import re

## print multiple things from same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## load data on 2020 crimes in DC
df = dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")

# 1. Questions: list comprehension

- In class example, why did we need the "courses" at the beginning of the list iteration
- How did the join syntax work in the example where we paste together offenses from same ward

In [37]:
## toy example

### pool of courses
all_courses = ["QSS20", "QSS17", "GOV10", "GOV4", "COSC1"]


## 1.1 Application 1: filtering to a smaller list

When we might use: have a lot of columns in a dataframe; want to filter to a smaller set using some pattern

In [32]:
### pull out ones that contain GOV in the string
gov_c = [course for course in all_courses
        if "GOV" in course]
gov_c # result

['GOV10', 'GOV4']

In [33]:
### showing that the "course" is just a placeholder/
### arbitrary interator
gov_c_alt = [x for x in all_courses if "GOV" in x]

gov_c == gov_c_alt

True

In [58]:
"GOV20"[:3]

'GOV'

## 1.2 Application two: keep all objects in the list but do some transformation

In [38]:
all_courses

## strip the numbers from the course names
courses_prefix = [x[:3] for x in all_courses]
courses_prefix # could then find unique elements


['QSS20', 'QSS17', 'GOV10', 'GOV4', 'COSC1']

['QSS', 'QSS', 'GOV', 'GOV', 'COS']

In [39]:
# Join all together example
" #:)# ".join(courses_prefix)

'QSS #:)# QSS #:)# GOV #:)# GOV #:)# COS'

#### Your turn: Using original list, add "dartmouth_" prefix to the course name

In [59]:
dart_course = ["dartmouth_" + course for course in all_courses]
dart_course

['dartmouth_QSS20',
 'dartmouth_QSS17',
 'dartmouth_GOV10',
 'dartmouth_GOV4',
 'dartmouth_COSC1']

In [68]:
b = "#dartmouth_".join(["#"] + all_courses)
b.split("#")

['',
 '',
 'dartmouth_QSS20',
 'dartmouth_QSS17',
 'dartmouth_GOV10',
 'dartmouth_GOV4',
 'dartmouth_COSC1']

## 1.3 Subsetting columns

Use list comprehension to filter to columns with id in the string. Then, create a new dataframe called df1 that contains only column heads with "id"

In [40]:
id_cols = [col for col in df.columns if "ID" in col]
id_cols

## Then, filter the data
df[id_cols]

['BID', 'OBJECTID', 'OCTO_RECORD_ID']

Unnamed: 0,BID,OBJECTID,OCTO_RECORD_ID
0,,499862475,
1,,499862478,
2,,499862479,
3,,499862481,
4,,499862484,
...,...,...,...
27927,,500401564,
27928,,500401587,
27929,,500401591,
27930,,500401595,


## 1.4 Comprehension for numbers

Here we compare two ways of creating a list of even numbers.

In [17]:
num_list = np.arange(10000)
num_list

array([   0,    1,    2, ..., 9997, 9998, 9999])

In [18]:
%%time
even_nums = [i for i in num_list if (i % 2) == 0]

CPU times: user 10.2 ms, sys: 1.72 ms, total: 12 ms
Wall time: 12.6 ms


In [45]:
~(num_list % 2).astype(bool)

array([ True, False,  True, ..., False,  True, False])

In [21]:
%%time
num_list[~(num_list % 2).astype(bool)]

CPU times: user 1.09 ms, sys: 318 µs, total: 1.41 ms
Wall time: 1.14 ms


array([   0,    2,    4, ..., 9994, 9996, 9998])

#### Your turn: Extract all numbers in num_list that end in 7

In [71]:
%%time
inc_7 = [num for num in num_list if str(num)[-1] == "7"]

CPU times: user 13.7 ms, sys: 486 µs, total: 14.2 ms
Wall time: 14.6 ms


In [74]:
%%time
seven_list = [num for num in num_list if num % 10 == 7]

CPU times: user 9.85 ms, sys: 472 µs, total: 10.3 ms
Wall time: 10.6 ms


In [77]:
%%time
num_list[ (num_list % 10) == 7 ]

CPU times: user 371 µs, sys: 96 µs, total: 467 µs
Wall time: 659 µs


array([   7,   17,   27,   37,   47,   57,   67,   77,   87,   97,  107,
        117,  127,  137,  147,  157,  167,  177,  187,  197,  207,  217,
        227,  237,  247,  257,  267,  277,  287,  297,  307,  317,  327,
        337,  347,  357,  367,  377,  387,  397,  407,  417,  427,  437,
        447,  457,  467,  477,  487,  497,  507,  517,  527,  537,  547,
        557,  567,  577,  587,  597,  607,  617,  627,  637,  647,  657,
        667,  677,  687,  697,  707,  717,  727,  737,  747,  757,  767,
        777,  787,  797,  807,  817,  827,  837,  847,  857,  867,  877,
        887,  897,  907,  917,  927,  937,  947,  957,  967,  977,  987,
        997, 1007, 1017, 1027, 1037, 1047, 1057, 1067, 1077, 1087, 1097,
       1107, 1117, 1127, 1137, 1147, 1157, 1167, 1177, 1187, 1197, 1207,
       1217, 1227, 1237, 1247, 1257, 1267, 1277, 1287, 1297, 1307, 1317,
       1327, 1337, 1347, 1357, 1367, 1377, 1387, 1397, 1407, 1417, 1427,
       1437, 1447, 1457, 1467, 1477, 1487, 1497, 15

#### Your turn: Divide each number  in num_list by 2

In [79]:
num_list_2 = [(x/2) for x in num_list]
num_list_2

[0.0,
 0.5,
 1.0,
 1.5,
 2.0,
 2.5,
 3.0,
 3.5,
 4.0,
 4.5,
 5.0,
 5.5,
 6.0,
 6.5,
 7.0,
 7.5,
 8.0,
 8.5,
 9.0,
 9.5,
 10.0,
 10.5,
 11.0,
 11.5,
 12.0,
 12.5,
 13.0,
 13.5,
 14.0,
 14.5,
 15.0,
 15.5,
 16.0,
 16.5,
 17.0,
 17.5,
 18.0,
 18.5,
 19.0,
 19.5,
 20.0,
 20.5,
 21.0,
 21.5,
 22.0,
 22.5,
 23.0,
 23.5,
 24.0,
 24.5,
 25.0,
 25.5,
 26.0,
 26.5,
 27.0,
 27.5,
 28.0,
 28.5,
 29.0,
 29.5,
 30.0,
 30.5,
 31.0,
 31.5,
 32.0,
 32.5,
 33.0,
 33.5,
 34.0,
 34.5,
 35.0,
 35.5,
 36.0,
 36.5,
 37.0,
 37.5,
 38.0,
 38.5,
 39.0,
 39.5,
 40.0,
 40.5,
 41.0,
 41.5,
 42.0,
 42.5,
 43.0,
 43.5,
 44.0,
 44.5,
 45.0,
 45.5,
 46.0,
 46.5,
 47.0,
 47.5,
 48.0,
 48.5,
 49.0,
 49.5,
 50.0,
 50.5,
 51.0,
 51.5,
 52.0,
 52.5,
 53.0,
 53.5,
 54.0,
 54.5,
 55.0,
 55.5,
 56.0,
 56.5,
 57.0,
 57.5,
 58.0,
 58.5,
 59.0,
 59.5,
 60.0,
 60.5,
 61.0,
 61.5,
 62.0,
 62.5,
 63.0,
 63.5,
 64.0,
 64.5,
 65.0,
 65.5,
 66.0,
 66.5,
 67.0,
 67.5,
 68.0,
 68.5,
 69.0,
 69.5,
 70.0,
 70.5,
 71.0,
 71.5,
 72.0,
 72.5

In [80]:
num_list / 2

array([0.0000e+00, 5.0000e-01, 1.0000e+00, ..., 4.9985e+03, 4.9990e+03,
       4.9995e+03])

# 2. Questions: lambda functions

Two questions:

- General syntax (see here for a reference: https://www.w3schools.com/python/python_lambda.asp 
- How they work in the context of aggregations

How is a lambda function different from a "normal" user-defined function (that has the syntax def func_name(arg): etc?

- Operates similarly to normal user-defined functions in that it can take any # of arguments
- Operates differently in that it's an "anonymous" function or a function that we don't explicitly name/save in memory

In [22]:
def f1(x,y):
    return x+y

f2 = lambda x, y: x+y

f1(2,1)
f2(2,1)

3

3

In [51]:
multiply = lambda x,y,z: x+y+z
multiply(2,3,4)

9

In [52]:
multiply("QSS","20","<3")

'QSS20<3'

In [None]:
def multiply(x,y)

## 2.1 General syntax for lambda functions

In [46]:
### two pools of courses
socsci = ["QSS20", "QSS17", "GOV10"]
natsci = ["BIO2", "PHYS3"]


## generalize some of the steps
## above into a two-arg function
## that takes the course prefix
## and a list of all courses
def filter_courses(prefix,all_courses):
    rel_courses = [c for c in all_courses if prefix in c]
    return(rel_courses)

### a few applications 
filter_courses(prefix = "QSS", all_courses = socsci)
filter_courses(prefix = "QSS", all_courses = natsci)
filter_courses(prefix = "BIO", all_courses = natsci)

['QSS20', 'QSS17']

[]

['BIO2']

In [47]:
## what's the lambda function version of this
filter_courses_v2 = lambda prefix, all_courses: [c for c in all_courses if prefix in c]
filter_courses_v2(prefix = "BIO", all_courses = natsci)


['BIO2']

## 2.2 using alongside agg

In [50]:
## use lambda to find modal block in a ward- multiple ways

### way 1: subsetting agg syntex
df.groupby("WARD")["BLOCK"].agg(lambda x: x.mode())

### way 2: dictionary agg syntax
df.groupby("WARD").agg({"BLOCK": lambda x: x.mode()})


WARD
1           3100 - 3299 BLOCK OF 14TH STREET NW
2    1300 - 1699 BLOCK OF CONNECTICUT AVENUE NW
3      5300 - 5399 BLOCK OF WISCONSIN AVENUE NW
4          100 - 199 BLOCK OF CARROLL STREET NW
5     900 - 999 BLOCK OF RHODE ISLAND AVENUE NE
6                600 - 699 BLOCK OF H STREET NE
7         934 - 1099 BLOCK OF EASTERN AVENUE NE
8        2300 - 2399 BLOCK OF GOOD HOPE ROAD SE
Name: BLOCK, dtype: object

Unnamed: 0_level_0,BLOCK
WARD,Unnamed: 1_level_1
1,3100 - 3299 BLOCK OF 14TH STREET NW
2,1300 - 1699 BLOCK OF CONNECTICUT AVENUE NW
3,5300 - 5399 BLOCK OF WISCONSIN AVENUE NW
4,100 - 199 BLOCK OF CARROLL STREET NW
5,900 - 999 BLOCK OF RHODE ISLAND AVENUE NE
6,600 - 699 BLOCK OF H STREET NE
7,934 - 1099 BLOCK OF EASTERN AVENUE NE
8,2300 - 2399 BLOCK OF GOOD HOPE ROAD SE


#### Your turn: Group by WARD and get the mean and standard deviation (std) of X and Y

In [81]:
df.groupby("WARD").agg({
    "X": ["mean", "std"],
    "Y": ["mean", "std"]})

Unnamed: 0_level_0,X,X,Y,Y
Unnamed: 0_level_1,mean,std,mean,std
WARD,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,-77.031256,0.0072,38.924716,0.006417
2,-77.038959,0.014106,38.905941,0.00536
3,-77.07532,0.013408,38.941557,0.013887
4,-77.024906,0.011811,38.957674,0.01392
5,-76.989451,0.014957,38.920249,0.012766
6,-77.001175,0.011302,38.893112,0.010021
7,-76.947402,0.018753,38.888669,0.012504
8,-76.988986,0.012791,38.850526,0.015775


In [83]:
df.groupby("WARD")[["Y","X"]].agg(["mean", "std"])


Unnamed: 0_level_0,Y,Y,X,X
Unnamed: 0_level_1,mean,std,mean,std
WARD,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,38.924716,0.006417,-77.031256,0.0072
2,38.905941,0.00536,-77.038959,0.014106
3,38.941557,0.013887,-77.07532,0.013408
4,38.957674,0.01392,-77.024906,0.011811
5,38.920249,0.012766,-76.989451,0.014957
6,38.893112,0.010021,-77.001175,0.011302
7,38.888669,0.012504,-76.947402,0.018753
8,38.850526,0.015775,-76.988986,0.012791


In [84]:
df.groupby("WARD").agg({'X': [lambda x: x.mean(), lambda x: x.std()],
                        'Y': [lambda x: x.mean(), lambda x: x.std()]})

Unnamed: 0_level_0,X,X,Y,Y
Unnamed: 0_level_1,<lambda_0>,<lambda_1>,<lambda_0>,<lambda_1>
WARD,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,-77.031256,0.0072,38.924716,0.006417
2,-77.038959,0.014106,38.905941,0.00536
3,-77.07532,0.013408,38.941557,0.013887
4,-77.024906,0.011811,38.957674,0.01392
5,-76.989451,0.014957,38.920249,0.012766
6,-77.001175,0.011302,38.893112,0.010021
7,-76.947402,0.018753,38.888669,0.012504
8,-76.988986,0.012791,38.850526,0.015775
