## 0. Load imports 

In [1]:
## imports
import pandas as pd
import numpy as np


# ## print multiple things from same cell
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

## 0. Load data

In [2]:
## load data on 2020 crimes in DC
# Reads the CSV directly from the opendata.arcgis.com URL, so this cell needs network access.
# `df` and `dc_crim_2020` are bound to the SAME DataFrame object: columns added
# through `df` in later cells are therefore also visible through `dc_crim_2020`.
df = dc_crim_2020 = pd.read_csv("https://opendata.arcgis.com/datasets/f516e0dd7b614b088ad781b0c4002331_2.csv")
# NOTE(review): head() and shape are bare expressions in the middle of the cell,
# so their values are discarded; only the text printed by info() shows up in the
# cell output. Wrap them in print()/display() if they are meant to be seen.
dc_crim_2020.head()
dc_crim_2020.shape
dc_crim_2020.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27933 entries, 0 to 27932
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   X                     27933 non-null  float64
 1   Y                     27933 non-null  float64
 2   CCN                   27933 non-null  int64  
 3   REPORT_DAT            27933 non-null  object 
 4   SHIFT                 27933 non-null  object 
 5   METHOD                27933 non-null  object 
 6   OFFENSE               27933 non-null  object 
 7   BLOCK                 27933 non-null  object 
 8   XBLOCK                27933 non-null  float64
 9   YBLOCK                27933 non-null  float64
 10  WARD                  27933 non-null  int64  
 11  ANC                   27933 non-null  object 
 12  DISTRICT              27904 non-null  float64
 13  PSA                   27898 non-null  float64
 14  NEIGHBORHOOD_CLUSTER  27933 non-null  object 
 15  BLOCK_GROUP        

In [3]:
df.columns

Index(['X', 'Y', 'CCN', 'REPORT_DAT', 'SHIFT', 'METHOD', 'OFFENSE', 'BLOCK',
       'XBLOCK', 'YBLOCK', 'WARD', 'ANC', 'DISTRICT', 'PSA',
       'NEIGHBORHOOD_CLUSTER', 'BLOCK_GROUP', 'CENSUS_TRACT',
       'VOTING_PRECINCT', 'LATITUDE', 'LONGITUDE', 'BID', 'START_DATE',
       'END_DATE', 'OBJECTID', 'OCTO_RECORD_ID'],
      dtype='object')

In [4]:
# Number of reports per offense type, most frequent first.
# (The bare `df.OFFENSE` expression that used to precede this line was dropped:
# only the last expression of a cell is displayed, so its value was never shown.)
df["OFFENSE"].value_counts()

OFFENSE
THEFT/OTHER                   10927
THEFT F/AUTO                   8283
MOTOR VEHICLE THEFT            3260
ROBBERY                        1998
ASSAULT W/DANGEROUS WEAPON     1632
BURGLARY                       1444
HOMICIDE                        198
SEX ABUSE                       178
ARSON                            13
Name: count, dtype: int64

In [5]:
def plus_1(x):
    """Return ``x + 1`` for a scalar or any other object that supports ``+ 1``."""
    return x + 1

In [6]:
# Calls plus_1 on each element of column X and displays the resulting Series;
# nothing is assigned, so df.X itself is left unchanged.
# NOTE(review): the vectorized form `df.X + 1` should give the same values
# without a per-element Python call — confirm by timing both.
df.X.apply(plus_1)

0       -75.993094
1       -76.027049
2       -75.947655
3       -76.032861
4       -75.979766
           ...    
27928   -76.040861
27929   -76.020915
27930   -75.965994
27931   -76.024174
27932   -76.029716
Name: X, Length: 27933, dtype: float64

#### Warm-up: Ways to add one to column X
And how to time your code

## 1. Aggregation

In [7]:
df[["WARD", "OFFENSE", "SHIFT"]]

Unnamed: 0,WARD,OFFENSE,SHIFT
0,5,THEFT F/AUTO,EVENING
1,2,THEFT/OTHER,EVENING
2,7,THEFT/OTHER,EVENING
3,4,THEFT F/AUTO,DAY
4,5,MOTOR VEHICLE THEFT,DAY
...,...,...,...
27928,2,BURGLARY,DAY
27929,6,THEFT/OTHER,DAY
27930,7,BURGLARY,DAY
27931,1,ASSAULT W/DANGEROUS WEAPON,DAY


In [8]:
# Iterating over a groupby yields (group key, sub-DataFrame) pairs. Take only
# the first pair to inspect what a single group looks like; `i` is the WARD
# value of that group and `ddf` holds its rows.
i, ddf = next(iter(df.groupby("WARD")))
ddf[["WARD", "OFFENSE", "SHIFT"]]

Unnamed: 0,WARD,OFFENSE,SHIFT
8,1,ROBBERY,MIDNIGHT
24,1,THEFT/OTHER,DAY
27,1,THEFT/OTHER,DAY
32,1,THEFT/OTHER,DAY
46,1,THEFT F/AUTO,EVENING
...,...,...,...
27909,1,THEFT F/AUTO,DAY
27913,1,MOTOR VEHICLE THEFT,EVENING
27921,1,THEFT F/AUTO,EVENING
27924,1,THEFT F/AUTO,MIDNIGHT


### Example of grouping by one variable and doing one aggregation 

**Task**: find and print the number of unique offense types (`OFFENSE`) by ward (`WARD`)

In [9]:
# Count the distinct OFFENSE values inside each WARD. reset_index() moves the
# WARD group labels out of the index and back into an ordinary column.
offense_by_ward = df.groupby("WARD")["OFFENSE"].nunique().reset_index()
offense_by_ward

Unnamed: 0,WARD,OFFENSE
0,1,9
1,2,9
2,3,8
3,4,8
4,5,9
5,6,9
6,7,9
7,8,9


In [10]:
def get_unique(X):
    """Count the distinct non-missing values in a Series.

    This is the hand-written counterpart of the built-in ``"nunique"``
    aggregation. Missing values are dropped before counting so that both
    give the same answer even when the column contains NaN/None
    (``Series.unique()`` alone would report NaN as one more value).

    Parameters
    ----------
    X : pandas.Series
        The values of one group (or any Series).

    Returns
    -------
    int
        Number of distinct non-missing values; 0 for an empty Series.
    """
    return len(X.dropna().unique())

In [11]:
# Same per-ward summary as the "nunique" version, but computed by passing the
# user-defined get_unique function to agg() for the OFFENSE column.
offense_by_ward = df.groupby("WARD")["OFFENSE"].agg(get_unique).reset_index()
offense_by_ward

Unnamed: 0,WARD,OFFENSE
0,1,9
1,2,9
2,3,8
3,4,8
4,5,9
5,6,9
6,7,9
7,8,9


### Example of grouping by one variable and providing two summaries of the same variable

**Task**: the previous example showed the number of unique offense types in each ward, but we also want to know which offenses occur in each ward. Create an aggregation that summarizes both the number of unique offenses by ward and what those offenses are -- for instance, by pasting the unique offenses in that ward together, separated by "; " (e.g., Theft; Burglary; ...)

*Hint*: you can use the string `join` method to paste together the elements of a list, separated by some delimiter. 
    - The syntax, if we are using the comma delimiter, is: `", ".join(nameoflist)`

In [12]:
# Two summaries of the same column per ward: the number of distinct offenses
# ("nunique") and the array of those distinct values ("unique").
# Passing a list of aggregations yields two-level column labels
# (OFFENSE/nunique and OFFENSE/unique), which is why the displayed header
# has two rows.
offense_by_ward2 = df.groupby("WARD").agg({"OFFENSE" : ["nunique", "unique"]}).reset_index()
offense_by_ward2

Unnamed: 0_level_0,WARD,OFFENSE,OFFENSE
Unnamed: 0_level_1,Unnamed: 1_level_1,nunique,unique
0,1,9,"[ROBBERY, THEFT/OTHER, THEFT F/AUTO, MOTOR VEH..."
1,2,9,"[THEFT/OTHER, THEFT F/AUTO, MOTOR VEHICLE THEF..."
2,3,8,"[ASSAULT W/DANGEROUS WEAPON, THEFT/OTHER, THEF..."
3,4,8,"[THEFT F/AUTO, THEFT/OTHER, ROBBERY, MOTOR VEH..."
4,5,9,"[THEFT F/AUTO, MOTOR VEHICLE THEFT, ROBBERY, T..."
5,6,9,"[THEFT/OTHER, ASSAULT W/DANGEROUS WEAPON, BURG..."
6,7,9,"[THEFT/OTHER, BURGLARY, MOTOR VEHICLE THEFT, T..."
7,8,9,"[THEFT/OTHER, ASSAULT W/DANGEROUS WEAPON, HOMI..."


In [13]:
def get_separated_list(X):
    """Join the distinct values of a Series into one ``"; "``-separated string.

    Values appear in order of first occurrence (the order returned by
    ``Series.unique()``). Missing values are skipped and every remaining
    value is converted with ``str`` first, because ``str.join`` raises
    ``TypeError`` as soon as it meets a non-string such as NaN.

    Parameters
    ----------
    X : pandas.Series
        The values of one group (or any Series).

    Returns
    -------
    str
        For example ``"THEFT; ROBBERY"``; the empty string if there are no
        non-missing values.
    """
    distinct_values = X.dropna().unique()
    return "; ".join(str(value) for value in distinct_values)

In [14]:
# One row per ward, with that ward's distinct offenses pasted into a single
# "; "-separated string by get_separated_list.
# Note: this rebinds offense_by_ward2, replacing whatever it held before.
offense_by_ward2 = df.groupby("WARD")["OFFENSE"].agg(get_separated_list).reset_index()
offense_by_ward2

Unnamed: 0,WARD,OFFENSE
0,1,ROBBERY; THEFT/OTHER; THEFT F/AUTO; MOTOR VEHI...
1,2,THEFT/OTHER; THEFT F/AUTO; MOTOR VEHICLE THEFT...
2,3,ASSAULT W/DANGEROUS WEAPON; THEFT/OTHER; THEFT...
3,4,THEFT F/AUTO; THEFT/OTHER; ROBBERY; MOTOR VEHI...
4,5,THEFT F/AUTO; MOTOR VEHICLE THEFT; ROBBERY; TH...
5,6,THEFT/OTHER; ASSAULT W/DANGEROUS WEAPON; BURGL...
6,7,THEFT/OTHER; BURGLARY; MOTOR VEHICLE THEFT; TH...
7,8,THEFT/OTHER; ASSAULT W/DANGEROUS WEAPON; HOMIC...


In [15]:
# Same result without a named helper: collect each ward's distinct offenses,
# then join every per-ward array into one "; "-separated string.
OF = (
    df.groupby(["WARD"])["OFFENSE"]
    .unique()
    .map("; ".join)
    .reset_index()
)
OF

Unnamed: 0,WARD,OFFENSE
0,1,ROBBERY; THEFT/OTHER; THEFT F/AUTO; MOTOR VEHI...
1,2,THEFT/OTHER; THEFT F/AUTO; MOTOR VEHICLE THEFT...
2,3,ASSAULT W/DANGEROUS WEAPON; THEFT/OTHER; THEFT...
3,4,THEFT F/AUTO; THEFT/OTHER; ROBBERY; MOTOR VEHI...
4,5,THEFT F/AUTO; MOTOR VEHICLE THEFT; ROBBERY; TH...
5,6,THEFT/OTHER; ASSAULT W/DANGEROUS WEAPON; BURGL...
6,7,THEFT/OTHER; BURGLARY; MOTOR VEHICLE THEFT; TH...
7,8,THEFT/OTHER; ASSAULT W/DANGEROUS WEAPON; HOMIC...


### Example of grouping by two variables 

**Task**: group by ward (`WARD`) and police shift (`SHIFT`) and find the offense that is most common in that ward and shift

In [16]:
def most_common(X):
    """Return the most frequent non-missing value in a Series.

    Parameters
    ----------
    X : pandas.Series
        The values of one group (or any Series).

    Returns
    -------
    object or None
        The value with the highest count according to ``value_counts()``
        (if several values tie, whichever one ``value_counts()`` lists
        first). ``None`` when the Series is empty or holds only missing
        values, instead of failing with ``IndexError``.
    """
    counts = X.value_counts()
    # value_counts() ignores missing values, so an empty or all-missing group
    # produces an empty result and there is no first label to return.
    if counts.empty:
        return None
    return counts.index[0]

In [17]:
# For every (WARD, SHIFT) combination, keep the single most frequent offense
# as chosen by most_common; reset_index() turns both keys back into columns.
ward_shift = df.groupby(["WARD", "SHIFT"])["OFFENSE"].agg(most_common).reset_index()
ward_shift

Unnamed: 0,WARD,SHIFT,OFFENSE
0,1,DAY,THEFT/OTHER
1,1,EVENING,THEFT/OTHER
2,1,MIDNIGHT,THEFT/OTHER
3,2,DAY,THEFT/OTHER
4,2,EVENING,THEFT/OTHER
5,2,MIDNIGHT,THEFT/OTHER
6,3,DAY,THEFT/OTHER
7,3,EVENING,THEFT/OTHER
8,3,MIDNIGHT,THEFT/OTHER
9,4,DAY,THEFT F/AUTO


## Summarizing over all rows or all columns (without grouping)

We can also use the `apply` function to summarize rows or columns efficiently

**Task**: find the mean latitude and longitude in one line of code

In [18]:
# apply() with axis=0 (the default) hands each selected column to np.mean in
# turn, giving one mean per column.
lat_long = df.loc[:, ["LATITUDE", "LONGITUDE"]].apply(np.mean, axis=0)
lat_long

LATITUDE     38.907613
LONGITUDE   -77.007049
dtype: float64

## 2. Creating new columns/transforming their type 

### Simple filtering

`np.where` is in the numpy package (aliased as `np`) and operates similarly to `ifelse` in R.

But there is a built-in pandas way to emulate this.

**Task**: create an indicator `is_theft` for any offense that contains the word "THEFT"
    
**Task**: create an indicator `is_theft_notmotor` for any offense that contains the word "THEFT" but does not contain the word "MOTOR"

In [19]:
# Plain substring tests on the offense label. regex=False treats the pattern
# as literal text and na=False maps a missing OFFENSE to False, so both masks
# are strictly boolean and can be combined with & / ~ directly (no
# `== True` / `== False` comparisons needed).
offense_has_theft = df["OFFENSE"].str.contains("THEFT", regex=False, na=False)
offense_has_motor = df["OFFENSE"].str.contains("MOTOR", regex=False, na=False)

df["is_theft"] = offense_has_theft
# Theft offenses whose label does not mention MOTOR.
df["is_theft_notmotor"] = offense_has_theft & ~offense_has_motor
df

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,VOTING_PRECINCT,LATITUDE,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,is_theft,is_theft_notmotor
0,-76.993094,38.928471,20153901,2020/10/27 20:37:21+00,EVENING,OTHERS,THEFT F/AUTO,900 - 999 BLOCK OF IRVING STREET NE,400599.0,140065.0,...,Precinct 73,38.928463,-76.993092,,2020/10/27 20:26:31+00,2020/10/27 20:37:29+00,398217667,,True,True
1,-77.027049,38.904911,20095149,2020/06/30 20:46:51+00,EVENING,OTHERS,THEFT/OTHER,1101 - 1199 BLOCK OF 11TH STREET NW,397654.0,137450.0,...,Precinct 129,38.904904,-77.027047,,2020/06/29 23:30:04+00,2020/06/30 11:00:09+00,398217668,,True,True
2,-76.947655,38.896660,20064305,2020/04/27 23:44:04+00,EVENING,OTHERS,THEFT/OTHER,4000 - 4121 BLOCK OF MINNESOTA AVENUE NE,404541.0,136535.0,...,Precinct 99,38.896652,-76.947653,,2020/04/27 22:49:30+00,,398217670,,True,True
3,-77.032861,38.956717,20100388,2020/07/12 18:37:14+00,DAY,OTHERS,THEFT F/AUTO,5500 - 5599 BLOCK OF COLORADO AVENUE NW,397152.0,143201.0,...,Precinct 53,38.956709,-77.032858,,2020/07/12 17:36:13+00,2020/07/12 18:10:45+00,398217673,,True,True
4,-76.979766,38.915804,20128992,2020/09/09 18:00:44+00,DAY,OTHERS,MOTOR VEHICLE THEFT,2100 - 2131 BLOCK OF WEST VIRGINIA AVENUE NE,401755.0,138659.0,...,Precinct 76,38.915796,-76.979764,,2020/09/09 01:30:07+00,2020/09/10 23:00:57+00,398217674,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27928,-77.040861,38.905205,20109618,2020/08/01 15:12:23+00,DAY,OTHERS,BURGLARY,1130 - 1199 BLOCK OF CONNECTICUT AVENUE NW,396456.0,137483.0,...,Precinct 17,38.905197,-77.040859,GOLDEN TRIANGLE,2020/07/31 19:00:19+00,2020/08/01 15:13:33+00,398740090,,False,False
27929,-77.020915,38.902526,20152852,2020/10/25 18:25:58+00,DAY,OTHERS,THEFT/OTHER,600 - 699 BLOCK OF K STREET NW,398186.0,137185.0,...,Precinct 18,38.902518,-77.020913,MOUNT VERNON TRIANGLE CID,2020/10/25 17:00:17+00,2020/10/25 17:10:27+00,398740091,,True,True
27930,-76.965994,38.876867,20020481,2020/02/03 16:50:47+00,DAY,OTHERS,BURGLARY,2900 - 2999 BLOCK OF M STREET SE,402951.0,134337.0,...,Precinct 111,38.876859,-76.965991,,2020/02/03 16:10:39+00,2020/02/03 16:50:48+00,398740098,,False,False
27931,-77.024174,38.925028,20075827,2020/05/22 18:42:22+00,DAY,GUN,ASSAULT W/DANGEROUS WEAPON,700 - 799 BLOCK OF FAIRMONT STREET NW,397904.0,139683.0,...,Precinct 37,38.925020,-77.024172,,2020/05/22 17:27:34+00,2020/05/22 18:42:31+00,398740104,,False,False


### np.select

**Task**: create a new variable, `offense_summary`, where you:
        
- Recode theft offenses that use a gun or knife as the method (`METHOD`) as: violent theft
- Recode non-theft offenses that use a gun or knife as the method as: violent other
- Recode all other as non-violent 

In [20]:
# True for reports whose METHOD is a gun or a knife.
uses_weapon = df["METHOD"].isin(["GUN", "KNIFE"])

# np.select pairs conditions[k] with methods[k]; rows matching neither
# condition receive the default label.
conditions = [
    df["is_theft"] & uses_weapon,
    ~df["is_theft"] & uses_weapon,
]
methods = ["violent theft", "violent other"]

df["offense_summary"] = np.select(conditions, methods, default="non-violent")
df["offense_summary"].value_counts()

offense_summary
non-violent      25257
violent other     2641
violent theft       35
Name: count, dtype: int64

### map.recode

**Task**: recode shifts that are MIDNIGHT or EVENING as "nighttime"; code all other shifts as "daytime"

In [22]:
# Original SHIFT labels that should be recoded, and the label each one becomes.
shifts_original = ["MIDNIGHT", "EVENING"]
shifts_new = ["nighttime"] * len(shifts_original)

# Lookup table {original label: new label}.
conversion_dict = {old: new for old, new in zip(shifts_original, shifts_new)}

# map() leaves NaN for any SHIFT not in the lookup table; those rows are then
# filled with "daytime".
df["recoded_shift"] = df["SHIFT"].map(conversion_dict).fillna("daytime")

In [23]:
df

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,LONGITUDE,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,is_theft,is_theft_notmotor,offense_summary,recoded_shift
0,-76.993094,38.928471,20153901,2020/10/27 20:37:21+00,EVENING,OTHERS,THEFT F/AUTO,900 - 999 BLOCK OF IRVING STREET NE,400599.0,140065.0,...,-76.993092,,2020/10/27 20:26:31+00,2020/10/27 20:37:29+00,398217667,,True,True,non-violent,nighttime
1,-77.027049,38.904911,20095149,2020/06/30 20:46:51+00,EVENING,OTHERS,THEFT/OTHER,1101 - 1199 BLOCK OF 11TH STREET NW,397654.0,137450.0,...,-77.027047,,2020/06/29 23:30:04+00,2020/06/30 11:00:09+00,398217668,,True,True,non-violent,nighttime
2,-76.947655,38.896660,20064305,2020/04/27 23:44:04+00,EVENING,OTHERS,THEFT/OTHER,4000 - 4121 BLOCK OF MINNESOTA AVENUE NE,404541.0,136535.0,...,-76.947653,,2020/04/27 22:49:30+00,,398217670,,True,True,non-violent,nighttime
3,-77.032861,38.956717,20100388,2020/07/12 18:37:14+00,DAY,OTHERS,THEFT F/AUTO,5500 - 5599 BLOCK OF COLORADO AVENUE NW,397152.0,143201.0,...,-77.032858,,2020/07/12 17:36:13+00,2020/07/12 18:10:45+00,398217673,,True,True,non-violent,daytime
4,-76.979766,38.915804,20128992,2020/09/09 18:00:44+00,DAY,OTHERS,MOTOR VEHICLE THEFT,2100 - 2131 BLOCK OF WEST VIRGINIA AVENUE NE,401755.0,138659.0,...,-76.979764,,2020/09/09 01:30:07+00,2020/09/10 23:00:57+00,398217674,,True,False,non-violent,daytime
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27928,-77.040861,38.905205,20109618,2020/08/01 15:12:23+00,DAY,OTHERS,BURGLARY,1130 - 1199 BLOCK OF CONNECTICUT AVENUE NW,396456.0,137483.0,...,-77.040859,GOLDEN TRIANGLE,2020/07/31 19:00:19+00,2020/08/01 15:13:33+00,398740090,,False,False,non-violent,daytime
27929,-77.020915,38.902526,20152852,2020/10/25 18:25:58+00,DAY,OTHERS,THEFT/OTHER,600 - 699 BLOCK OF K STREET NW,398186.0,137185.0,...,-77.020913,MOUNT VERNON TRIANGLE CID,2020/10/25 17:00:17+00,2020/10/25 17:10:27+00,398740091,,True,True,non-violent,daytime
27930,-76.965994,38.876867,20020481,2020/02/03 16:50:47+00,DAY,OTHERS,BURGLARY,2900 - 2999 BLOCK OF M STREET SE,402951.0,134337.0,...,-76.965991,,2020/02/03 16:10:39+00,2020/02/03 16:50:48+00,398740098,,False,False,non-violent,daytime
27931,-77.024174,38.925028,20075827,2020/05/22 18:42:22+00,DAY,GUN,ASSAULT W/DANGEROUS WEAPON,700 - 799 BLOCK OF FAIRMONT STREET NW,397904.0,139683.0,...,-77.024172,,2020/05/22 17:27:34+00,2020/05/22 18:42:31+00,398740104,,False,False,violent other,daytime


### Using built-in pandas methods

Another way of creating variables, used either alone or in combination with `np.where` and `np.select`, is to use pandas' built-in `str` methods

Basic structure is: `df['namestringcol'].str.someoperation`

**Task**: using a `str` method, create a new variable--`OFFENSE_NOSP`--that replaces spaces in the `OFFENSE` column with underscores

In [25]:
# Literal (non-regex) replacement of every space in OFFENSE with an underscore.
df["OFFENSE_NOSP"] = df.OFFENSE.str.replace(" ", "_", regex=False)
df

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,BID,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,is_theft,is_theft_notmotor,offense_summary,recoded_shift,OFFENSE_NOSP
0,-76.993094,38.928471,20153901,2020/10/27 20:37:21+00,EVENING,OTHERS,THEFT F/AUTO,900 - 999 BLOCK OF IRVING STREET NE,400599.0,140065.0,...,,2020/10/27 20:26:31+00,2020/10/27 20:37:29+00,398217667,,True,True,non-violent,nighttime,THEFT_F/AUTO
1,-77.027049,38.904911,20095149,2020/06/30 20:46:51+00,EVENING,OTHERS,THEFT/OTHER,1101 - 1199 BLOCK OF 11TH STREET NW,397654.0,137450.0,...,,2020/06/29 23:30:04+00,2020/06/30 11:00:09+00,398217668,,True,True,non-violent,nighttime,THEFT/OTHER
2,-76.947655,38.896660,20064305,2020/04/27 23:44:04+00,EVENING,OTHERS,THEFT/OTHER,4000 - 4121 BLOCK OF MINNESOTA AVENUE NE,404541.0,136535.0,...,,2020/04/27 22:49:30+00,,398217670,,True,True,non-violent,nighttime,THEFT/OTHER
3,-77.032861,38.956717,20100388,2020/07/12 18:37:14+00,DAY,OTHERS,THEFT F/AUTO,5500 - 5599 BLOCK OF COLORADO AVENUE NW,397152.0,143201.0,...,,2020/07/12 17:36:13+00,2020/07/12 18:10:45+00,398217673,,True,True,non-violent,daytime,THEFT_F/AUTO
4,-76.979766,38.915804,20128992,2020/09/09 18:00:44+00,DAY,OTHERS,MOTOR VEHICLE THEFT,2100 - 2131 BLOCK OF WEST VIRGINIA AVENUE NE,401755.0,138659.0,...,,2020/09/09 01:30:07+00,2020/09/10 23:00:57+00,398217674,,True,False,non-violent,daytime,MOTOR_VEHICLE_THEFT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27928,-77.040861,38.905205,20109618,2020/08/01 15:12:23+00,DAY,OTHERS,BURGLARY,1130 - 1199 BLOCK OF CONNECTICUT AVENUE NW,396456.0,137483.0,...,GOLDEN TRIANGLE,2020/07/31 19:00:19+00,2020/08/01 15:13:33+00,398740090,,False,False,non-violent,daytime,BURGLARY
27929,-77.020915,38.902526,20152852,2020/10/25 18:25:58+00,DAY,OTHERS,THEFT/OTHER,600 - 699 BLOCK OF K STREET NW,398186.0,137185.0,...,MOUNT VERNON TRIANGLE CID,2020/10/25 17:00:17+00,2020/10/25 17:10:27+00,398740091,,True,True,non-violent,daytime,THEFT/OTHER
27930,-76.965994,38.876867,20020481,2020/02/03 16:50:47+00,DAY,OTHERS,BURGLARY,2900 - 2999 BLOCK OF M STREET SE,402951.0,134337.0,...,,2020/02/03 16:10:39+00,2020/02/03 16:50:48+00,398740098,,False,False,non-violent,daytime,BURGLARY
27931,-77.024174,38.925028,20075827,2020/05/22 18:42:22+00,DAY,GUN,ASSAULT W/DANGEROUS WEAPON,700 - 799 BLOCK OF FAIRMONT STREET NW,397904.0,139683.0,...,,2020/05/22 17:27:34+00,2020/05/22 18:42:31+00,398740104,,False,False,violent other,daytime,ASSAULT_W/DANGEROUS_WEAPON


### Transforming column types

**Task**: check the type of the `START_DATE` column

In [32]:
df.START_DATE

0        2020/10/27 20:26:31+00
1        2020/06/29 23:30:04+00
2        2020/04/27 22:49:30+00
3        2020/07/12 17:36:13+00
4        2020/09/09 01:30:07+00
                  ...          
27928    2020/07/31 19:00:19+00
27929    2020/10/25 17:00:17+00
27930    2020/02/03 16:10:39+00
27931    2020/05/22 17:27:34+00
27932    2020/05/17 16:00:51+00
Name: START_DATE, Length: 27933, dtype: object

In [35]:
df["START_DATE"].dtypes

dtype('O')

In [27]:
type(df.START_DATE)

pandas.core.series.Series

**Task**: recast the `START_DATE` column to datetime, calling this `START_DATE_CLEAN`; coerce errors rather than cleaning the string; check the type of that new col

In [39]:
# Parse the START_DATE strings into datetimes; errors="coerce" turns any value
# that cannot be parsed into NaT instead of raising.
df["START_DATE_CLEAN"] = pd.to_datetime(df["START_DATE"], errors="coerce")
df["START_DATE_CLEAN"].dtype

datetime64[ns, UTC]

**Task:** print the min and max of `START_DATE_CLEAN`. What happens if you try to do this with `START_DATE`?

In [41]:
# Earliest and latest parsed start timestamps, one per line.
print(df["START_DATE_CLEAN"].min(), df["START_DATE_CLEAN"].max(), sep="\n")

1940-02-17 00:00:16+00:00
2021-01-01 02:30:21+00:00


In [43]:
# Demonstration: min()/max() on the raw START_DATE column fail because it mixes
# strings with float NaN, which cannot be ordered against each other.
# The error is caught and printed so that the failure is still visible but a
# full "Restart & Run All" keeps going past this cell.
try:
    print(df.START_DATE.min())
    print(df.START_DATE.max())
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: '<=' not supported between instances of 'str' and 'float'

## 3. Row and column filtering 

### Row subsetting

**Task**: filter to crime reports that are thefts (using the `is_theft` indicator) and that are in ward 3

In [51]:
# Keep rows that are flagged as theft AND belong to ward 3.
filter1 = df.loc[df["is_theft"] & df["WARD"].eq(3)]
filter1.head()

Unnamed: 0,X,Y,CCN,REPORT_DAT,SHIFT,METHOD,OFFENSE,BLOCK,XBLOCK,YBLOCK,...,START_DATE,END_DATE,OBJECTID,OCTO_RECORD_ID,is_theft,is_theft_notmotor,offense_summary,recoded_shift,OFFENSE_NOSP,START_DATE_CLEAN
11,-77.051908,38.924199,20170566,2020/11/30 20:53:51+00,EVENING,OTHERS,THEFT/OTHER,2600 - 2649 BLOCK OF CONNECTICUT AVENUE NW,395499.0,139592.0,...,2020/11/30 16:45:06+00,2020/11/30 17:30:13+00,398217704,,True,True,non-violent,nighttime,THEFT/OTHER,2020-11-30 16:45:06+00:00
18,-77.083768,38.960042,20121641,2020/08/26 03:29:34+00,MIDNIGHT,OTHERS,THEFT F/AUTO,5300 - 5399 BLOCK OF 43RD STREET NW,392740.0,143573.0,...,2020/08/25 22:30:03+00,2020/08/26 01:00:04+00,398217732,,True,True,non-violent,nighttime,THEFT_F/AUTO,2020-08-25 22:30:03+00:00
28,-77.060408,38.938806,20160695,2020/11/10 05:15:06+00,MIDNIGHT,OTHERS,THEFT/OTHER,3700 - 3899 BLOCK OF CONNECTICUT AVENUE NW,394763.0,141214.0,...,2020/11/10 03:47:06+00,2020/11/10 05:15:44+00,398217791,,True,True,non-violent,nighttime,THEFT/OTHER,2020-11-10 03:47:06+00:00
59,-77.074228,38.962589,20035501,2020/02/27 22:44:09+00,EVENING,OTHERS,THEFT/OTHER,5420 - 5499 BLOCK OF CONNECTICUT AVENUE NW,393567.0,143855.0,...,2020/02/27 18:50:10+00,2020/02/27 22:00:06+00,398217992,,True,True,non-violent,nighttime,THEFT/OTHER,2020-02-27 18:50:10+00:00
74,-77.056309,38.933503,20001654,2020/01/03 20:09:51+00,EVENING,OTHERS,THEFT F/AUTO,2700 - 2899 BLOCK OF MACOMB STREET NW,395118.0,140625.0,...,2020/01/03 19:38:39+00,2020/01/03 20:10:05+00,398218125,,True,True,non-violent,nighttime,THEFT_F/AUTO,2020-01-03 19:38:39+00:00


### Column subsetting


**Task**: select two columns--`START_DATE` and `END_DATE`--and print the head of the dataframe

In [49]:
# Column subset: all rows, only the two raw date columns.
cols = ["START_DATE", "END_DATE"]
filter2 = df.loc[:, cols]
filter2.head()

Unnamed: 0,START_DATE,END_DATE
0,2020/10/27 20:26:31+00,2020/10/27 20:37:29+00
1,2020/06/29 23:30:04+00,2020/06/30 11:00:09+00
2,2020/04/27 22:49:30+00,
3,2020/07/12 17:36:13+00,2020/07/12 18:10:45+00
4,2020/09/09 01:30:07+00,2020/09/10 23:00:57+00


**Task**: use list comprehension to automate this a bit and select all columns with the word "DATE" in the name; print the head of the dataframe

In [50]:
# Collect every column whose name contains the substring "DATE", then select
# those columns for all rows.
cols2 = [name for name in df.columns if "DATE" in name]
filter3 = df.loc[:, cols2]
filter3.head()

Unnamed: 0,START_DATE,END_DATE,START_DATE_CLEAN
0,2020/10/27 20:26:31+00,2020/10/27 20:37:29+00,2020-10-27 20:26:31+00:00
1,2020/06/29 23:30:04+00,2020/06/30 11:00:09+00,2020-06-29 23:30:04+00:00
2,2020/04/27 22:49:30+00,,2020-04-27 22:49:30+00:00
3,2020/07/12 17:36:13+00,2020/07/12 18:10:45+00,2020-07-12 17:36:13+00:00
4,2020/09/09 01:30:07+00,2020/09/10 23:00:57+00,2020-09-09 01:30:07+00:00


**Task**: filter to crime reports whose `START_DATE_CLEAN` is before May 2020 and that are located in ward 1; select the `WARD` and `START_DATE_CLEAN` columns and print a random sample of rows

In [55]:
cols3 = ["WARD", "START_DATE_CLEAN"]

# Reports that started before 1 May 2020 and are located in ward 1.
# Rows whose START_DATE_CLEAN is NaT compare as False and are dropped.
before_may_in_ward_1 = (df["START_DATE_CLEAN"] < "2020-05-01") & (df["WARD"] == 1)

# Row filter and column selection in a single .loc step.
filter4 = df.loc[before_may_in_ward_1, cols3]

# A fixed random_state makes the "random" sample identical on every run, so the
# displayed rows are reproducible after Restart & Run All.
filter4.sample(10, random_state=42)

Unnamed: 0,WARD,START_DATE_CLEAN
10528,1,2020-01-18 16:11:40+00:00
6792,1,2020-01-25 04:00:02+00:00
12073,1,2020-01-29 19:00:45+00:00
14794,1,2020-02-02 23:45:29+00:00
11954,1,2020-01-12 06:00:51+00:00
21219,1,2020-01-19 01:00:20+00:00
27909,1,2020-01-23 02:00:17+00:00
27085,1,2020-04-24 20:15:15+00:00
27184,1,2020-03-16 23:00:28+00:00
6683,1,2020-04-05 23:00:33+00:00
