In [9]:
!pip3 install -U result-reporter

Collecting result-reporter
  Downloading result_reporter-0.2.1-py3-none-any.whl (3.3 kB)
Installing collected packages: result-reporter
  Attempting uninstall: result-reporter
    Found existing installation: result-reporter 0.2.0
    Uninstalling result-reporter-0.2.0:
      Successfully uninstalled result-reporter-0.2.0
Successfully installed result-reporter-0.2.1


In [10]:
# Import the result-reporter client (`rr`). If it is missing, print a hint for
# installing it with the pip3 that sits next to this kernel's interpreter, then
# re-raise so the cell stops with the real cause instead of a later NameError.
try:
    import rr
except ModuleNotFoundError:
    import sys
    import os
    site_dirs = [p for p in sys.path if p.endswith('site-packages')]
    if site_dirs:
        parts = site_dirs[0].split(os.sep)
        # Keep only the components that precede the first 'lib' directory;
        # the environment's executables are expected under <prefix>/bin.
        prefix = parts[:parts.index('lib')] if 'lib' in parts else parts
        path = os.sep.join(prefix + ['bin', 'pip3'])
    else:
        # No site-packages entry on sys.path: fall back to whatever pip3 is on PATH.
        path = 'pip3'
    print(f'Try running this in a new cell: !{path}', 'install -U result-reporter --index-url=https://pypi.org/simple')
    raise

rr.set_global_endpoint('https://result-reporter.com/ingest')
# NOTE(review): the token is hardcoded in the notebook; consider loading it from
# an environment variable so it is not committed with the file.
rr.set_global_token('be22f3b1-b8e5-462d-8487-844027f3d454')

**Intro**

This exercise practices working with large data sets, from fetching the data to a minimal data analysis.

**Challenge**

For the first half of this exercise, simply explore the data. In the second half, follow the function docstrings to implement each function.

**Data**

Last month of 311 NYC data:

- https://data.cityofnewyork.us/Social-Services/Last-Month/j7mx-4wd6
- https://storage.googleapis.com/columbia-python-bootcamp-ieor/NYC-311-last_month.zip (~20 MiB) [USE THIS TO PASS ASSERTS]

In [11]:
import pathlib

# Change Me! Location of the CSV file that get_data() opens.
# Set this to wherever the 311 data file lives on your machine.
EXERCISE_DATA_PATH = pathlib.Path('/Users/skg/Desktop/test/msba_python/Last_Month.csv')

In [12]:
def get_data() -> str:
    """
    Return the complete contents of the exercise data file as one string.

    The file at EXERCISE_DATA_PATH is opened in text mode and read in full.
    """
    with open(EXERCISE_DATA_PATH) as csv_file:
        contents = csv_file.read()
    return contents

In [13]:
### BEGIN TESTS
def get_data_range():
    """Return True when the loaded data holds between 100 and 200 million characters (inclusive)."""
    size = len(get_data())
    lower_bound, upper_bound = 100_000_000, 200_000_000
    return lower_bound <= size <= upper_bound
### END TESTS

In [14]:
### BEGIN TESTS
# Run the size check through rr.Wrap (presumably it reports the outcome using the
# endpoint/token configured earlier — confirm against the result-reporter docs).
with rr.Wrap(get_data_range) as func:
    result = func()
    # On failure, the assertion message is the actual character count.
    assert result, len(get_data())
### END TESTS

In [15]:
# Let's take a quick look at the data: show only the first 1024 characters
# rather than the whole file.
get_data()[:1024]

'Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,Street Name,Cross Street 1,Cross Street 2,Intersection Street 1,Intersection Street 2,Address Type,City,Landmark,Facility Type,Status,Due Date,Resolution Description,Resolution Action Updated Date,Community Board,Borough,X Coordinate (State Plane),Y Coordinate (State Plane),Park Facility Name,Park Borough,Vehicle Type,Taxi Company Borough,Taxi Pick Up Location,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Latitude,Longitude,Location\n37818993,12/01/2017 12:00:12 AM,12/01/2017 04:11:19 AM,NYPD,New York City Police Department,Noise - Residential,Banging/Pounding,Residential Building/House,10461,2504 FRISBY AVENUE,FRISBY AVENUE,ST PETERS AVENUE,OVERING STREET,,,ADDRESS,BRONX,,Precinct,Closed,12/01/2017 08:00:12 AM,The Police Department responded to the complaint and with the information available observed no evidence of the violation

In [16]:
import csv

def get_data() -> list:
    """
    Return the data as a list of dicts.

    Overwrite the existing get_data function.

    Each row of the CSV file at EXERCISE_DATA_PATH becomes one dict keyed by
    the column names taken from the header line.

    Returns:
        A list with one dict per data row, in file order.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted fields
    # are altered before the parser sees them.
    with open(EXERCISE_DATA_PATH, newline='') as fp:
        return list(csv.DictReader(fp))

In [17]:
### BEGIN TESTS
def get_data_len():
    """Return how many rows get_data() produces."""
    rows = get_data()
    return len(rows)
### END TESTS

In [18]:
### BEGIN TESTS
# The expected value is the row count of the reference data set.
with rr.Wrap(get_data_len) as func:
    result = func()
    # On failure, the assertion message is the row count actually found.
    assert result == 219274, result
### END TESTS

In [19]:
### BEGIN TESTS
def get_data_type():
    """Return True when the first row produced by get_data() is a dict."""
    first_row = get_data()[0]
    return isinstance(first_row, dict)
### END TESTS

In [20]:
### BEGIN TESTS
# Each parsed row must be a dict.
with rr.Wrap(get_data_type) as func:
    result = func()
    # On failure, the assertion message is the actual type of the first row.
    assert result, type(get_data()[0])
### END TESTS

In [43]:
def min_max_lat(data) -> (float, float):
    """
    Return the minimum and maximum latitudes in the data set.

    Ignore latitudes that can not be cast to a float.

    Args:
        data: iterable of row mappings; each row's latitude is read from its
            'Latitude' key.

    Returns:
        A (min_latitude, max_latitude) tuple of floats.

    Raises:
        ValueError: if no row contains a usable latitude.
    """
    # Infinite sentinels: any real latitude replaces them on first comparison.
    min_lat = float('inf')
    max_lat = float('-inf')
    found = False
    for item in data:
        try:
            lat = float(item['Latitude'])
        except (KeyError, TypeError, ValueError):
            # Missing, None or non-numeric latitude: skip the row entirely.
            continue
        if lat != lat:
            # NaN never compares as smaller or larger, so it carries no information.
            continue
        found = True
        if lat < min_lat:
            min_lat = lat
        if lat > max_lat:
            max_lat = lat
    if not found:
        raise ValueError('no valid latitude values in data')
    return (min_lat, max_lat)


In [44]:
### BEGIN TESTS
def get_min_max_lat():
    """Load the data set and return the result of min_max_lat() on it."""
    rows = get_data()
    return min_max_lat(rows)
### END TESTS

In [45]:
### BEGIN TESTS
# The expected (min, max) latitude pair is for the reference data set.
with rr.Wrap(get_min_max_lat) as func:
    result = func()
    # On failure, the assertion message is the pair actually computed.
    assert result == (40.49965901258716, 40.912868795316655), result
### END TESTS

### Parse Resolution Description

Explore the value of the "Resolution Description" key a bit. Then compute the fraction of rows whose description contains any of a given set of keywords. Use sets to arrive at your solution.

In [46]:
# Grab the first parsed row (a dict keyed by CSV column name) and display it.
first_row = get_data()[0]
first_row

{'Unique Key': '37818993',
 'Created Date': '12/01/2017 12:00:12 AM',
 'Closed Date': '12/01/2017 04:11:19 AM',
 'Agency': 'NYPD',
 'Agency Name': 'New York City Police Department',
 'Complaint Type': 'Noise - Residential',
 'Descriptor': 'Banging/Pounding',
 'Location Type': 'Residential Building/House',
 'Incident Zip': '10461',
 'Incident Address': '2504 FRISBY AVENUE',
 'Street Name': 'FRISBY AVENUE',
 'Cross Street 1': 'ST PETERS AVENUE',
 'Cross Street 2': 'OVERING STREET',
 'Intersection Street 1': '',
 'Intersection Street 2': '',
 'Address Type': 'ADDRESS',
 'City': 'BRONX',
 'Landmark': '',
 'Facility Type': 'Precinct',
 'Status': 'Closed',
 'Due Date': '12/01/2017 08:00:12 AM',
 'Resolution Description': 'The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.',
 'Resolution Action Updated Date': '12/01/2017 04:11:19 AM',
 'Community Board': '10 BRONX',
 'Borough': 'BRONX',
 'X Coordinate (State 

In [47]:
# Pull the free-text resolution out of the first row; .get() yields None
# instead of raising if the key is absent.
res_desc = first_row.get('Resolution Description')
res_desc

'The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time.'

In [48]:
# Split the description on whitespace and preview the first 20 tokens.
res_desc.split()[:20]

['The',
 'Police',
 'Department',
 'responded',
 'to',
 'the',
 'complaint',
 'and',
 'with',
 'the',
 'information',
 'available',
 'observed',
 'no',
 'evidence',
 'of',
 'the',
 'violation',
 'at',
 'that']

In [52]:
def predicate(text: str, keywords: set) -> bool:
    """
    Return true if text has any one of the words listed in keywords.

    The text is split on whitespace and compared case-insensitively: both the
    words of the text and the keywords are lower-cased before matching.
    Punctuation attached to a word is not stripped, so 'violation.' does not
    match the keyword 'violation'.

    Args:
        text: the text to search.
        keywords: strings to look for as whole whitespace-delimited words.

    Returns:
        True if at least one keyword equals a word of the text, else False.
        An empty keyword collection never matches.
    """
    words = {word.lower() for word in text.split()}
    # Lower-case the keywords too; otherwise a keyword containing an uppercase
    # letter could never match the lower-cased words.
    wanted = {keyword.lower() for keyword in keywords}
    return not words.isdisjoint(wanted)


In [53]:
### BEGIN TESTS
# A lowercase keyword should match the capitalised word 'Fiction' in the text.
with rr.Wrap(predicate) as func:
    result = func(
        'New Fiction Cashier mole eleVATOR',
        set(['fiction']),
    )
    assert result
### END TESTS

In [54]:
### BEGIN TESTS
# Several keywords, including the empty string: one matching keyword is enough.
with rr.Wrap(predicate) as func:
    result = func(
        'New Fiction Cashier mole eleVATOR',
        set(['fiction', 'elevator', '']),
    )
    assert result
### END TESTS

In [55]:
### BEGIN TESTS
def get_result_not_pred_empty():
    """Return True when predicate() reports no match for an empty keyword set."""
    sample_text = 'New Fiction Cashier mole eleVATOR'
    matched = predicate(sample_text, set())
    return not matched
### END TESTS

In [56]:
### BEGIN TESTS
# An empty keyword set must never match.
with rr.Wrap(get_result_not_pred_empty) as func:
    result = func()
    assert result
### END TESTS

In [57]:
### BEGIN TESTS
def get_result_not_pred_coffee():
    """Return True when predicate() reports no match for the keyword 'coffee'."""
    sample_text = 'New Fiction Cashier mole eleVATOR'
    matched = predicate(sample_text, {'coffee'})
    return not matched
### END TESTS

In [58]:
### BEGIN TESTS
# A keyword that does not occur in the text must not match.
with rr.Wrap(get_result_not_pred_coffee) as func:
    result = func()
    assert result
### END TESTS

In [59]:
### BEGIN TESTS
# 'evidence' occurs in the first row's resolution description (res_desc).
with rr.Wrap(predicate) as func:
    result = func(
        res_desc,
        set(['evidence']),
    )
    assert result
### END TESTS

In [70]:
def count_fraction(data: list, keywords: set) -> float:
    """
    Return the fraction of rows (represented as dicts) that have a
    'Resolution Description' with one of the keywords listed in 'keywords'.

    Args:
        data: rows as dicts; each must provide a 'Resolution Description' key.
        keywords: words to look for, passed through to predicate().

    Returns:
        Matching rows divided by total rows, or 0.0 when data is empty.

    Raises:
        KeyError: if a row has no 'Resolution Description' key.
    """
    total = len(data)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0
    # A description may be None (csv.DictReader fills short rows with None);
    # treat it as empty text so it simply never matches.
    matches = sum(
        1 for row in data
        if predicate(row['Resolution Description'] or '', keywords)
    )
    return matches / total


In [71]:
### BEGIN TESTS
def count_fraction_empty():
    """Return count_fraction() over the full data set with an empty keyword set."""
    rows = get_data()
    no_keywords = set()
    return count_fraction(rows, no_keywords)
### END TESTS

In [72]:
### BEGIN TESTS
# With no keywords nothing can match, so the fraction must be 0.
with rr.Wrap(count_fraction_empty) as func:
    result = func()
    assert result == 0, result
### END TESTS

In [73]:
### BEGIN TESTS
def count_fraction_landlord():
    """Return count_fraction() over the full data set for the keyword 'landlord'."""
    rows = get_data()
    landlord_keywords = {'landlord'}
    return count_fraction(rows, landlord_keywords)
### END TESTS

In [74]:
### BEGIN TESTS
# The expected fraction of rows mentioning 'landlord' is for the reference data set.
with rr.Wrap(count_fraction_landlord) as func:
    result = func()
    # On failure, the assertion message is the fraction actually computed.
    assert result == 0.00020522268942054232, result
### END TESTS