# 2018 NYC Squirrel Census

## Notebook Set Up

In [2]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [3]:
# Create an instance of MongoClient
mongo = MongoClient(port=27017)

In [4]:
# assign the squirrels_db database to a variable name
db = mongo['squirrels_db']

In [5]:
# review the collections in our database
print(db.list_collection_names())

['squirrels']


In [6]:
# assign the collection to a variable
squirrels = db['squirrels']

# Part 3: Exploratory Analysis

In [6]:
# Total number of documents
print("Number of documents: ", squirrels.count_documents({}))

# Total number of unique squirrels
unqiue_squirrels = squirrels.distinct("Unique_Squirrel_ID")
print("Number of unique squirrels in data set is: ",len(unqiue_squirrels))

Number of documents:  3023
Number of unique squirrels in data set is:  3018


<h5> When were the squirrels spotted?

In [7]:
# When were the squirrels spotted?
query = [{'$group': {'_id': "$Shift", 'count': { '$sum': 1 }}}]
results = list(squirrels.aggregate(query))
print(results)

[{'_id': 'AM', 'count': 1347}, {'_id': 'PM', 'count': 1676}]


In [None]:
# Look at specific dates the data was collected

<h5> Where they were spotted (on the ground or up a tree)

In [8]:
# Where were the squirrels?
query = [{'$group': {'_id': "$Location", 'count': { '$sum': 1 }}}]
results = list(squirrels.aggregate(query))
pprint(results)

# Save results to a dataframe
location_df = pd.DataFrame(results)
location_df

[{'_id': 'Above Ground', 'count': 843},
 {'_id': 'Ground Plane', 'count': 2116},
 {'_id': 'Not_noted', 'count': 64}]


Unnamed: 0,_id,count
0,Above Ground,843
1,Ground Plane,2116
2,Not_noted,64


In [9]:
# Where were they spotted?
query = {'Location' : 'Above Ground'}
fields = {'Unique_Squirrel_ID' : 1, 'Location' : 1, 'Above_Ground_Sighter_Measurement' : 1}
sort = [('Above_Ground_Sighter_Measurement', 1)]
result = list(squirrels.find(query, fields).sort(sort))

# # Save results to a dataframe
height_df = pd.DataFrame(result)
height_df

Unnamed: 0,_id,Unique_Squirrel_ID,Location,Above_Ground_Sighter_Measurement
0,642cca8711739165e7ee2e56,15C-PM-1017-02,Above Ground,0
1,642cca8711739165e7ee2c69,12A-PM-1013-04,Above Ground,1
2,642cca8711739165e7ee2d4f,38E-AM-1010-07,Above Ground,1
3,642cca8711739165e7ee2d54,21H-AM-1017-02,Above Ground,1
4,642cca8711739165e7ee2d82,18C-PM-1018-03,Above Ground,1
...,...,...,...,...
838,642cca8711739165e7ee3576,42H-PM-1014-02,Above Ground,
839,642cca8711739165e7ee35af,31H-PM-1008-04,Above Ground,
840,642cca8711739165e7ee35b2,2A-AM-1010-09,Above Ground,
841,642cca8711739165e7ee35ba,5F-AM-1007-02,Above Ground,


In [10]:
height_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843 entries, 0 to 842
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   _id                               843 non-null    object
 1   Unique_Squirrel_ID                843 non-null    object
 2   Location                          843 non-null    object
 3   Above_Ground_Sighter_Measurement  843 non-null    object
dtypes: object(4)
memory usage: 26.5+ KB


In [None]:
# filter by those with a above ground sighter measurement
# do a chart with results - possibly histogram
# change ave ground sighter measurement to number

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843 entries, 0 to 842
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   _id                               843 non-null    object
 1   Unique Squirrel ID                843 non-null    object
 2   Location                          843 non-null    object
 3   Above Ground Sighter Measurement  843 non-null    object
dtypes: object(4)
memory usage: 26.5+ KB


<h5> Average height for those above ground

In [11]:
# Above ground average height
match_query = {'$match': {'Location': 'Above_Ground'}}
group_query = {'$group': {'_id': "$Location", 'count': { '$sum': 1 }}}
sort_values = {'$sort': { 'count': -1 }}

pipeline = [match_query, group_query, sort_values]

In [12]:
# Run the pipeline through the aggregate method and save the results to a variable
results = list(census_2018.aggregate(pipeline))
pprint(results)

[]


<h5> What was the primary colour of the squirrels spotted?

In [13]:
# What colour were the squirrels
query = [{'$group': {'_id': "$Primary_Fur_Color", 'count': { '$sum': 1 }}}]
results = list(squirrels.aggregate(query))
pprint(results)

# Save results to a dataframe
color_df = pd.DataFrame(results)
color_df

## sort the results

[{'_id': 'Cinnamon', 'count': 392},
 {'_id': 'Gray', 'count': 2473},
 {'_id': 'Not_noted', 'count': 55},
 {'_id': 'Black', 'count': 103}]


Unnamed: 0,_id,count
0,Cinnamon,392
1,Gray,2473
2,Not_noted,55
3,Black,103


<h5> Age of squirrel

In [14]:
# What age were the squirrels
query = [{'$group': {'_id': "$Age", 'count': { '$sum': 1 }}}]
results = list(squirrels.aggregate(query))
pprint(results)

# Save results to a dataframe
age_df = pd.DataFrame(results)
age_df

## sort the results

[{'_id': 'Adult', 'count': 2568},
 {'_id': 'Juvenile', 'count': 330},
 {'_id': 'Not_noted', 'count': 125}]


Unnamed: 0,_id,count
0,Adult,2568
1,Juvenile,330
2,Not_noted,125


<h5> What the squirrel was doing

In [14]:
# What the squirrels were doing
query = [{'$group': {'_id': "$Running", 'count': { '$sum': 1 }}}]
eating = list(squirrels.aggregate(query))
pprint(eating)

query = [{'$group': {'_id': "$Running", 'count': { '$sum': 1 }}}]
running = list(squirrels.aggregate(query))
pprint(running)




[{'_id': 'FALSE', 'count': 2293}, {'_id': 'TRUE', 'count': 730}]
[{'_id': 'FALSE', 'count': 2293}, {'_id': 'TRUE', 'count': 730}]


In [12]:
query = [{'$group': {'_id': "$Hectare_Squirrel_Number", 'count': { '$sum': 1 }}}]
hectare = list(squirrels.aggregate(query))
pprint(hectare)

[{'_id': 19, 'count': 1},
 {'_id': 1, 'count': 614},
 {'_id': 13, 'count': 23},
 {'_id': 21, 'count': 1},
 {'_id': 11, 'count': 42},
 {'_id': 3, 'count': 441},
 {'_id': 15, 'count': 10},
 {'_id': 17, 'count': 4},
 {'_id': 10, 'count': 54},
 {'_id': 22, 'count': 1},
 {'_id': 7, 'count': 161},
 {'_id': 18, 'count': 1},
 {'_id': 8, 'count': 119},
 {'_id': 14, 'count': 16},
 {'_id': 6, 'count': 223},
 {'_id': 5, 'count': 287},
 {'_id': 9, 'count': 85},
 {'_id': 4, 'count': 364},
 {'_id': 2, 'count': 533},
 {'_id': 23, 'count': 1},
 {'_id': 16, 'count': 8},
 {'_id': 20, 'count': 1},
 {'_id': 12, 'count': 33}]


In [11]:
hectare.len()

NameError: name 'hectare' is not defined