# Welcome to the Bird Probability Project

To run this code, ensure you have downloaded the 'bird_data.csv' data set. See README.md for details on where to download the data.

In [131]:
import math
import pandas as pd
import numpy as np
import datetime
import random
import json


In [132]:
# One-off preprocessing, kept for reference: it reads the raw tab-separated
# eBird export, keeps only the seven columns this notebook uses, and writes the
# slimmed-down 'bird_data.csv' that is loaded below.
# data = pd.read_csv('ebd_US-CO_202001_202312_smp_relMar-2024/ebd_US-CO_202001_202312_smp_relMar-2024.txt', sep = '\t')
# data = data[['COMMON NAME', 'COUNTY', 'COUNTY CODE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'NUMBER OBSERVERS']]
# data.to_csv('bird_data.csv')

# Load the prepared observations; `data` is the module-level frame that the
# rest of the notebook reads.
# NOTE(review): the preview output shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns, which look like row indexes saved by earlier to_csv calls - confirm,
# and consider writing with index=False.
data = pd.read_csv('bird_data.csv')

# Preview the first five rows.
data.head(5)

Unnamed: 0.1,Unnamed: 0,COMMON NAME,COUNTY,COUNTY CODE,LATITUDE,LONGITUDE,OBSERVATION DATE,NUMBER OBSERVERS
0,0,Accipiter sp.,Adams,US-CO-001,39.947339,-104.747209,2020-01-10,1.0
1,1,American Crow,Adams,US-CO-001,39.797274,-104.93106,2020-01-01,5.0
2,2,American Crow,Adams,US-CO-001,39.789034,-104.90584,2020-01-02,1.0
3,3,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0
4,4,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0


In [133]:
# Distinct county names and distinct bird common names found in the data set.
counties = data['COUNTY'].unique()
birds = data['COMMON NAME'].unique()

# Current local date and time, captured once when this cell runs.
rawDate = datetime.datetime.now()

# Month and day of "today"; these are later passed to getBirds to pick the
# seasonal window.
month = rawDate.month
day = rawDate.day


def getBirds(day, month, plusOrMinus, frame=None):
    """
    Return the observations recorded within plusOrMinus days of a calendar date.

    The year of every observation is ignored, so sightings from all years that
    fall near the same day and month are kept. Distances are measured on a
    circular calendar, so late December and early January count as close.

    day: day of the month of the target date (1-31)
    month: month of the target date (1-12)
    plusOrMinus: maximum number of days, either side of the target, to keep
    frame: optional data frame with an 'OBSERVATION DATE' column holding
           'YYYY-MM-DD' values; defaults to the module-level `data`

    returns the matching rows of the frame, in their original order
    """
    if frame is None:
        frame = data

    # Days elapsed before the first of each month, using a leap year so that
    # observations made on 29 February still map to a valid day of the year.
    daysBeforeMonth = np.cumsum([0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30])
    daysInYear = 366

    # In 'YYYY-MM-DD', characters 5-6 hold the month and 8-9 hold the day.
    dates = frame['OBSERVATION DATE'].astype(str)
    obsMonth = dates.str[5:7].astype(int).to_numpy()
    obsDay = dates.str[8:10].astype(int).to_numpy()

    obsDayOfYear = daysBeforeMonth[obsMonth - 1] + obsDay
    targetDayOfYear = daysBeforeMonth[month - 1] + day

    # Take the shorter way round the year so the window wraps over New Year.
    gap = np.abs(obsDayOfYear - targetDayOfYear)
    gap = np.minimum(gap, daysInYear - gap)

    # A plain boolean array selects by position, so this works whatever index
    # the frame carries.
    return frame[gap <= plusOrMinus]


class Item:
    """
    A named place (a county in this notebook) described by its bird counts.

    name: label of the item
    birds: accepted for interface compatibility; not used by this class
    birdCounts: optional dictionary of bird name -> observation count
    """

    def __init__(self, name, birds, birdCounts=None):
        self.name = name
        self.features = {}
        # Build a fresh dict per instance: a `{}` default in the signature
        # would be shared by every Item created without explicit counts.
        self.birdCounts = {} if birdCounts is None else birdCounts  # bird name -> count

    def getFeatures(self):
        """
        Fill self.features with each count divided by the largest count.

        With no counts at all, features are left untouched. When the largest
        count is 0 the values are left unscaled to avoid dividing by zero.
        """
        if not self.birdCounts:
            return
        big = max(self.birdCounts.values())
        scale = big if big else 1
        for bird, count in self.birdCounts.items():
            self.features[bird] = count / scale
    
class User:
    """
    A user described by the ratings they gave to birds.

    ratings: dictionary of bird name -> numeric rating
    birds: accepted for interface compatibility; not used by this class
    """

    def __init__(self, ratings, birds):
        self.features = {}
        self.ratings = ratings

    def getFeatures(self):
        """
        Fill self.features with each rating divided by the largest rating.

        With no ratings at all, features are left untouched. When the largest
        rating is 0 the values are left unscaled to avoid dividing by zero.
        """
        if not self.ratings:
            return
        big = max(self.ratings.values())
        scale = big if big else 1
        for bird, rating in self.ratings.items():
            self.features[bird] = rating / scale


def eucDist(a, b):
    """
    Return the Euclidean distance between two sequences of numbers.

    Values are paired position by position with zip, so any extra trailing
    values in the longer sequence are ignored.
    """
    squaredGaps = ((left - right) ** 2 for left, right in zip(a, b))
    return math.sqrt(sum(squaredGaps))
    

### Concern:
Features are going to be the proportion of that bird in the area. Should this be weighted by count? If so that could skew the data so cities with higher populations get better ratings, so probably not. 

In [134]:
# Observations within 10 days either side of today's day and month (a window
# of roughly 20 days), taken across all years.
seasonal_birds_within_20_days = getBirds(day,month, 10)

# Distinct bird names seen inside that seasonal window.
unique_birds_in_season = seasonal_birds_within_20_days['COMMON NAME'].unique()

# Number of observations of each bird within each county.
# NOTE(review): this is computed from the full `data` frame; the seasonal
# subset built above is not used in the lines visible here - confirm whether
# that is intended.
groupByCounty = data.groupby(by = ["COUNTY"])['COMMON NAME'].value_counts()

groupByCounty






# One Item per county, holding that county's bird counts.
countyFeatures = []

for county in counties:
    # Copy the county's counts into a plain dict of bird name -> count.
    counts = {i: groupByCounty[county][i] for i in groupByCounty[county].index}
    countyFeatures.append(Item(county, birds, counts))


# Turn every county's raw counts into features scaled by its largest count.
for county in countyFeatures:
    county.getFeatures()

In [135]:
def getUserInput():
    '''
    Prompt the user for their favorite birds and a rating for each.

    Shows up to ten example bird names drawn from the module-level `birds`,
    then reads lines from standard input until one of these is entered:
      - "Q": quit
      - "C": use the built-in example ratings
      - a JSON object mapping bird names to numeric ratings

    returns the dictionary of bird ratings, or the string 'EXIT' when the
    user quits
    '''
    pool = list(birds)
    # Sample without replacement so the same example bird is not listed twice;
    # cap k so a short bird list cannot raise ValueError.
    randomBirds = random.sample(pool, k=min(10, len(pool)))
    prompt = (
        'Please enter in your favorite birds and your '
        'rating for them out of 5! \nPlease use dictionary '
        'format: \n{{"Canada Goose": 5, "Common Raven":5}} \n'
        'Here are some example birds for you to choose from: \n\n {} \n\n'
        'Enter "Q" to quit\nOr "C" to use the example dictionary as your bird scores'
    )
    print(prompt.format(randomBirds))
    while True:
        # Surrounding whitespace should not stop "Q" or "C" from matching.
        userInput = input().strip()
        if userInput == 'Q':
            print('Exiting...')
            return 'EXIT'
        if userInput == 'C': # Shortcut for testing!!
            return {"Canada Goose": 5, "Common Raven":5}
        try:
            dictBirds = json.loads(userInput)
        except json.JSONDecodeError:
            print("Please make sure your input is in the correct dictionary format.\n")
            continue
        # Valid JSON is not enough: a list, a bare number, an empty object or
        # non-numeric ratings would only fail later when ratings are scaled.
        # bool is excluded explicitly because it is a subclass of int.
        isRatingDict = (
            isinstance(dictBirds, dict)
            and bool(dictBirds)
            and all(
                isinstance(score, (int, float)) and not isinstance(score, bool)
                for score in dictBirds.values()
            )
        )
        if not isRatingDict:
            print("Please make sure your input is in the correct dictionary format.\n")
            continue
        print('Input recieved, calculating best counties for you to visit...\n')
        return dictBirds

            
            
            
            
            
def getTop5Counties(user):
    '''
    Find the five counties whose bird features sit closest to the user's.

    Only birds the user gave a non-zero rating are compared. For each county
    in the module-level countyFeatures, the Euclidean distance (eucDist)
    between the user's ratings and the county's features for those birds is
    computed; a bird missing from a county counts as 0 there.

    returns list with the names of the five counties at the smallest distance
    '''
    # Birds the user actually rated, in the order they appear in the features.
    ratedBirds = {bird: score for bird, score in user.features.items() if score != 0}
    wanted = list(ratedBirds.values())

    scored = [
        (place, eucDist(wanted, [place.features.get(bird, 0) for bird in ratedBirds]))
        for place in countyFeatures
    ]
    # Smallest distance first; sorted() is stable, so ties keep county order.
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [place.name for place, _ in ranked[:5]]





def recomendedCounties():
    '''
    Run the whole recommendation flow in one call.

    Asks the user for bird ratings, scales them into features, ranks the
    counties and prints the five best matches. Prints nothing further when
    the user chooses to quit. Always returns None.
    '''
    ratings = getUserInput()
    if ratings != 'EXIT':
        user = User(ratings, birds)
        user.getFeatures()
        print('The best counties to see the birds you like are: ', getTop5Counties(user))
    
            
        
        
        
# Start the interactive prompt and print the five recommended counties.
recomendedCounties()

Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Canada Goose": 5, "Common Raven":5} 
Here are some example birds for you to choose from: 

 ['Pacific Loon', 'Short-billed Dowitcher', "Black x Say's Phoebe (hybrid)", 'Redhead x Lesser Scaup (hybrid)', 'Northern Harrier', "Forster's/Common Tern", 'jaeger sp.', 'yellow-bellied kingbird sp.', 'Black-bellied Plover/golden-plover sp.', 'Great Horned Owl'] 

Enter "Q" to quit
Or "C" to use the example dictionary as your bird scores


 C


The best counties to see the birds you like are:  ['Jackson', 'Montrose', 'Mesa', 'Montezuma', 'Alamosa']


# Sanity Checks

There are 4 sanity checks. To skip to the results of a sanity check, look in each cell for the large comment surrounded by ''' '''. If you would like to run more components of the sanity checks yourself, uncomment some of the commented-out portions of code.

#### Sanity Check 1:

Test if the recommended counties have a high concentration of the user's favorite bird.

EX: If the user loves Turkey Vultures (the user's normalized Turkey Vulture rating is 1.0), then the recommended counties should have a very high normalized Turkey Vulture rating (around 1.0).

In [124]:
# Game plan:
# - Find a bird which has a rating of 1.0 in a county
# - Make a list of the counties where selectBird has a rating of 1.0
# - Run recomendedCounties() and enter that specific bird. We expect the
#   recommended counties to be ones where that bird's rating is highest.

# Debugging ideas:
# - Inside recomendedCounties
# - - Output the bird scores in the recommended counties
# - - Also output the Euclidean-distance score each county received. We expect
#     counties where selectBird has a rating of 1.0 to share the same score.




# #  This iterates through each county and prints bird names with a score of 1.0
# for county in countyFeatures:
#     print("Birds with a rating of 1.0 in", county.name + ":")
#     for bird, rating in county.features.items():
#         if rating == 1.0:
#             print(bird)

# # Find bird with rating of 1.0 in a county
# selectBird = 'House Finch'
# print("Counties with a", selectBird, "rating of 1.0:")
# for county in countyFeatures:
#     for bird, rating in county.features.items():
#         if bird == selectBird and rating == 1.0:
#             print(county.name)


# Recorded result of this sanity check (the string below is not executed logic).
'''
These counties have a 'House Finch' rating of 1.0
Arapahoe
Boulder
Broomfield
Denver
Douglas
Elbert
El Paso
Jefferson

Running recomendedCounties() and inputting {"House Finch": 5} we expect to get 5 of the above counties recomended to us

It recomends we visit the counties:
['Arapahoe', 'Boulder', 'Broomfield', 'Denver', 'Douglas']

Success
'''
# Enter {"House Finch": 5} at the prompt to reproduce this sanity check.
recomendedCounties()

Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Canada Goose": 5, "Common Raven":5} 
Here are some example birds for you to choose from: 

 ['Least Bittern', 'Golden-winged x Blue-winged Warbler (hybrid)', 'Black-headed Grosbeak', 'Dunlin', 'Greater Yellowlegs', 'White Ibis', 'Glossy/White-faced Ibis', 'Greater Sage-Grouse', 'Surf/Black Scoter', 'Long-tailed/Parasitic Jaeger'] 

Enter "Q" to quit
Or "C" to use the example dictionary as your bird scores


 {"House Finch": 5}


Input recieved, calculating best counties for you to visit...

The best counties to see the birds you like are:  ['Arapahoe', 'Boulder', 'Broomfield', 'Denver', 'Douglas']


#### Sanity Check 2:

Test if the recommended counties have a high concentration of the user's favorite birds.

This is the same as Sanity Check 1, but now with multiple birds which are the user's favorite

In [125]:
# Plan for this check, carried over from sanity check 1 (the string below is
# not executed logic).
'''
Game Plan:
- Find a bird which has a rating of 1.0 in a county
- Make a list of counties which has selectBird rating of 1.0
- Run recomendedCounties() and enter in that specific bird. We expect to see the counties with the highest % of that bird appearing as most frequent.

Debugging code:
- Inside recomended counties
- - Output bird scores in those counties
- - Also output the total score each of the counties got after cosine sim. We expect the counties with select bird ratng of 1.0 have the same rating
'''




# #  This iterates through each county and prints bird names with a score >= 0.99
# for county in countyFeatures:
#     print("Birds with a rating >= 0.99 in", county.name + ":")
#     for bird, rating in county.features.items():
#         if rating >= 0.99:
#             print(bird)

# {"Canada Goose": 5, "Common Raven":5}



# List every county where both Canada Goose and Common Raven have a
# normalized rating of at least 0.7; a bird missing from a county counts as 0.
print("Counties with a rating >= 0.7 of Canada Goose and Common Raven")
for county in countyFeatures:
    canada_goose_rating = county.features.get('Canada Goose', 0)
    common_raven_rating = county.features.get('Common Raven', 0)
    if canada_goose_rating >= 0.7 and common_raven_rating >= 0.7:
        print(county.name)


# Recorded result of this sanity check (the string below is not executed logic).
'''
These counties have a rating >= 0.7 of Canada Goose and Common Raven
Jackson
Mesa
Montrose

We expect these counties to be among the top counties recomended

It recomends we visit the counties:
['Jackson', 'Montrose', 'Mesa', 'Montezuma', 'Alamosa']

Success
'''


# Enter {"Canada Goose": 5, "Common Raven":5} at the prompt; we expect the
# counties printed above to be among the recommended ones.
recomendedCounties()

Counties with a rating >= 0.7 of Canada Goose and Common Raven
Jackson
Mesa
Montrose
Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Canada Goose": 5, "Common Raven":5} 
Here are some example birds for you to choose from: 

 ["Ross's Goose", 'Common Merganser', "Western x Clark's Grebe (hybrid)", 'Common Goldeneye x Hooded Merganser (hybrid)', 'White-winged Crossbill', 'Dunlin', 'bluebird sp.', "Anna's Hummingbird", 'Eastern/Chihuahuan Meadowlark', 'Common Yellowthroat'] 

Enter "Q" to quit
Or "C" to use the example dictionary as your bird scores


 {"Canada Goose": 5, "Common Raven":5}


Input recieved, calculating best counties for you to visit...

The best counties to see the birds you like are:  ['Jackson', 'Montrose', 'Mesa', 'Montezuma', 'Alamosa']


#### Sanity Check 3:
We expect the user's favorite birds to be among the most popular in the recommended counties.

In [128]:
# Sanity check 3: rank two birds by frequency inside one recommended county.
countyName = 'Adams'

# Get the selected county
for county in countyFeatures:
    if county.name == countyName:
        countySelect = county
        break
else:
    # Without this guard a misspelt name would either raise NameError below or
    # silently reuse a countySelect left behind by an earlier cell.
    raise ValueError('County "{}" not found in countyFeatures'.format(countyName))

# Sort birds by frequency in the county, most frequent first
sorted_birds = sorted(countySelect.features.items(), key=lambda x: x[1], reverse=True)


# Find the ranking of Bald Eagle and Red-winged Blackbird.
# A rank stays None when the bird has no feature in this county.
bald_eagle_rank = None
red_winged_blackbird_rank = None

for i, (bird, _) in enumerate(sorted_birds, start=1):
    if bird == "Bald Eagle":
        bald_eagle_rank = i
    elif bird == "Red-winged Blackbird":
        red_winged_blackbird_rank = i

# NOTE(review): the "out of" figure is len(birds), the number of distinct
# birds in the whole data set, not the number of birds in this county.
print("Ranking of 'Bald Eagle' in",countySelect.name,"County: top", bald_eagle_rank,'out of',len(birds),'birds')
print("Ranking of 'Red-winged Blackbird' in",countySelect.name,"County: top", red_winged_blackbird_rank,'out of',len(birds),'birds')


# Recorded result of this sanity check (the string below is not executed logic).
'''
In this test we entered n {"Bald Eagle": 5, "Red-winged Blackbird": 5}
as our birds which are our most favorite

The counties it recomends we visit are:
['Adams', 'Morgan', 'Logan', 'Moffat', 'Washington']

The top county is 'Adams' county

In Adams county the 'Bald Eagle' is the top 15 most frequently seen out of 746 birds
and the 'Red-winged Blackbird' is in the top 3 most frequntly out of 746 birds

Success
'''


# Enter {"Bald Eagle": 5, "Red-winged Blackbird": 5} to run this sanity check
recomendedCounties()

Ranking of 'Bald Eagle' in Adams County: top 15 out of 746 birds
Ranking of 'Red-winged Blackbird' in Adams County: top 3 out of 746 birds
Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Canada Goose": 5, "Common Raven":5} 
Here are some example birds for you to choose from: 

 ['Rock Pigeon', 'Common Goldeneye', 'Peregrine Falcon', "Vaux's Swift", 'Budgerigar', 'Northern Bobwhite', 'Ash-throated Flycatcher', 'Ammospiza sp.', 'Anas sp.', 'Selasphorus sp.'] 

Enter "Q" to quit
Or "C" to use the example dictionary as your bird scores


 {"Bald Eagle": 5, "Red-winged Blackbird": 5}


Input recieved, calculating best counties for you to visit...

The best counties to see the birds you like are:  ['Adams', 'Morgan', 'Logan', 'Moffat', 'Washington']


#### Sanity Check 4:
Another way to sanity check is to work backwards

So we will take a selected county and see what its most popular birds are; then, when a user enters those birds as their favorites, that county should be among the recommended ones.

In [130]:
# Sanity check 4: work backwards from one county's most frequent birds.
countyName = 'Phillips'

# Get the selected county
for county in countyFeatures:
    if county.name == countyName:
        countySelect = county
        break
else:
    # Without this guard a misspelt name would silently reuse the countySelect
    # left behind by an earlier cell and report the wrong county's birds.
    raise ValueError('County "{}" not found in countyFeatures'.format(countyName))

# Print top 5 birds in the selected county, most frequent first
sorted_birds = sorted(countySelect.features.items(), key=lambda x: x[1], reverse=True)
print("Top 5 birds in",countySelect.name,"county:", sorted_birds[:5], "\n")

# Recorded result of this sanity check (the string below is not executed logic).
'''
Here are the top 5 birds from Phillips county to test out
[('Mourning Dove', 1.0),
('Eurasian Collared-Dove', 0.9581151832460733), 
('Western Meadowlark', 0.9336823734729494), 
('Horned Lark', 0.9075043630017452), 
('American Robin', 0.8446771378708552)]

So we enter in:
{"Mourning Dove": 5, "Eurasian Collared-Dove": 5, "Western Meadowlark": 5, "Horned Lark": 5, "American Robin": 5}

Here are the counties it recomends we visit:
['Phillips', 'Kit Carson', 'Sedgwick', 'Morgan', 'Yuma']

Success
'''
# Enter the five-bird dictionary shown above at the prompt to run this check.
recomendedCounties()

Top 5 birds in Phillips county: [('Mourning Dove', 1.0), ('Eurasian Collared-Dove', 0.9581151832460733), ('Western Meadowlark', 0.9336823734729494), ('Horned Lark', 0.9075043630017452), ('American Robin', 0.8446771378708552)] 

Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Canada Goose": 5, "Common Raven":5} 
Here are some example birds for you to choose from: 

 ["Western/Clark's Grebe", 'Acorn Woodpecker', 'Trumpeter Swan', 'Lark Bunting', "Bewick's Wren", 'Parasitic Jaeger', 'Red-necked Phalarope', 'Brant', 'Western/Chihuahuan Meadowlark', 'Golden Eagle'] 

Enter "Q" to quit
Or "C" to use the example dictionary as your bird scores


 {"Mourning Dove": 5, "Eurasian Collared-Dove": 5, "Western Meadowlark": 5, "Horned Lark": 5, "American Robin": 5}


Input recieved, calculating best counties for you to visit...

The best counties to see the birds you like are:  ['Phillips', 'Kit Carson', 'Sedgwick', 'Morgan', 'Yuma']
