# Welcome to the Bird Probability Project

To run this code ensure you have downloaded the 'bird_data.csv' data set. See the README.md for details on where to download the data

In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import json


In [2]:
# data = pd.read_csv('ebd_US-CO_202001_202312_smp_relMar-2024/ebd_US-CO_202001_202312_smp_relMar-2024.txt', sep = '\t')
# data = data[['COMMON NAME', 'COUNTY', 'COUNTY CODE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'NUMBER OBSERVERS']]
# data.to_csv('bird_data.csv')

data = pd.read_csv('bird_data.csv')

data.head(5)

Unnamed: 0.1,Unnamed: 0,COMMON NAME,COUNTY,COUNTY CODE,LATITUDE,LONGITUDE,OBSERVATION DATE,NUMBER OBSERVERS
0,0,Accipiter sp.,Adams,US-CO-001,39.947339,-104.747209,2020-01-10,1.0
1,1,American Crow,Adams,US-CO-001,39.797274,-104.93106,2020-01-01,5.0
2,2,American Crow,Adams,US-CO-001,39.789034,-104.90584,2020-01-02,1.0
3,3,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0
4,4,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0


In [32]:
counties = data['COUNTY'].unique()
birds = data['COMMON NAME'].unique()

rawDate = datetime.datetime.now()

month = rawDate.month
day = rawDate.day


def getBirds(day, month, plusOrMinus):
    """
    returns a data frame of birds that have been observed from various years during the same day and month
    plus_or_minus so many days.
    
    """
    def distance(date, target):
        '''
        Helper function
        returns the number of days away the date is from the target
        '''
        givenMonth = int(date[5:7])
        targetMonth = int(target[5:7])
        givenDay = int(date[8:])
        targetDay = int(target[8:])
        
        total = abs(givenMonth - targetMonth)*30 #grab the difference in months
        
        if givenMonth > targetMonth:
            total = total - targetDay + givenDay
        elif givenMonth < targetMonth:
            total = total - givenDay + targetDay
        else: 
            total = abs(givenDay - targetDay)
        
        return total
        
     
    if month > 10:
        target = 'xxxx-{}-{}'.format(month,day) if day > 10 else 'xxxx-{}-0{}'.format(month,day)
    else:
        target = 'xxxx-0{}-{}'.format(month,day) if day > 10 else 'xxxx-0{}-0{}'.format(month,day)
        

    # This is not the best implementation, I am just lazy and didn't want to search for the right one (help if you can)
    mask = pd.Series([distance(date, target) <= plusOrMinus for date in data['OBSERVATION DATE']])
        
    return data[mask]


class County:
    def __init__(self, name, birds, birdCounts):
        self.name = name
        self.features = {bird: 0.0 for bird in birds}
        self.birdCounts = birdCounts #dictionary of bird name and count 
        
    def getFeatures(self):
        big = max(list(self.birdCounts.values()))
        for bird in self.birdCounts:
            self.features[bird] = self.birdCounts[bird]/big
        
    
class User:
    def __init__(self, ratings, birds):
        self.features = {bird: 0.0 for bird in birds}
        self.ratings = ratings
        
    def getFeatures(self):
        big = max(list(self.ratings.values()))
        for bird in self.ratings:
            self.features[bird] = self.ratings[bird]/big


def cosSim(a, b):           
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    

### Concern:
Features are going to be the proportion of that bird in the area. Should this be weighted by count? If so that could skew the data so cities with higher populations get better ratings, so probably not. 

In [33]:
seasonal_birds_within_20_days = getBirds(day,month, 10)

unique_birds_in_season = seasonal_birds_within_20_days['COMMON NAME'].unique()

groupByCounty = data.groupby(by = ["COUNTY"])['COMMON NAME'].value_counts()

groupByCounty


COUNTY  COMMON NAME         
Adams   Canada Goose            19275
        European Starling       18137
        Red-winged Blackbird    17126
        Mallard                 17013
        Red-tailed Hawk         14750
                                ...  
Yuma    grebe sp.                   1
        heron sp.                   1
        small falcon sp.            1
        teal sp.                    1
        wren sp.                    1
Name: COMMON NAME, Length: 22281, dtype: int64

In [34]:
countyFeatures = []

for county in counties:
    counts = {i: groupByCounty[county][i] for i in groupByCounty[county].index}
    countyFeatures.append(County(county, birds, counts))


for county in countyFeatures:
    county.getFeatures()

In [61]:
def getUserInput():
    '''
    Recieve input from user to get user's favorite bird scores
    returns dictionary of user's bird scores
    '''
    randomBirds = random.choices(list(birds), k=10)
    print('Please enter in your favorite birds and your '\
              'rating for them out of 5! \nPlease use dictionary '\
              'format: \n{{"Bald Eagle": 5, "Red-winged Blackbird": 4}} \n'\
              'Here are some example birds for you to choose from: \n\n {} \n\n'\
              'Enter "Q" to quit\nOr "B" to use the example dictionary as your bird scores'.format(randomBirds))
    while True:  
        userInput = input()
        if userInput == 'Q':
            print('Exiting...')
            return 'EXIT'
        if userInput == 'B': # Shortcut for testing!!
            return {"Bald Eagle": 5, "Red-winged Blackbird": 4}
        try:
            dictBirds = json.loads(userInput)
            print('Input recieved, calculating best counties for you to visit...\n')
            return dictBirds
        except json.JSONDecodeError:
            print("Please make sure your input is in the correct dictionary format.\n")

def getTop5Scores(user):
    '''
    Find top 5 counties for user
    Use cosine similarity to compare user's scores to county's bird populations
    returns list containing top 5 counties
    '''
    
    countyScore = []
    
    for county in countyFeatures:
        # Only calculate cosine similrity on
        # birds (in counties) the user has rated (has a non-zero value)
        user_ratings = []
        county_ratings = []
        for bird, user_rating in user.ratings.items():
            if user_rating != 0:
                user_ratings.append(user_rating)
                county_ratings.append(county.features.get(bird, 0))  # If bird not in county features, use 0
        countyScore.append((county.name, cosSim(user_ratings, county_ratings)))
    
    
#     # Old way of calculating county score
#     for county in countyFeatures:
#         # print('county values:', list(county.features.values()))
#         # print('user values:', list(user.features.values()))
        
#         countyScore.append((county.name, cosSim(list(county.features.values()), list(user.features.values()))))
    countyScore = sorted(countyScore, key = lambda x: x[1])
    top5 = [x[0] for x in countyScore[:5]]
    return top5



    
def recomendedCounties():
    ratings = getUserInput() # Get user ratings
    if ratings == 'EXIT':
        return
    user = User(ratings, birds)
    user.getFeatures()
    top5 = getTop5Scores(user) 
    print('The best counties to see the birds you like are: ', top5)
    
recomendedCounties()

Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Bald Eagle": 5, "Red-winged Blackbird": 4} 
Here are some example birds for you to choose from: 

 ['Yellow-crowned/Black-crowned Night Heron', 'merganser sp.', 'Black Phoebe', 'Western/Semipalmated Sandpiper', 'Sterna sp.', 'Pectoral Sandpiper', 'Lesser/Greater Yellowlegs', 'Cape May Warbler', 'Turkey Vulture', 'Redhead x Ring-necked Duck (hybrid)'] 

Enter "Q" to quit
Or "B" to use the example dictionary as your bird scores


 B


The best counties to see the birds you like are:  ['Kit Carson', 'Phillips', 'Lincoln', 'Yuma', 'Prowers']


## Sanity Check



#### Sanity check 1: 
We expect the user's favorite birds are among the most popular in the recomended counties


In [None]:
'''
Enter in a county name here to test if your favorite birds are among its most popular
'''
countyName = 'San Juan'


# Get the seltected county
for county in countyFeatures:
    if county.name == countyName:
        countySelect = county
        break

# Sort birds by frequency in the county
sorted_birds = sorted(countySelect.features.items(), key=lambda x: x[1], reverse=True)


# Find the ranking of Bald Eagle and Red-winged Blackbird
bald_eagle_rank = None
red_winged_blackbird_rank = None

for i, (bird, _) in enumerate(sorted_birds, start=1):
    if bird == "Bald Eagle":
        bald_eagle_rank = i
    elif bird == "Red-winged Blackbird":
        red_winged_blackbird_rank = i

print("Ranking of 'Bald Eagle' in",countySelect.name,"County:", bald_eagle_rank,'out of',len(birds),'birds')
print("Ranking of 'Red-winged Blackbird' in",countySelect.name,"County:", red_winged_blackbird_rank,'out of',len(birds),'birds')

# Enter 
recomendedCounties()

Ranking of 'Bald Eagle' in San Juan County: 94 out of 746 birds
Ranking of 'Red-winged Blackbird' in San Juan County: 40 out of 746 birds
Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Bald Eagle": 5, "Red-winged Blackbird": 4} 
Here are some example birds for you to choose from: 

 ["MacGillivray's x Mourning Warbler (hybrid)", 'Surf Scoter', 'Streptopelia sp.', 'Canyon Wren', "Black x Say's Phoebe (hybrid)", "Ross's x Greater White-fronted Goose (hybrid)", 'Mallard x Mexican Duck (hybrid)', 'merganser sp.', 'screech-owl sp.', 'Eastern Wood-Pewee'] 

Enter "Q" to quit
Or "B" to use the example dictionary as your bird scores


#### Sanity Check 2:
Another way to sanity check is to work backwards

So we will take a selected county, see what it's most popular birds are, and then when a user inputs those birds as being their top, that county should be among the recomended

In [65]:
countyName = 'Phillips'

# Get the seltected county
for county in countyFeatures:
    if county.name == countyName:
        countySelect = county
        break
        
# Print top 5 birds in the selected county
sorted_birds = sorted(countySelect.features.items(), key=lambda x: x[1], reverse=True)
print("Top 5 birds in",countySelect.name,"county:", sorted_birds[:5], "\n")

'''
Here are the top 5 birds from San Juan county to test out
{"American Robin": 1, "Pine Siskin": 1, "Dark-eyed Junco": 1, "Mountain Chickadee": 1, "White-crowned Sparrow": 1}

When I enter in the above birds, I get these as my recomended counties: 
['Phillips', 'Weld', 'Arapahoe', 'Logan', 'Broomfield']
'''






recomendedCounties()

Top 5 birds in Phillips county: [('Mourning Dove', 1.0), ('Eurasian Collared-Dove', 0.9581151832460733), ('Western Meadowlark', 0.9336823734729494), ('Horned Lark', 0.9075043630017452), ('American Robin', 0.8446771378708552)] 

Please enter in your favorite birds and your rating for them out of 5! 
Please use dictionary format: 
{"Bald Eagle": 5, "Red-winged Blackbird": 4} 
Here are some example birds for you to choose from: 

 ['Eurasian/African Collared-Dove', 'Mexican/Mottled Duck', 'Rough-legged Hawk', 'American Goldfinch', 'Red-necked Grebe', 'Cave Swallow', 'small plover sp.', 'gull sp.', 'swift sp.', 'Gray Flycatcher'] 

Enter "Q" to quit
Or "B" to use the example dictionary as your bird scores


 Q


Exiting...
