# Welcome to the Bird Probability Project

In [1]:
import pandas as pd
import numpy as np
import datetime 


In [2]:
# data = pd.read_csv('ebd_US-CO_202001_202312_smp_relMar-2024/ebd_US-CO_202001_202312_smp_relMar-2024.txt', sep = '\t')
# data = data[['COMMON NAME', 'COUNTY', 'COUNTY CODE', 'LATITUDE', 'LONGITUDE', 'OBSERVATION DATE', 'NUMBER OBSERVERS']]
# data.to_csv('bird_data.csv')

data = pd.read_csv('bird_data.csv')

data.head(5)

Unnamed: 0.1,Unnamed: 0,COMMON NAME,COUNTY,COUNTY CODE,LATITUDE,LONGITUDE,OBSERVATION DATE,NUMBER OBSERVERS
0,0,Accipiter sp.,Adams,US-CO-001,39.947339,-104.747209,2020-01-10,1.0
1,1,American Crow,Adams,US-CO-001,39.797274,-104.93106,2020-01-01,5.0
2,2,American Crow,Adams,US-CO-001,39.789034,-104.90584,2020-01-02,1.0
3,3,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0
4,4,American Crow,Adams,US-CO-001,39.830492,-104.948991,2020-01-01,10.0


In [51]:
counties = data['COUNTY'].unique()
birds = data['COMMON NAME'].unique()

rawDate = datetime.datetime.now()

month = rawDate.month
day = rawDate.day


def getBirds(day, month, plusOrMinus):
    """
    returns a data frame of birds that have been observed from various years during the same day and month
    plus_or_minus so many days.
    
    """
    def distance(date, target):
        '''
        Helper function
        returns the number of days away the date is from the target
        '''
        givenMonth = int(date[5:7])
        targetMonth = int(target[5:7])
        givenDay = int(date[8:])
        targetDay = int(target[8:])
        
        total = abs(givenMonth - targetMonth)*30 #grab the difference in months
        
        if givenMonth > targetMonth:
            total = total - targetDay + givenDay
        elif givenMonth < targetMonth:
            total = total - givenDay + targetDay
        else: 
            total = abs(givenDay - targetDay)
        
        return total
        
     
    if month > 10:
        target = 'xxxx-{}-{}'.format(month,day) if day > 10 else 'xxxx-{}-0{}'.format(month,day)
    else:
        target = 'xxxx-0{}-{}'.format(month,day) if day > 10 else 'xxxx-0{}-0{}'.format(month,day)
        

    # This is not the best implementation, I am just lazy and didn't want to search for the right one (help if you can)
    mask = pd.Series([distance(date, target) <= plusOrMinus for date in data['OBSERVATION DATE']])
        
    return data[mask]


class Item:
    def __init__(self, name, birds, birdCounts = {}):
        self.name = name
        self.features = {}
        self.birdCounts = birdCounts #dictionary of bird name and count 
        
    def getFeatures(self):
        big = max(list(self.birdCounts.values()))
        for bird in self.birdCounts:
            self.features[bird] = self.birdCounts[bird]/big
        
    
class User:
    def __init__(self, ratings, birds):
        self.features = {}
        self.ratings = ratings
        
    def getFeatures(self):
        big = max(list(self.ratings.values()))
        for bird in self.ratings:
            self.features[bird] = self.ratings[bird]/big


def cosSim(user, item):
    itemFeatureVector = []
    for feature in user.features:
        if feature in item.features:
            itemFeatureVector.append(item.features[feature])
        else:
            itemFeatureVector.append(0)
    
    a = list(user.features.values())
    b = itemFeatureVector
    norma = np.linalg.norm(a)
    normb = np.linalg.norm(b)
    
    if norma == 0 or normb == 0:
        return 0
    
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    

### Concern:
Features are going to be the proportion of that bird in the area. Should this be weighted by count? If so that could skew the data so cities with higher populations get better ratings, so probably not. 

In [4]:
seasonal_birds_within_20_days = getBirds(day,month, 10)

unique_birds_in_season = seasonal_birds_within_20_days['COMMON NAME'].unique()

groupByCounty = data.groupby(by = ["COUNTY"])['COMMON NAME'].value_counts()

groupByCounty


COUNTY  COMMON NAME               
Adams   Canada Goose                  19275
        European Starling             18137
        Red-winged Blackbird          17126
        Mallard                       17013
        Red-tailed Hawk               14750
                                      ...  
Yuma    Lesser/Greater Yellowlegs         1
        Loggerhead/Northern Shrike        1
        Fox Sparrow                       1
        Eurasian/American Wigeon          1
        wren sp.                          1
Name: count, Length: 22281, dtype: int64

In [46]:
countyFeatures = []

for county in counties:
    counts = {i: groupByCounty[county][i] for i in groupByCounty[county].index}
    countyFeatures.append(Item(county, birds, counts))


for county in countyFeatures:
    county.getFeatures()

In [53]:
ratings = {'American Crow': 5, "Red-winged Blackbird": 4, 'Red-tailed Hawk': 5}

user = User(ratings, birds)
user.getFeatures()

countyScore = []

for county in countyFeatures:
    countyScore.append((county.name, cosSim(county, user)))
    
countyScore = sorted(countyScore, key = lambda x: x[1], reverse= True)

print('The top county bases on your ratings are: ', [x[0] for x in countyScore[:5]])




The top county bases on your ratings are:  ['Archuleta', 'Routt', 'Eagle', 'Summit', 'Denver']


In [56]:

def getBestLocInCounty(county, user):
    step = 0.01
    currentCounty = seasonal_birds_within_20_days[seasonal_birds_within_20_days['COUNTY'] == county]
    
    minLat = min(currentCounty["LATITUDE"])
    maxLat = max(currentCounty["LATITUDE"])

    minLong = min(currentCounty["LONGITUDE"])
    maxLong = max(currentCounty["LONGITUDE"])
    
    grid = np.array([Item((lat, long), birds) for long in np.arange(minLong, maxLong, step)
                 for lat in np.arange(minLat, maxLat, step)])
    
    scores = []
    
    for item in grid:
        i = 0
        upperLat = item.name[0] + step/2
        lowerLat = item.name[0] - step/2
        upperLong = item.name[1]  + step/2
        lowerLong = item.name[1] - step/2
        
        underLat = currentCounty['LATITUDE'] <= upperLat
        overLat = currentCounty['LATITUDE'] <= lowerLat
        
        
        underLong = currentCounty['LONGITUDE'] <= upperLong
        overLong = currentCounty['LONGITUDE'] <= lowerLong
        
        inLat = np.logical_and(underLat, overLat)
        
        inLong = np.logical_and(underLong, overLong)
        
        inRange = np.logical_and(inLat, inLong)
        
        birdsInRange = currentCounty[inRange]['COMMON NAME'].value_counts()
        
        
        if len(birdsInRange) == 0: continue;
            
        
        counts = {i: birdsInRange[i] for i in birdsInRange.index}
        
        item.birdCounts = counts
        
        item.getFeatures()
        
        scores.append([item.name, cosSim(user, item)])
        
        i += 1
    
    
    scores = sorted(scores, key = lambda x: x[1], reverse = True)
    
    return scores[:5]
    
    
        
        
    
    

getBestLocInCounty('Boulder', User(user, birds))

[[(39.96417829999999, -105.5909427), 0],
 [(39.97417829999999, -105.5909427), 0],
 [(39.98417829999999, -105.5909427), 0],
 [(39.99417829999999, -105.5909427), 0],
 [(40.004178299999985, -105.5909427), 0]]

In [39]:
currentCounty = data[data['COUNTY'] == 'Boulder']

currentCounty[np.logical_and(currentCounty['LATITUDE'] >= 39.961189, currentCounty['LONGITUDE'] >= -105.5)]

Unnamed: 0.1,Unnamed: 0,COMMON NAME,COUNTY,COUNTY CODE,LATITUDE,LONGITUDE,OBSERVATION DATE,NUMBER OBSERVERS
25266,25266,Accipiter sp.,Boulder,US-CO-013,40.209360,-105.280352,2020-01-26,2.0
25267,25267,Accipiter sp.,Boulder,US-CO-013,40.209360,-105.280352,2020-01-26,2.0
25268,25268,Accipiter sp.,Boulder,US-CO-013,40.209360,-105.280352,2020-01-12,11.0
25272,25272,American Crow,Boulder,US-CO-013,40.016368,-105.189128,2020-01-20,1.0
25273,25273,American Crow,Boulder,US-CO-013,40.192098,-105.074788,2020-01-27,2.0
...,...,...,...,...,...,...,...,...
12595209,12595209,Yellow-rumped Warbler,Boulder,US-CO-013,39.967747,-105.231814,2023-12-18,1.0
12595210,12595210,Yellow-rumped Warbler,Boulder,US-CO-013,40.030100,-105.254231,2023-12-09,1.0
12595211,12595211,Yellow-rumped Warbler,Boulder,US-CO-013,40.029662,-105.254252,2023-12-09,2.0
12595212,12595212,Yellow-rumped Warbler,Boulder,US-CO-013,40.029662,-105.254252,2023-12-09,2.0
