# BBS Bird Species Project

## 1. Import Libraries

In [None]:
# pip install geopandas
# pip install fuzzywuzzy
# pip install folium
# pip install python-Levenshtein
# pip install streamlit streamlit-folium

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd

from shapely.geometry import Point

In [None]:
import folium
from folium.plugins import HeatMap       # for more interactive heatmaps

# import fuzzywuzzy
# from fuzzywuzzy import process           # for NLP in the input validation process

## 2. Load and Combine Datasets

In [None]:
birds = pd.read_csv("BBS_bird_dataset.csv")

In [None]:
visits = pd.read_csv("BBS_visit_dataset.csv")

In [None]:
birds.head()

In [None]:
# Row-wise total across the ten columns section_1 ... section_10
birds['total_obs'] = birds[[f'section_{i}' for i in range(1, 11)]].sum(axis=1)

In [None]:
birds.head()

In [None]:
# Remove the ten columns section_1 ... section_10
birds = birds.drop(columns=[f'section_{i}' for i in range(1, 11)])

In [None]:
birds.head()

In [None]:
# Join each bird row to its visit record on (year, square, visit).
# No `how` is given, so this is pandas' default inner join: bird rows with no
# matching visit (and visits with no bird rows) are not carried into sightings.
sightings = pd.merge(birds, visits, on=['year', 'square', 'visit'])

In [None]:
sightings.head()

In [None]:
sightings = sightings.drop(columns = ['parent_square'])

In [None]:
sightings.head()

> **TO DO (completed in the cells below): add the latitude/longitude data and the actual species names**

In [None]:
coordinates = pd.read_csv('grid_square_coordinates_lookup.csv')

In [None]:
sightings = sightings.merge(coordinates[['square', 'ETRS89Lat', 'ETRS89Long']], 
                      on='square',        # grid square as the unique key
                      how='left')         # keep all rows in sightings, adding latitude/longitude

In [None]:
sightings.head()

In [None]:
species = pd.read_csv('species_lookup.csv')

In [None]:
sightings = sightings.merge(species[['species_code', 'English_name']], 
                      on='species_code',       # species code is the key
                      how='left')              # keep all rows in sightings, adding species name

In [None]:
sightings.head()

## 3. Preview Data

In [None]:
sightings.head()

In [None]:
sightings.tail()

In [None]:
sightings.info()

In [None]:
sightings.describe().apply(lambda x: x.apply('{0:.3f}'.format))   # display everything to 3 d.p.

## 4. Data Cleaning and Preprocessing

In [None]:
sightings.isnull().sum()

In [None]:
sightings.dtypes

> **NULL HANDLING**
> * distance_band: already a categorical (object) column containing codes such as 'F' -> use 'X' to denote missing data
> * cloud, rain, wind, visibility: numerical codes 1-3 for different levels -> fill with 0 to mean "no data entered"
> * t1st, t1et, t2st, t2et: could be filled with their mean, mode, or **median** -> the median is used below

In [None]:
print(f'the mean for start time of the first period is:{sightings.t1st.mean()}')
print(f'the median for start time of the first period is:{sightings.t1st.median()}')
print(f'the mode for start time of the first period is:{sightings.t1st.mode()}')

In [None]:
print(f'the mean for end time of the first period is:{sightings.t1et.mean()}')
print(f'the median for end time of the first period is:{sightings.t1et.median()}')
print(f'the mode for end time of the first period is:{sightings.t1et.mode()}')

In [None]:
print(f'the mean for start time of the second period is:{sightings.t2st.mean()}')
print(f'the median for start time of the second period is:{sightings.t2st.median()}')
print(f'the mode for start time of the second period is:{sightings.t2st.mode()}')

In [None]:
print(f'the mean for end time of the second period is:{sightings.t2et.mean()}')
print(f'the median for end time of the second period is:{sightings.t2et.median()}')
print(f'the mode for end time of the second period is:{sightings.t2et.mode()}')

In [None]:
sightings['distance_band'] = sightings['distance_band'].fillna('X')

In [None]:
sightings.isnull().sum()

In [None]:
# Missing weather readings become 0 in all four weather columns at once
weather_cols = ['cloud', 'rain', 'wind', 'visibility']
sightings[weather_cols] = sightings[weather_cols].fillna(0)

In [None]:
sightings.isnull().sum()

In [None]:
## Fill the timings with the median of each period
## (each median is computed from its own column rather than typed in by hand,
## so the fill values stay correct if the input CSVs are refreshed)

time_cols = ['t1st', 't1et', 't2st', 't2et']

# DataFrame.median() skips NaN and returns one value per column; fillna() with
# that Series fills each column with its own median
sightings[time_cols] = sightings[time_cols].fillna(sightings[time_cols].median())

In [None]:
sightings.isnull().sum()

In [None]:
sightings.dtypes

In [None]:
sightings.tail()

In [None]:
# Convert the 'date' column to datetime64.
# NOTE(review): no `format` or `dayfirst` is passed, so pandas infers the layout;
# if the CSV stores day-first dates (dd/mm/yyyy) some could be read month-first — confirm
sightings['date'] = pd.to_datetime(sightings['date'])

In [None]:
sightings.dtypes

## 5. Exploratory Data Analysis (EDA)

In [None]:
# Count of observations per species
species_counts = sightings['English_name'].value_counts()
print(species_counts)

In [None]:
species_counts[species_counts > 1000]

In [None]:
## Check the species with only 3 or fewer observations

least_common = species_counts[species_counts < 4]

In [None]:
least_common.shape # how many?

In [None]:
least_common.to_frame() # convert to a dataframe (tabular format -> to include in presentation)

In [None]:
# Keep only the species that appear in more than 200,000 rows
common_species = species_counts.loc[species_counts.gt(200000)]

# One bar per remaining species
plt.figure(figsize=(12, 6))
sns.barplot(
    x=common_species.index,
    y=common_species.values,
    color='darkorange',
)

# Title and axis labels; tick labels are turned vertical so they are readable
plt.title('Species Abundance')
plt.xlabel('Species')
plt.ylabel('Count')
plt.xticks(rotation=90)

plt.show()

## 6. Spatial/Geographical Analysis Preparation

In [None]:
sightings.head()

In [None]:
# Convert pandas DataFrame to GeoDataFrame

gdf = gpd.GeoDataFrame(
    sightings,
    geometry=gpd.points_from_xy(sightings.ETRS89Long, sightings.ETRS89Lat),       # create a geometry column from the longitude and latitude
    crs="EPSG:4326"                                                               # this is the standard coordinate system for latitude/longitude coordinates
)

In [None]:
world_map = gpd.read_file("https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip")      # read in a geometry shape file of country outlines

In [None]:
uk = world_map[world_map.ADMIN == 'United Kingdom']       # keep only the row(s) whose ADMIN value is 'United Kingdom' (a filtered subset, used as the base map)

## 7. Visualise Species Distribution with Maps

In [None]:
## Define a function to plot all sighting locations for a given species
## where the function input is the species name as a string
## function output will be the UK map with the sighting locations of that species

def species_locations_map(species_name: str):
    """Plot the sighting locations of one species on top of the UK outline.

    Reads the notebook-level ``gdf`` (sightings with a geometry column) and
    ``uk`` (UK outline) objects.

    Parameters
    ----------
    species_name : str
        Name matched exactly against the 'English_name' column of ``gdf``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If no row matches the
        name, a message is printed and nothing is drawn.
    """
    species_points = gdf[gdf['English_name'] == species_name]

    # Unknown name: report it instead of showing a UK map with no points on it
    if species_points.empty:
        print(f'Species name: "{species_name}" is not in the dataset.')
        return

    # plot a UK base map, then draw the sightings on the same axes
    uk_map = uk.plot(color='white', edgecolor='black')
    species_points.plot(ax=uk_map, color='darkorange')

    # Label the figure so maps for different species can be told apart
    uk_map.set_title(f'{species_name} sighting locations')
    plt.show()

In [None]:
species_locations_map('Woodlark')

In [None]:
species_locations_map('Bar-tailed Godwit')

In [None]:
## Define a function to plot a folium heatmap for a given species

def species_heatmap(species_name: str):
    """Build a folium heatmap of the sighting locations of one species.

    Reads the notebook-level ``gdf`` object and uses its 'English_name',
    'ETRS89Lat' and 'ETRS89Long' columns.

    Parameters
    ----------
    species_name : str
        Name matched exactly against the 'English_name' column of ``gdf``.

    Returns
    -------
    folium.Map or None
        Map centred on the mean latitude/longitude of the species' sightings
        with a HeatMap layer added. ``None`` (after printing a message) when
        the species is not present or none of its rows has coordinates.
    """
    # Find the subset of the data for the given species
    species_rows = gdf[gdf['English_name'] == species_name]
    if species_rows.empty:
        print(f'Species name: "{species_name}" is not in the dataset.')
        return None

    # Coordinates come from a left merge, so they can be missing; folium does
    # not accept NaN locations, so rows without coordinates are left out
    coords = species_rows[['ETRS89Lat', 'ETRS89Long']].dropna()
    if coords.empty:
        print(f'Species name: "{species_name}" has no sightings with coordinates.')
        return None

    # Create a base map object centered around mean lat/lon
    centre = [coords['ETRS89Lat'].mean(), coords['ETRS89Long'].mean()]
    m = folium.Map(location=centre, zoom_start=6)

    # Add heatmap layer from the list of [lat, lon] points
    HeatMap(coords.values.tolist()).add_to(m)

    # Return/display the map
    return m

In [None]:
species_heatmap('Arctic Tern')

## 8. Observations by Region

![BBS Regions](region_map.png)

In [None]:
sightings.head()

In [None]:
sightings['area_code'] = sightings['square'].str[:2]   # the first two letters of the 'square' column indicate the map area

In [None]:
sightings['region'] = sightings['square'].str[:1]      # the first letter of the 'square' column indicates the broader map region (S: South of England and Wales / W: Channel Islands etc.)

In [None]:
sightings.head()

In [None]:
region_counts = sightings.groupby('region')['English_name'].count()    # number of rows with a non-null English_name per region (an observation count, not species richness)

In [None]:
plt.figure(figsize=(10,5))

region_counts.plot(kind='bar')
plt.title('Number of Observations by Region')
plt.xlabel('Region')
plt.ylabel('Observation Count')

plt.show()

In [None]:
# Rows with a species name per area code, largest count first
specific_area_counts = (
    sightings.groupby('area_code')['English_name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
plt.figure(figsize=(10,5))

specific_area_counts.plot(kind='bar')
plt.title('Number of Observations by Area')
plt.xlabel('Area Code')
plt.ylabel('Observation Count')

plt.show()

## 9. Species Numbers over Time

In [None]:
sightings.head()

In [None]:
prevalence = sightings.copy()

In [None]:
## Drop irrelevant columns for this purpose
## NOTE(review): 'area_code' (created from 'square' alongside 'region') is not in this list, so it stays in prevalence — confirm that is intended

prevalence = prevalence.drop(columns = ['square', 'visit', 'distance_band', 'date', 'observer', 'cloud', 'rain', 'wind', 'visibility', 't1st', 't1et', 't2st', 't2et', 'ETRS89Lat', 'ETRS89Long', 'region'])

In [None]:
prevalence.head()

In [None]:
## Groupby year and species to see how many of each species were sighted in total during each year...

species_counts_over_time = prevalence.groupby(by = ['year', 'species_code', 'English_name'])['total_obs'].sum().reset_index()

In [None]:
species_counts_over_time.head()

In [None]:
## Define a function to produce a time series graph for a given species (where input is the species name)

def species_time_series(species_name):
    """Plot yearly totals and their 5-row rolling mean for one species.

    Reads the notebook-level ``species_counts_over_time`` table (columns
    'year', 'species_code', 'English_name', 'total_obs').

    Parameters
    ----------
    species_name : str
        Name matched exactly against the 'English_name' column.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. If no row matches the
        name, a message is printed and nothing is drawn.
    """
    # Find the subset of the data for the given species
    name_matches = species_counts_over_time['English_name'] == species_name
    species_rows = species_counts_over_time[name_matches]

    if species_rows.empty:
        print(f'Species name: "{species_name}" is not in the dataset.')
        return

    # Work on a copy of just these rows (the shared table is left untouched),
    # ordered by year so the rolling window always moves forward in time;
    # the stable sort keeps the existing order of rows that share a year
    yearly = species_rows.sort_values('year', kind='stable').copy()

    # Rolling mean of 'total_obs' over the current row and up to four before it
    # (a 5-year average when there is one row per consecutive year);
    # min_periods=1 lets the first rows average over fewer values instead of NaN
    yearly['5y_average'] = yearly['total_obs'].rolling(window=5, min_periods=1).mean()

    # Plot original values and smoothed values
    sns.lineplot(x='year', y='total_obs', data=yearly, label='Original')
    sns.lineplot(x='year', y='5y_average', data=yearly, label='5-Year Average')

    plt.xlabel('Year')
    plt.ylabel('Number Observed')
    plt.title(f'{species_name} Observations over time')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
species_time_series('Hooded Crow')

## Streamlit App

[BirdApp](https://birddataproject-kytphxsdcneks7odpinzss.streamlit.app/)