In [1]:
#import packages
import pandas as pd
import os
from plotly import express as px
import plotly.graph_objects as go
import geopandas as gpd
import numpy as np

## Import Flowering Data

In [2]:
def flowerDataFrame(name, isNative):
    '''
    Will take name of species and if it is native and will transform that into a pandas dataframe that is easy to read.

    @param name: a string of the name of the species that you want to obtain data for
    @param isNative: a string stating whether the species is native to california or not
    '''
    # import the entire csv of the species data into a pandas dataframe
    df = pd.read_csv(f"./data/{name}/occurrence.txt", sep='\t', low_memory=False)
    #take only specific columns from the bigger dataframe
    df = df[['scientificName', 'decimalLongitude', 'decimalLatitude', 'day', 'month', 'year', 'stateProvince', 'countryCode']]

    #add a column stating if it is native or not
    df['native'] = isNative

    #make new column that will drop any unnecessary abbreviations at the end of each name of observation
    df['species'] = df['scientificName'].str[:len(name)]
    #make sure dataframe only includes desired species
    df = df[df['species'] == name]
    #drop now unused column
    df = df.drop(['scientificName'], axis=1)

    #make sure dataframe only includes observations from Califronia
    df = df[(df['countryCode'] == "US")]
    df = df[(df['stateProvince'] == "California") | (df['stateProvince'] == "Ca")]
    df = df[df['decimalLongitude'] > -125]
    df = df[df['decimalLongitude'] < -113]
    #drop because we know all measurements in entire dataset are in california
    df = df.drop(['stateProvince'], axis=1)
    df = df.drop(['countryCode'], axis=1)
    df = df.dropna()
    df = df.reset_index(drop=True)
    
    #add column called day of year
    df[list(["day", "month", "year"])] = df[list(["day", "month", "year"])].astype(int)
    df[list(["day", "month", "year"])] = df[list(["day", "month", "year"])].astype(str)
    
    df['month'] = df['month'].apply(lambda x: '{0:0>2}'.format(x))
    df['day'] = df['day'].apply(lambda x: '{0:0>2}'.format(x))
    
    df["DOY"] = df["year"].copy()
    month = df["month"].copy()
    day = df["day"].copy()
    
    df["DOY"] = df["DOY"].str.cat((month, day), sep ="-")
    df['DOY'] = pd.to_datetime(df['DOY'], format='%Y/%m/%d')
    df['DOY'] = df['DOY'].dt.dayofyear

    #convert months and years into ints instead of floats 
    df[list(["year"])] = df[list(["year"])].astype(int)
    
    #make sure occurances are of from 1920-2020
    df = df[df['year'] > 1919]
    df = df[df['year'] < 2021]
    df = df.reset_index(drop = True)
    
    #give columns more readable names
    df = df.rename(columns={'decimalLongitude': 'longitude',
                            'decimalLatitude':  'latitude'})
    #keep latitude rounding same as eventual climate data
    df.longitude = df.longitude.round(3)
    df.latitude = df.latitude.round(3)

    
    return df

In [3]:
#native plants
lasCal = flowerDataFrame('Lasthenia californica', 'yes')
plntgo = flowerDataFrame("Plantago erecta Morris", 'yes')
clrkiP = flowerDataFrame("Clarkia purpurea", 'yes')
clrkiB = flowerDataFrame("Clarkia bottae", 'yes')
chaenc = flowerDataFrame("Chaenactis glabriuscula", 'yes')
amsink = flowerDataFrame("Amsinckia menziesii", 'yes')

#non-native
mdcgoP = flowerDataFrame("Medicago polymorpha", 'no')
cntrea = flowerDataFrame("Centaurea solstitialis", 'no')
euphba = flowerDataFrame("Euphorbia", 'no') # E. vigrata
euphba['species'] = 'Euphorbia vigrata' #fix species name 
altrna = flowerDataFrame("Alternanthera philoxeroides", 'no')
brssTG = flowerDataFrame("Brassica tournefortii Gouan", 'no')

In [4]:
#concat all species dataframes into one big one
flowersCA = pd.concat([lasCal, plntgo, clrkiP, clrkiB, chaenc, amsink,
                       mdcgoP, cntrea, euphba, altrna, brssTG])
#convert into geodataframe to be easily compared wihtin shape files in the future
flowersCA = gpd.GeoDataFrame(flowersCA, geometry=gpd.points_from_xy(flowersCA.longitude, flowersCA.latitude))

### Checking if there is an even amount of Native/Non-Native Species

In [5]:
flowersCA_countNative = flowersCA.groupby(['native']).size().to_frame('count')
flowersCA_countNative.head()

Unnamed: 0_level_0,count
native,Unnamed: 1_level_1
no,9067
yes,11017


While it looks about even in total, lets see how that makeup changes by the year.

In [31]:
flowersCA_countByYear = flowersCA.groupby(['year', 'native']).size().to_frame('count')
flowersCA_countByYear = flowersCA_countByYear.reset_index()
flowersCA_countByYear.head()

Unnamed: 0,year,native,count
0,1920,yes,15
1,1921,yes,3
2,1922,yes,4
3,1923,no,4
4,1923,yes,4


In [32]:
#converting to percentage by year to make a visualization.
flowersCA_percentByYear = flowersCA_countByYear.groupby(['year', 'native']).agg({'count': 'sum'})

flowersCA_percentByYear = flowersCA_percentByYear.groupby(level=0).apply(lambda x:
                                                                 round(100 * x / float(x.sum()), 2))
flowersCA_percentByYear = flowersCA_percentByYear.rename(columns={'count': 'percent'})
flowersCA_percentByYear = flowersCA_percentByYear.reset_index()
flowersCA_percentByYear.head()

Unnamed: 0,year,native,percent
0,1920,yes,100.0
1,1921,yes,100.0
2,1922,yes,100.0
3,1923,no,50.0
4,1923,yes,50.0


In [33]:
#visualization from data
fig = go.Figure()
fig.add_trace(go.Scatter(x    = flowersCA_percentByYear[flowersCA_percentByYear['native'] == 'no']['year'],
                         y    = flowersCA_percentByYear[flowersCA_percentByYear['native'] == 'no']['percent'],
                         mode = 'lines+markers',
                         name = 'Non-Native'))

fig.add_trace(go.Scatter(x    = flowersCA_percentByYear[flowersCA_percentByYear['native'] == 'yes']['year'],
                         y    = flowersCA_percentByYear[flowersCA_percentByYear['native'] == 'yes']['percent'],
                         mode = 'lines+markers',
                         name = 'Native'))

fig.update_layout(title='% Native vs Non-Native Californian Plants from 1920-2020',
                   xaxis_title='Year',
                   yaxis_title='Percentage of plant species')
fig.show()

In [36]:
fig = px.scatter(flowersCA_countByYear,
                 x = "year",
                 y = "count",
                 color = 'native')

fig.show()

Something to note is that though there seems to be in total an even distribution, the percentage between the two vary widely between the years. I did not do anything more with this info, I just thought it was interesting to note.

## Climate Division Map

Below I take a shape file from from NOAAs database, and determine which climate divisions each observation falls under.

In [9]:
#from NOAAs shapefile, find useful columns and determine which are within CA
climateDivs = gpd.read_file("./data/climateDiv/GIS.OFFICIAL_CLIM_DIVISIONS.shp")
climateDivs = climateDivs[climateDivs["ST_ABBRV"] == 'CA']
climateDivs = climateDivs[["CD_NEW", 'geometry']]

In [10]:
#set both coordinate reference systems to be the same so the dataframes are accurately compared
climateDivs = climateDivs.set_crs("EPSG:4326", allow_override=True)
flowersCA = flowersCA.set_crs("EPSG:4326")

In [11]:
#this will join the two datasets based on if flowering observation
# geometry points are within the climate division shapefiles, 
# and add a column within each observation of said Climate Divsion
flowersCA = gpd.sjoin(flowersCA, climateDivs, how='inner', predicate='within')
flowersCA = flowersCA.drop(['index_right'], axis = 1)
flowersCA = flowersCA.reset_index(drop = True)
flowersCA = flowersCA.rename(columns={'CD_NEW' : 'ClimateDivision'})
#write only necessary info to csv for quick web parsing
flowersCA.to_csv('./data/flowersCA.csv')

## Obtain Weather Data

In [12]:
#directory in which all weather data presides
directory = './data/weather/'

#empty dictionary ready to be filled up
d = {}
    
# iterate over files in this directory, add each name of file and its contents into a dictionary
for filename in os.listdir(directory):
        d[filename] = pd.read_csv(f'./data/weather/{filename}', encoding= 'unicode_escape')

In [13]:
#add the month and year to each file based off its name
for key, value in d.items():
    value['Year'] = key[0:4]
    value['Month'] = key[5:7]

In [14]:
#put this dictionary into a clean dataframe

#drop uneccesary columns
weather_df = pd.concat(d.values(), ignore_index=True)
weather_df = weather_df[weather_df.Year != '.DS_']
weather_df = weather_df.drop('Unnamed: 0', axis =1)
weather_df = weather_df.reset_index(drop=True)

#nospacesforeasyreadability
weather_df.columns = weather_df.columns.str.replace(' ','')

list = ['County','ClimateDivision','Longitude','Latitude','Elevation','MeanAvgTemperature','TotalPrecipitation']

for i in list:
    weather_df[i] = weather_df[i].str.replace(' ','')

#drop rows with unreadable values
weather_df = weather_df.drop(weather_df[weather_df['MeanAvgTemperature'] == 'M'].index)
weather_df = weather_df.drop(weather_df[weather_df['ClimateDivision'] == '-'].index)
weather_df = weather_df.drop(weather_df[weather_df['TotalPrecipitation'] == 'T'].index)
weather_df = weather_df.drop(weather_df[weather_df['TotalPrecipitation'] == 'M'].index)

#turn values into floats for easy computations
weather_df["MeanAvgTemperature"] = weather_df.MeanAvgTemperature.astype(float)
weather_df["TotalPrecipitation"] = weather_df.TotalPrecipitation.astype(float)
weather_df["Latitude"] = weather_df.Latitude.astype(float)
weather_df["Longitude"] = weather_df.Longitude.astype(float)

#round coordinates to three to accurately compare to flower observations
weather_df.Longitude = weather_df.Longitude.round(3)
weather_df.Latitude = weather_df.Latitude.round(3)

#drop the zero in front of each climate division
weather_df.ClimateDivision = weather_df.ClimateDivision.str[-1]

weather_df

Unnamed: 0,Name,County,ClimateDivision,Longitude,Latitude,Elevation,MeanAvgTemperature,TotalPrecipitation,Year,Month
0,IMPERIAL,Imperial,7,-115.567,32.849,-64,65.4,0.00,2008,03
1,EL CAPITAN DAM,SanDiego,6,-116.815,32.886,600,55.8,0.14,2008,03
3,POWAY VALLEY,SanDiego,6,-117.031,33.020,648,58.3,0.11,2008,03
4,JULIAN CDF,SanDiego,6,-116.592,33.076,4215,49.0,0.00,2008,03
5,RAMONA FIRE DEPT,SanDiego,6,-116.908,33.011,1470,55.4,0.20,2008,03
...,...,...,...,...,...,...,...,...,...,...
277124,CSS LAB,Nevada,2,-120.370,39.330,6855,26.7,10.70,2001,02
277125,RUBICON #2,ElDorado,3,-120.130,39.000,7689,25.8,6.00,2001,02
277126,ECHO PEAK,ElDorado,3,-120.080,38.850,7670,27.6,7.50,2001,02
277127,FALLEN LEAF,ElDorado,3,-120.050,38.930,6236,28.6,3.40,2001,02


In [15]:
#find mean temperature and mean monthly precipitation by weather station
averages = weather_df.groupby(["ClimateDivision", "Year"])[["MeanAvgTemperature", "TotalPrecipitation"]].mean().round(2)
averages = averages.reset_index()
averages = averages.rename(columns={'TotalPrecipitation': 'AvgMonthlyPrecipitation'})
#write only necessary info to a file for quick web parsing
averages.to_csv('./data/averages.csv')

## Average California Temperature by Year Functions

In [16]:
def divScatterTemp (climatediv = False):
    '''
    Returns a figure of the average temperature in California from 1920-2020 
    based on the users climate division preference
    
    @param climatediv: the climate division the user prefers to see, the default is all climate divisions in the state
    @return fig: a figure of said climate division's mean temperature over the years
    '''
    
    #if climate division specified only take data from that division  
    if climatediv:
        avgDf = averages[averages['ClimateDivision'] == climatediv]
        title = f'Average Temperature of CA0{climatediv} California from 1920-2020'
        
    else:
        avgDf = averages
        title = 'Average Temperature of California from 1920-2020'
        
    fig = px.scatter(avgDf,
                     x = "Year",
                     y = "MeanAvgTemperature",
                     trendline = "ols",
                     color = 'ClimateDivision',
                     title = title)
    return fig

In [17]:
divScatterTemp(climatediv = '3').show()

In [18]:
def divScatterPrecip (climatediv = False):
    
    '''
    Returns a figure of the average monthly precipitation in Californian weather stations from 1920-2020 
    based on the users climate division preference
    
    @param climatediv: the climate division the user prefers to see, the default is all climate divisions in the state
    @return fig: a figure of said climate division's mean monthly precipitation over the years
    '''
        
    if climatediv:
        avgDf = averages[averages['ClimateDivision'] == climatediv]
        title = f'Average Monthly Precipitation of CA0{climatediv} California from 1920-2020'
        
    else:
        avgDf = averages
        title = 'Average Monthly Precipitation of California from 1920-2020'
        
    fig = px.scatter(avgDf,
                     x = "Year",
                     y = "AvgMonthlyPrecipitation",
                     trendline = "ols",
                     color = 'ClimateDivision',
                     title = title)
    return fig

In [19]:
divScatterPrecip(climatediv = '6').show()

## Phenology Change of DOY

Below are the two methods I used in a later function to have easy ways to get the average flowering day of year if necessary

In [20]:
#will give the mean flowering day of year for each species in each climate division for each year
avgPhenologyDiv = flowersCA.groupby(['species', 'year', 'native', 'ClimateDivision'])[["DOY"]].mean().reset_index().round(0)
avgPhenologyDiv

Unnamed: 0,species,year,native,ClimateDivision,DOY
0,Alternanthera philoxeroides,1946,no,6,221.0
1,Alternanthera philoxeroides,1956,no,6,285.0
2,Alternanthera philoxeroides,1960,no,6,239.0
3,Alternanthera philoxeroides,1963,no,6,164.0
4,Alternanthera philoxeroides,1965,no,6,273.0
...,...,...,...,...,...
2441,Plantago erecta Morris,2020,yes,1,92.0
2442,Plantago erecta Morris,2020,yes,2,90.0
2443,Plantago erecta Morris,2020,yes,4,93.0
2444,Plantago erecta Morris,2020,yes,5,98.0


In [21]:
#will give the average flowering day of year of species at each year regardless of climate division
avgPhenology = flowersCA.groupby(['species', 'year', 'native'])[["DOY"]].mean().reset_index().round(0)
avgPhenology

Unnamed: 0,species,year,native,DOY
0,Alternanthera philoxeroides,1946,no,221.0
1,Alternanthera philoxeroides,1956,no,285.0
2,Alternanthera philoxeroides,1960,no,239.0
3,Alternanthera philoxeroides,1963,no,164.0
4,Alternanthera philoxeroides,1965,no,273.0
...,...,...,...,...
843,Plantago erecta Morris,2016,yes,95.0
844,Plantago erecta Morris,2017,yes,102.0
845,Plantago erecta Morris,2018,yes,109.0
846,Plantago erecta Morris,2019,yes,105.0


## Average Flowering by Year Function

In [22]:
def avgFloweringByYear (startYear = 1920, endYear = 2020, native = 'NA', trendline = None, trendline_scope= False, div = False):
    '''
    Will give a vizualization of the average flowering day of years for species 
    of specified parameters based on user input.
    
    @param startYear: the first year wanted in the visualization
    @param endYear: the last year wanted in the visualization
    @param native: whether the species are non native, native, or all. default is all
    @param trendline: the type of trendline for each data point. default is none
    @param trendline_scope: whether or not the trendline should be based off individual species or all. default is individual
    @param div: which climate division to look at. default is all
    '''
    
    #make all numbers ints for comparisons
    startYear = int(startYear)
    endYear = int(endYear)
    div = int(div)
        
    #empty strings to be filled based on conditionals    
    ntv = ''
    divNum = ''
    
    if div:
        #if div is specified, take data from dataframe including climatedivision
        l = ['species', 'year', 'native', 'ClimateDivision']
        #find average flowering day of year
        avg_df = flowersCA.groupby(l)["DOY"].mean().reset_index().round(0)
        #make sure its in specified climate division
        avg_df = avg_df[avg_df['ClimateDivision'] == div]
        #add division to title
        divNum = f'0{div}'
    else:
        #else take average flowering day regardless of climate division
        l = ['species', 'year', 'native']
        avg_df = flowersCA.groupby(l)["DOY"].mean().reset_index().round(0)
        
    if native != 'NA':
        #if native or non native specified, only take data of that type
        avg_df = avg_df[avg_df['native'] == native]
        #ajust title accordingly
        if native == 'yes':
            ntv = "Native "
        else:
            ntv = 'Non-Native '
            
    #make sure observation are of the specified years
    avg_df = avg_df[avg_df['year'] >= startYear]
    avg_df = avg_df[avg_df['year'] <= endYear]
    
    #update title
    title = f"Average {ntv}Flowering Day of Year in CA{divNum} from {startYear}-{endYear}"
    
    #plot
    if trendline_scope:
        fig = px.scatter(avg_df,
                         x = "year",
                         y = "DOY",
                         color = 'species',
                         trendline = trendline,
                         trendline_scope = trendline_scope,
                         title = title)
    else:
        fig = px.scatter(avg_df,
                         x = "year",
                         y = "DOY",
                         color = 'species',
                         trendline = trendline,
                         title = title)
    return fig

In [23]:
avgFloweringByYear(native = 'yes', 
                   trendline = "ols")

## California Flower Species Observations Function

In [24]:
def flowersOverCA (startYear = 1920, endYear = 2020, native = 'NA', div = False):
    '''
    Will give a vizualization of all observations over years for species 
    of specified parameters based on user input.
    
    @param startYear: the first year wanted in the visualization
    @param endYear: the last year wanted in the visualization
    @param native: whether the species are non native, native, or all. default is all@param div: which climate division to look at. default is all
    '''
    
    #make into int for easy comparasons
    div = int(div)
    
    #make copy of dataframe as to not modify original
    flowers = flowersCA
    
    if div:
        #if climate division specified, only get those from there
        flowers = flowers[flowers['ClimateDivision'] == div]

    if native != 'NA':
        #if type of flower specified, only get those of that type
        flowers = flowers[flowers['native'] == native]

    #make sure all observations are within the specified dates
    flowers = flowers[flowers['year'] >= startYear]
    flowers = flowers[flowers['year'] <= endYear]
    
    #plot    
    fig = px.scatter_mapbox(flowers, lat="latitude", lon="longitude",  color="species", zoom=3, height=600)
    
    #make easier to see and center on california
    fig.update_layout(mapbox_style="open-street-map", mapbox_zoom=4, mapbox_center_lat = 41,
        margin={"r":0,"t":0,"l":0,"b":0})

    return fig.show()

In [26]:
flowersOverCA(div = '3')