In [1]:
import pandas as pd
import numpy as np
import os
import sqlite3

In [2]:

def flowerDataFrame(name, isNative):
    '''
    Will take name of species and if it is native and will transform that into a pandas dataframe that is easy to read.

    @param name: a string of the name of the species that you want to obtain data for
    @param isNative: a string stating whether the species is native to california or not
    '''
    # import the entire csv of the species data into a pandas datafram
    df = pd.read_csv(f"./data/{name}/occurrence.txt", sep='\t', low_memory=False)
    #take only specific columns from the bigger dataframe
    df = df[['scientificName', 'decimalLongitude', 'decimalLatitude', 'month', 'year', 'stateProvince']]

    #add a column stating if it is native or not
    df['native'] = isNative

    #make new column that will drop any unnecessary abbreviations at the end of each name of observation
    df['species'] = df['scientificName'].str[:len(name)]
    #make sure dataframe only includes desired species
    df = df[df['species'] == name]
    #drop now unused column
    df = df.drop(['scientificName'], axis=1)

    #make sure dataframe only includes observations from Califronia
    df = df[(df['stateProvince'] == "California") | (df['stateProvince'] == "Ca")]
    #drop because we know all measurements in entire dataset are in california
    df = df.drop(['stateProvince'], axis=1)
    df = df.dropna()
    df = df.reset_index(drop=True)

    #convert months and years into ints instead of floats 
    df[list(["month", "year"])] = df[list(["month", "year"])].astype(int)

    df = df.rename(columns={'decimalLongitude': 'longitude',
                            'decimalLatitude':  'latitude'})
    df.longitude = df.longitude.round(4)
    df.latitude = df.latitude.round(4)
    return df

In [3]:
lasCal = flowerDataFrame('Lasthenia californica', 'yes')
lasCal

Unnamed: 0,longitude,latitude,month,year,native,species
0,-123.9674,41.8560,5,2017,yes,Lasthenia californica
1,-121.4949,36.1659,4,1968,yes,Lasthenia californica
2,-118.6480,34.0777,3,1959,yes,Lasthenia californica
3,-118.4550,34.0738,4,1932,yes,Lasthenia californica
4,-118.8594,34.0792,3,1941,yes,Lasthenia californica
...,...,...,...,...,...,...
1358,-117.2614,32.9361,4,2010,yes,Lasthenia californica
1359,-121.7808,39.7730,2,2011,yes,Lasthenia californica
1360,-121.1934,37.0918,3,2004,yes,Lasthenia californica
1361,-119.5333,35.0500,4,1998,yes,Lasthenia californica


In [4]:
#native plants
plntgo = flowerDataFrame("Plantago erecta Morris", 'yes')
clrkiP = flowerDataFrame("Clarkia purpurea", 'yes')
clrkiB = flowerDataFrame("Clarkia bottae", 'yes')
chaenc = flowerDataFrame("Chaenactis glabriuscula", 'yes')
amsink = flowerDataFrame("Amsinckia menziesii", 'yes')

#non-native
mdcgoP = flowerDataFrame("Medicago polymorpha", 'no')
cntrea = flowerDataFrame("Centaurea solstitialis", 'no')
euphba = flowerDataFrame("Euphorbia", 'no')
altrna = flowerDataFrame("Alternanthera philoxeroides", 'no')
brssTG = flowerDataFrame("Brassica tournefortii Gouan", 'no')

In [5]:
directory = './data/weather/'
 
d = {}
    
# iterate over files in
# that directory
for filename in os.listdir(directory):
        d[filename] = pd.read_csv(f'./data/weather/{filename}', encoding= 'unicode_escape')

In [6]:
for key, value in d.items():
    value['Year'] = key[0:4]
    value['Month'] = key[5:7]
    
#d.values()

In [7]:
weather_df = pd.concat(d.values(), ignore_index=True)
weather_df = weather_df[weather_df.Year != '.DS_']
weather_df = weather_df.drop('Unnamed: 0', axis =1)
weather_df = weather_df.reset_index(drop=True)


weather_df.columns = weather_df.columns.str.replace(' ','')

list = ['County','ClimateDivision','Longitude','Latitude','Elevation','MeanAvgTemperature','TotalPrecipitation']

for i in list:
    weather_df[i] = weather_df[i].str.replace(' ','')

weather_df = weather_df.drop(weather_df[weather_df['MeanAvgTemperature'] == 'M'].index)
weather_df = weather_df.drop(weather_df[weather_df['ClimateDivision'] == '-'].index)

weather_df.head()

Unnamed: 0,Name,County,ClimateDivision,Longitude,Latitude,Elevation,MeanAvgTemperature,TotalPrecipitation,Year,Month
0,IMPERIAL,Imperial,CA07,-115.567,32.849,-64,65.4,0.0,2008,3
1,EL CAPITAN DAM,SanDiego,CA06,-116.815,32.886,600,55.8,0.14,2008,3
3,POWAY VALLEY,SanDiego,CA06,-117.031,33.02,648,58.3,0.11,2008,3
4,JULIAN CDF,SanDiego,CA06,-116.592,33.076,4215,49.0,0.0,2008,3
5,RAMONA FIRE DEPT,SanDiego,CA06,-116.908,33.011,1470,55.4,0.2,2008,3


In [8]:
weather_df["MeanAvgTemperature"] = weather_df.MeanAvgTemperature.astype(float)
weather_df.dtypes

Name                   object
County                 object
ClimateDivision        object
Longitude              object
Latitude               object
Elevation              object
MeanAvgTemperature    float64
TotalPrecipitation     object
Year                   object
Month                  object
dtype: object

In [9]:
averages = weather_df.groupby(["ClimateDivision", "Year"])[["MeanAvgTemperature"]].mean()
averages

Unnamed: 0_level_0,Unnamed: 1_level_0,MeanAvgTemperature
ClimateDivision,Year,Unnamed: 2_level_1
CA01,2000,54.742616
CA01,2001,55.222086
CA01,2002,55.458410
CA01,2003,55.493068
CA01,2004,55.682616
...,...,...
CA07,2016,66.398460
CA07,2017,66.814381
CA07,2018,66.298094
CA07,2019,63.308152
