## Libraries

In [None]:
# Mount Google Drive so the CSV files referenced below can be read and written.
# NOTE(review): the mount point 'drive' is relative, while later cells use absolute
# '/content/drive/...' paths — this presumably relies on the working directory being
# /content; confirm.
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [None]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

In [None]:
import tensorflow
# Ask TensorFlow for the name of the GPU device it can see.
device_name = tensorflow.test.gpu_device_name()
# Stop the notebook here unless the reported device is exactly '/device:GPU:0'.
# NOTE(review): no later cell visible here uses TensorFlow or a GPU — confirm this
# hard stop is still wanted.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
print("TensorFlow version:", tensorflow.__version__)

Found GPU at: /device:GPU:0
TensorFlow version: 2.8.2


In [None]:
# Run the `nvidia-smi` shell command (IPython `!` syntax) and capture its output lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# Output containing the word 'failed' is treated as "no GPU attached";
# otherwise the full nvidia-smi report is printed.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Jun 16 01:09:10 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    38W / 300W |    471MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this cell uses to label the runtime as "high-RAM".
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Functions

In [None]:
# Summarise missing values: one row per column of `df` that contains at least one null.
def null_summary(df):
  """Return a table describing the columns of ``df`` that contain missing values.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to inspect.

  Returns
  -------
  pandas.DataFrame
      One row per column with at least one null, with the columns
      ``null_names`` (column name), ``null_counts`` (number of nulls),
      ``null_proportion`` (nulls divided by number of rows) and
      ``null_dtypes`` (dtype of the column). The index is reset to 0..n-1.
  """
  cols_with_nulls = df.columns[df.isna().any()]
  subset = df[cols_with_nulls]
  missing = subset.isna()
  n_missing = missing.sum()

  summary = pd.DataFrame({
      'null_names': subset.dtypes.index,
      'null_counts': n_missing,
      # count() on the boolean mask gives the number of rows per column.
      'null_proportion': n_missing / missing.count(),
      'null_dtypes': subset.dtypes,
  })

  return summary.reset_index(drop=True)

## Map

In [None]:
# Base directory on the mounted Drive for the weather CSVs; later cells build paths as fp + file name.
fp = '/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/weather_data/'

In [None]:
# Load the prepared observation table and discard the 'Unnamed: 0' column stored in the CSV.
pfw_prepped = pd.read_csv(fp + "pfw_prepped.csv").drop(columns='Unnamed: 0')

In [None]:
# Load the location table; its 'Unnamed: 0' column is kept and used as the location id.
locs = pd.read_csv(fp + "locs.csv").rename(columns={'Unnamed: 0': 'id'})

In [None]:
# Use tmin to do the location mapping: collect the distinct (latitude, longitude)
# pairs present in the tmin table and give each one a sequential id.
tmin = pd.read_csv(fp + "tmin.csv")
coord_cols = ['latitude', 'longitude']
mplocs = tmin.groupby(coord_cols).count().reset_index()[coord_cols]
mplocs['id'] = np.arange(len(mplocs))

In [None]:
# Save the grid-point table (latitude, longitude, id); the DataFrame index is written as well (to_csv default).
mplocs.to_csv(fp + "tmin_locs.csv")

In [None]:
def quarters(df):
    """Split ``df`` into four quadrants of its own latitude/longitude bounding box.

    The midpoints are the averages of the minimum and maximum ``latitude`` and
    ``longitude`` found in ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.

    Returns
    -------
    list of pandas.DataFrame
        ``[q1, q2, q3, q4]`` where q1/q2 hold rows with latitude strictly above
        the mid latitude (west / east half) and q3/q4 hold the remaining rows
        (west / east half). Rows on the mid longitude go to the east half.
        Each quadrant is an independent copy, so callers can add columns to it
        without triggering pandas' SettingWithCopyWarning. Rows with a missing
        latitude or longitude match no quadrant.
    """
    lat = df['latitude']
    lon = df['longitude']

    minlat, maxlat = lat.min(), lat.max()
    midlat = (maxlat + minlat)/2

    minlon, maxlon = lon.min(), lon.max()
    midlon = (maxlon + minlon)/2

    # Build each half-plane mask once and combine them per quadrant.
    north = (lat > midlat) & (lat <= maxlat)
    south = (lat >= minlat) & (lat <= midlat)
    west = (lon >= minlon) & (lon < midlon)
    east = (lon >= midlon) & (lon <= maxlon)

    q1 = df[north & west].copy()
    q2 = df[north & east].copy()
    q3 = df[south & west].copy()
    q4 = df[south & east].copy()

    return [q1, q2, q3, q4]

In [None]:
# Split both tables into four latitude/longitude quadrants. Each call derives its
# midpoints from the frame it is given, so the two sets of quadrants are not
# guaranteed to share the same boundaries.
locsl = quarters(locs)
mplocsl = quarters(mplocs)

In [None]:
def dist(lat1, long1, lat2, long2):
    """Great-circle distance in kilometres between two points (haversine formula).

    Parameters
    ----------
    lat1, long1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, long2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres, using an Earth radius of 6371 km. NaN inputs
        yield NaN.
    """
    # Convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # Haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make asin() raise a domain error. The comparison is
    # False for NaN, so NaN still propagates unchanged.
    if a > 1.0:
        a = 1.0
    c = 2 * asin(sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km

In [None]:
def find_nearest(dfm, lat, long):
    """Return the ``id`` of the row of ``dfm`` that is closest to ``(lat, long)``.

    Parameters
    ----------
    dfm : pandas.DataFrame
        Candidate points; must contain ``latitude``, ``longitude`` (decimal
        degrees) and ``id`` columns.
    lat, long : float
        Query point in decimal degrees.

    Returns
    -------
    The ``id`` value of the nearest candidate by great-circle distance.

    Raises
    ------
    ValueError
        If ``dfm`` has no rows (raised by ``idxmin``).
    """
    # Vectorised haversine over all candidates at once instead of one Python
    # call per row.
    lat1 = np.radians(lat)
    lat2 = np.radians(dfm['latitude'])
    dlat = lat2 - lat1
    dlon = np.radians(dfm['longitude']) - np.radians(long)
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # The distance 2*R*asin(sqrt(a)) increases monotonically with `a`, so the row
    # that minimises `a` is also the nearest one; the final conversion to
    # kilometres is not needed to pick it.
    return dfm.loc[a.idxmin(), 'id']

In [None]:
# For every location, find the id of the nearest grid point within the grid
# quadrant that has the same position in the list, and store it as 'id2'.
for idx, (quarter, grid) in enumerate(zip(locsl, mplocsl)):
    print(idx)  # progress marker: index of the quadrant being processed
    nearest_ids = quarter.apply(
        lambda row: find_nearest(grid, row['latitude'], row['longitude']), axis=1)
    # assign() builds a new frame instead of writing a column into a slice of
    # `locs`, which is what raised SettingWithCopyWarning.
    locsl[idx] = quarter.assign(id2=nearest_ids)

newlocs = pd.concat(locsl, ignore_index=True)

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


1
2
3


In [None]:
# Save the location-to-grid mapping; the DataFrame index is written as well (to_csv default).
newlocs.to_csv(fp + "newlocs.csv")

In [None]:
# Reload both tables from the files written above.
# NOTE(review): because the index was saved with them, the reloaded frames
# presumably carry an extra 'Unnamed: 0' column — confirm.
mplocs = pd.read_csv(fp + "tmin_locs.csv")
newlocs = pd.read_csv(fp + "newlocs.csv")

In [None]:
# List the columns of the mapping table that contain missing values (an empty table means none).
null_summary(newlocs)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


## Merge the mapped locations into the pfw df

In [None]:
# Rename the grid table's columns so they can be merged next to the original
# location columns without clashing: id -> id2, latitude -> newlat, longitude -> newlon.
mplocs = mplocs.rename(columns={
    'id': 'id2',
    'latitude': 'newlat',
    'longitude': 'newlon',
})

In [None]:
# Look up each location's mapped grid id (id2) to attach the grid coordinates (newlat/newlon).
wthr = newlocs.merge(mplocs, how='left', on=['id2'])

In [None]:
# Attach the grid coordinates (newlat/newlon) to every observation via its latitude/longitude.
# NOTE(review): if `wthr` holds the same (latitude, longitude) pair more than once, this
# left merge repeats observation rows — confirm the location table is unique on those keys.
pfw_tmin = pfw_prepped.merge(wthr[['latitude', 'longitude', 'newlat', 'newlon']], how='left', on=['latitude', 'longitude'])

In [None]:
# Years covered by the weather merges below, which are done in chunks by year
# (both ends inclusive).
ystart, yend = 2015, 2021
years = list(np.arange(ystart, yend + 1))
years

[2015, 2016, 2017, 2018, 2019, 2020, 2021]

## Tmin

In [None]:
# Align the tmin table's column names with the observation table:
# coordinates become newlat/newlon and the value column 'mint' becomes 'tmin'.
tmin = tmin.rename(columns={
    'latitude': 'newlat',
    'longitude': 'newlon',
    'mint': 'tmin',
})

In [None]:
# Output file name for the tmin merge below; the Tmax section reads this file back.
fn = 'pfw_tmin.csv'

In [None]:
wthr = tmin
elem = 'tmin'
bb = pfw_tmin

# Join keys shared by the observation table and the weather table.
merge_keys = ['newlat', 'newlon', 'month', 'day', 'year']

# Left-merge the weather element onto the observations one year at a time, then
# concatenate all yearly results in a single call (rather than special-casing the
# first year and re-concatenating inside the loop).
# Rows of `bb` whose year is not in `years` are not carried into `merg`.
yearly_chunks = [
    bb[bb['year'] == year].merge(
        wthr.loc[wthr['year'] == year, merge_keys + [elem]],
        how='left', on=merge_keys)
    for year in years
]
merg = pd.concat(yearly_chunks, ignore_index=True)

# Written with the index (to_csv default), so the file gains an unnamed index column.
merg.to_csv(fp + fn)

In [None]:
# List the columns that contain missing values after the tmin merge (an empty table means none).
null_summary(merg)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


## Tmax

In [None]:
# Reload the tmin-merged table from disk. This relies on `fn` still being 'pfw_tmin.csv',
# i.e. on the cells being run in order.
pfw_both_t = pd.read_csv(fp + fn)

In [None]:
# Load the tmax table from the weather data directory.
tmax = pd.read_csv(fp + "tmax.csv")

In [None]:
# Align the tmax table's column names with the observation table:
# coordinates become newlat/newlon and the value column 'maxt' becomes 'tmax'.
tmax = tmax.rename(columns={
    'latitude': 'newlat',
    'longitude': 'newlon',
    'maxt': 'tmax',
})

In [None]:
# Output file name for the tmax merge below; the Precipitation section reads this file back.
fn = 'pfw_tmin_tmax.csv'

In [None]:
wthr = tmax
elem = 'tmax'
bb = pfw_both_t

# Join keys shared by the observation table and the weather table.
merge_keys = ['newlat', 'newlon', 'month', 'day', 'year']

# Left-merge the weather element onto the observations one year at a time, then
# concatenate all yearly results in a single call (rather than special-casing the
# first year and re-concatenating inside the loop).
# Rows of `bb` whose year is not in `years` are not carried into `merg`.
yearly_chunks = [
    bb[bb['year'] == year].merge(
        wthr.loc[wthr['year'] == year, merge_keys + [elem]],
        how='left', on=merge_keys)
    for year in years
]
merg = pd.concat(yearly_chunks, ignore_index=True)

# Written with the index (to_csv default), so the file gains another unnamed index column.
merg.to_csv(fp + fn)

In [None]:
# List the columns that contain missing values after the tmax merge (an empty table means none).
null_summary(merg)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


## Precipitation

In [None]:
# Reload the tmin+tmax table from disk. This relies on `fn` still being 'pfw_tmin_tmax.csv',
# i.e. on the cells being run in order.
pfw_wthr = pd.read_csv(fp + fn)

In [None]:
# Load the precipitation table from the weather data directory.
pcpn = pd.read_csv(fp + "pcpn.csv")

In [None]:
# Align the precipitation table's coordinate columns with the observation table
# (latitude -> newlat, longitude -> newlon).
pcpn = pcpn.rename(columns={
    'latitude': 'newlat',
    'longitude': 'newlon',
})

In [None]:
# Output file name for the precipitation merge below; the population section reads this file back.
fn = 'pfw_tmin_tmax_pcpn.csv'

In [None]:
wthr = pcpn
elem = 'pcpn'
bb = pfw_wthr

# Join keys shared by the observation table and the weather table.
merge_keys = ['newlat', 'newlon', 'month', 'day', 'year']

# Left-merge the weather element onto the observations one year at a time, then
# concatenate all yearly results in a single call (rather than special-casing the
# first year and re-concatenating inside the loop).
# Rows of `bb` whose year is not in `years` are not carried into `merg`.
yearly_chunks = [
    bb[bb['year'] == year].merge(
        wthr.loc[wthr['year'] == year, merge_keys + [elem]],
        how='left', on=merge_keys)
    for year in years
]
merg = pd.concat(yearly_chunks, ignore_index=True)

# Written with the index (to_csv default), so the file gains another unnamed index column.
merg.to_csv(fp + fn)

In [None]:
# List the columns that contain missing values after the precipitation merge (an empty table means none).
null_summary(merg)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


## Merge with population and rural-urban data

In [None]:
# Reload the fully weather-merged table from disk. This relies on `fn` still being
# 'pfw_tmin_tmax_pcpn.csv', i.e. on the cells being run in order.
pfw_wpru = pd.read_csv(fp + fn)

Remove the extra columns originating from the merges, and remove the pfw columns that are redundant with the newly aggregated population and rural-urban data.

In [None]:
# Helper columns to discard: the mapped grid coordinates and the two unnamed columns
# (presumably index columns accumulated by the CSV round trips above — confirm).
# Re-running this cell after the columns are gone raises KeyError.
rem = ['newlat', 'newlon', 'Unnamed: 0', 'Unnamed: 0.1']
pfw_wpru = pfw_wpru.drop(columns = rem)

Now merge with the population and rural-urban data.

In [None]:
# Load the population / rural-urban table from its own Drive folder.
popru = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/urban_rural_pop.csv")

In [None]:
# Bring the three weather columns onto the population / rural-urban table, matching
# rows on submission id, location id and coordinates, then drop the leftover
# 'Unnamed: 0' column.
join_keys = ['sub_id', 'loc_id', 'latitude', 'longitude']
weather_cols = ['tmin', 'tmax', 'pcpn']
birdy = (
    popru
    .merge(pfw_wpru[join_keys + weather_cols], how='left', on=join_keys)
    .drop(columns='Unnamed: 0')
)

In [None]:
# List the columns of the combined table that contain missing values (an empty table means none).
null_summary(birdy)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


In [None]:
# Keep only the rows that have no missing value in any column.
birdy = birdy.dropna()

In [None]:
# (rows, columns) of the combined table.
birdy.shape

(146530, 123)

In [None]:
# Preview the first rows of the combined table.
birdy.head()

Unnamed: 0,sub_id,loc_id,latitude,longitude,month,day,year,proj_period_id,valid,reviewed,...,numfeeders_other,count_area_size_sq_m_atleast,ruc_2013,uic_2013,pop_est,land_area,water_area,tmin,tmax,pcpn
0,S61429106,L10011987,43.075295,-74.3421,11,12,2019,PFW_2020,1,0,...,0.0,375.01,4,5,53336.0,1283.26,96.89,23,37,0.26
1,S61625434,L10011987,43.075295,-74.3421,11,19,2019,PFW_2020,1,0,...,0.0,375.01,4,5,53336.0,1283.26,96.89,29,38,0.87
2,S61839137,L10011987,43.075295,-74.3421,11,26,2019,PFW_2020,1,0,...,0.0,375.01,4,5,53336.0,1283.26,96.89,32,47,0.0
3,S61985251,L10011987,43.075295,-74.3421,12,3,2019,PFW_2020,1,0,...,0.0,375.01,4,5,53336.0,1283.26,96.89,19,31,0.33
4,S62233049,L10011987,43.075295,-74.3421,12,10,2019,PFW_2020,1,0,...,0.0,375.01,4,5,53336.0,1283.26,96.89,32,46,0.65


In [None]:
# Write the combined table to Drive; the DataFrame index is written as well (to_csv default).
birdy.to_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/aggregated_data.csv")