## Libraries

In [None]:
from google.colab import drive
# Mount Google Drive at the relative path 'drive'. The read_csv calls further
# down use absolute /content/drive/MyDrive/... paths, so this presumably relies
# on the Colab working directory being /content -- TODO confirm.
drive.mount('drive')

Mounted at drive


In [None]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

In [None]:
import tensorflow
# Ask TensorFlow for the GPU device name and abort with SystemError unless it
# is exactly '/device:GPU:0'.
# NOTE(review): no later cell visible in this notebook uses TensorFlow or the
# GPU, so this hard failure may be unnecessary for the data prep -- confirm.
device_name = tensorflow.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
print("TensorFlow version:", tensorflow.__version__)

Found GPU at: /device:GPU:0
TensorFlow version: 2.8.0


In [None]:
# IPython shell capture: run `nvidia-smi` and keep its output lines.
gpu_info = !nvidia-smi
# Join the captured lines back into a single printable string.
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the output is taken to mean no GPU is attached.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Apr 14 01:41:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    35W / 250W |    375MiB / 16280MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from psutil import virtual_memory
# Size of the runtime's memory in decimal gigabytes (bytes / 1e9).
# NOTE: the message says "available", but the figure reported is psutil's `total`.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Dictionaries, Functions

In [None]:
# Full US state name -> two-letter abbreviation for the 50 states (no DC or
# territories). Used further down to recode the 'state' column of the
# population table before it is merged with the other tables on state/county.
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

In [None]:
# Summarise the columns of a DataFrame that contain missing values.
def null_summary(df):
    """Return one row per column of ``df`` that has at least one NaN.

    The result has four columns:
      null_names       -- the column's name
      null_counts      -- number of missing values in the column
      null_proportion  -- missing values divided by the number of rows
      null_dtypes      -- the column's dtype
    Rows follow the column order of ``df`` and are indexed 0..k-1. A frame
    without missing values yields an empty summary.
    """
    cols_with_nulls = df.columns[df.isna().any()]
    # NaN mask restricted to the affected columns; reused for counts and ratio.
    missing = df[cols_with_nulls].isna()

    summary = pd.DataFrame({
        'null_names': cols_with_nulls,
        'null_counts': missing.sum(),
        'null_proportion': missing.sum() / missing.count(),
        'null_dtypes': df[cols_with_nulls].dtypes,
    })

    # Replace the column-name index with a plain positional one.
    return summary.reset_index(drop=True)

## Map

In [None]:
# Load the pfwc table and discard its 'Unnamed: 0' column (presumably the index
# column written by an earlier to_csv -- it is not used in this notebook).
pfwc = (
    pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/weather_data/pfwc.csv")
    .drop(columns='Unnamed: 0')
)

In [None]:
# Load the locations table; its unnamed first column becomes the location 'id'.
locs = (
    pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/weather_data/locs.csv")
    .rename(columns={'Unnamed: 0': 'id'})
)

In [None]:
# County table carrying the 'ruc_2013' code (rural-urban continuum codes, going
# by the file name). It is the left-hand base for the merges below.
ruc = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/ruralurbancodes2013_mod.csv")
ruc.shape

(2489, 4)

In [None]:
# County table carrying the 'uic_2013' code (urban influence codes, going by
# the file name); only 'fips' and 'uic_2013' are used from it below.
uic = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/UrbanInfluenceCodes2013_mod.csv")
uic.shape

(2489, 4)

Import the population estimates by year and merge them.

In [None]:
# Read the 2015-2019 population estimates and keep only the rows whose 'state'
# and 'county' values differ (rows where the two match are presumably
# state-level totals rather than counties -- TODO confirm).
pop_01 = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/co-est2015-2019-alldata_mod.csv")
pop_01 = pop_01.loc[pop_01['state'] != pop_01['county'], :]
pop_01.shape

(2582, 7)

In [None]:
# Read the 2020-2021 population estimates and keep only the rows whose 'state'
# and 'county' values differ (rows where the two match are presumably
# state-level totals rather than counties -- TODO confirm).
pop_02 = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/co-est2020-2021-alldata_mod.csv")
pop_02 = pop_02.loc[pop_02['state'] != pop_02['county'], :]
pop_02.shape

(2582, 4)

In [None]:
# Left-join the 2020-2021 estimates onto the 2015-2019 table by (state, county).
# The row count stays equal to pop_01's only if those keys are unique in
# pop_02, which is what the shape printed below lets the reader check.
pop = pop_01.merge(pop_02, how='left', on=['state', 'county'])
pop.shape

(2582, 9)

In [None]:
# Recode full state names to their two-letter abbreviations (values missing
# from the dictionary are left untouched), then drop exact duplicate rows.
pop = (
    pop
    .replace({"state": us_state_to_abbrev})
    .drop_duplicates()
)

In [None]:
pop.shape

(2582, 9)

In [None]:
# County table that supplies land_area, water_area, total_area, latitude and
# longitude per 'fips' for the merge into rupcy below.
cnty = pd.read_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/county_lat_lon_mod.csv")
cnty.shape

(2487, 8)

In [None]:
# Add the 'uic_2013' code to the ruc table, matching counties on 'fips'.
ruic = ruc.merge(uic[['fips', 'uic_2013']], how='left', on=['fips'])

In [None]:
# Attach the yearly population estimates by (state, county). Counties with no
# match in `pop` get NaN in every pop_est_* column (left join).
rup = ruic.merge(pop, how='left', on=['state', 'county'])

In [None]:
null_summary(rup)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes
0,pop_est_2015,8,0.003214,float64
1,pop_est_2016,8,0.003214,float64
2,pop_est_2017,8,0.003214,float64
3,pop_est_2018,8,0.003214,float64
4,pop_est_2019,8,0.003214,float64
5,pop_est_2020,8,0.003214,float64
6,pop_est_2021,8,0.003214,float64


In [None]:
# Drop every row that has a NaN in any column; per the null summary above these
# are the rows without population estimates.
rup = rup.dropna()

In [None]:
# Add area and coordinate columns from cnty, matching counties on 'fips'.
rupcy = rup.merge(cnty[['fips', 'land_area', 'water_area', 'total_area', 'latitude', 'longitude']], how='left', on=['fips'])
rupcy.shape

(2481, 17)

In [None]:
null_summary(rupcy)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes
0,land_area,2,0.000806,float64
1,water_area,2,0.000806,float64
2,total_area,2,0.000806,float64
3,latitude,2,0.000806,float64
4,longitude,2,0.000806,float64


In [None]:
# Drop the rows that found no match in cnty (NaN area/coordinates). The index
# is not reset, so rupcy's index labels have gaps from here on.
rupcy = rupcy.dropna()

In [None]:
rupcy.head(3)

Unnamed: 0,fips,state,county,ruc_2013,uic_2013,pop_est_2015,pop_est_2016,pop_est_2017,pop_est_2018,pop_est_2019,pop_est_2020,pop_est_2021,land_area,water_area,total_area,latitude,longitude
0,1001,AL,Autauga,2,2,54903.0,55302.0,55448.0,55533.0,55769.0,58877.0,59095.0,1539.58,25.78,1565.36,32.54,-86.64
1,1003,AL,Baldwin,3,2,203101.0,207787.0,212737.0,218071.0,223565.0,233140.0,239294.0,4117.52,1133.19,5250.71,30.66,-87.75
2,1005,AL,Barbour,6,6,26300.0,25828.0,25169.0,24887.0,24657.0,25180.0,24964.0,2291.82,50.87,2342.68,31.87,-85.41


Reshape the DataFrame from wide to long format so that the population estimate is one column (`pop_est`) and the year is another (`year`). Save the result as a new DataFrame (`rupyrs`).

In [None]:
# Reshape rupcy from wide (one pop_est_YYYY column per year) to long (one row
# per county and year, with 'pop_est' and 'year' columns).

# Get the common columns across all years (everything that is not an estimate)
common = [k for k in rupcy.columns if 'pop_est' not in k]

df_list = []

# Build one frame per yearly column. .assign() and .rename() return new frames,
# so nothing is written into a slice of rupcy; the earlier `df['year'] = ...`
# on `rupcy[...]` is what triggered SettingWithCopyWarning.
for colname in rupcy.filter(regex='pop_est').columns:
    df = (
        rupcy[common + [colname]]
        .assign(year=colname[-4:])  # last four characters of 'pop_est_YYYY'
        .rename(columns={colname: 'pop_est'})
    )
    df_list.append(df)

# Stack the yearly frames and turn the year strings into integers.
rupyrs = pd.concat(df_list, axis=0, ignore_index=True)
rupyrs["year"] = pd.to_numeric(rupyrs["year"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
rupyrs.head(3)

Unnamed: 0,fips,state,county,ruc_2013,uic_2013,land_area,water_area,total_area,latitude,longitude,pop_est,year
0,1001,AL,Autauga,2,2,1539.58,25.78,1565.36,32.54,-86.64,54903.0,2015
1,1003,AL,Baldwin,3,2,4117.52,1133.19,5250.71,30.66,-87.75,203101.0,2015
2,1005,AL,Barbour,6,6,2291.82,50.87,2342.68,31.87,-85.41,26300.0,2015


In [None]:
rupyrs.tail(3)

Unnamed: 0,fips,state,county,ruc_2013,uic_2013,land_area,water_area,total_area,latitude,longitude,pop_est,year
17350,55137,WI,Waushara,6,6,1621.73,29.15,1650.88,44.11,-89.24,24828.0,2021
17351,55139,WI,Winnebago,3,2,1125.32,373.18,1498.49,44.09,-88.67,171623.0,2021
17352,55141,WI,Wood,4,5,2054.16,42.28,2096.44,44.46,-90.04,74070.0,2021


In [None]:
# Use rupcy to do the location mapping.
# Take an explicit copy so that adding 'id' writes to an independent frame
# instead of a slice of rupcy (which raised SettingWithCopyWarning).
mplocs = rupcy[['latitude', 'longitude']].copy()
# 'id' is the 0-based row position; the index still carries rupcy's labels.
mplocs['id'] = np.arange(0, mplocs.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
def dist(lat1, long1, lat2, long2, radius_km=6371):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1, long1: latitude and longitude of the first point, decimal degrees.
        lat2, long2: latitude and longitude of the second point, decimal degrees.
        radius_km: sphere radius in kilometres; defaults to 6371 (Earth).

    Returns:
        The distance in the units of ``radius_km`` (kilometres by default).
    """
    # Convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])

    # Haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2

    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin() raise ValueError. Cap it at 1; the comparison is False
    # for NaN, so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    return radius_km * c

In [None]:
def find_nearest(dfm, lat, long):
    """Return the 'id' of the row in ``dfm`` closest to the point (lat, long).

    ``dfm`` must have 'latitude', 'longitude' and 'id' columns. Closeness is
    measured with ``dist`` between (lat, long) and each row's coordinates.
    """
    def _distance_to_row(row):
        return dist(lat, long, row['latitude'], row['longitude'])

    # Index label of the row with the smallest distance to the query point.
    closest_label = dfm.apply(_distance_to_row, axis=1).idxmin()
    return dfm.loc[closest_label, 'id']

In [None]:
# For every location in locs, store the 'id' of the nearest row of mplocs as
# 'id2'. This is a row-wise apply that itself applies dist over every mplocs
# row, i.e. len(locs) * len(mplocs) Python-level distance evaluations.
locs['id2'] = locs.apply(lambda row: find_nearest(mplocs, row['latitude'], row['longitude']), axis=1)

In [None]:
null_summary(locs)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


## Merge

In [None]:
# Rename the key to match locs['id2'], and rename the coordinates so they do
# not collide with locs' own latitude/longitude in the merge that follows.
mplocs = mplocs.rename(columns={
    'id': 'id2',
    'latitude': 'newlat',
    'longitude': 'newlon',
})

In [None]:
# Give every location the coordinates (newlat/newlon) of its nearest mplocs row.
popland = locs.merge(mplocs, how='left', on=['id2'])

In [None]:
# Attach newlat/newlon to pfwc by exact match on (latitude, longitude).
# NOTE(review): repeated (latitude, longitude) pairs in popland would duplicate
# pfwc rows here; the row-count comparison further down is the guard for that.
pfwd = pfwc.merge(popland[['latitude', 'longitude', 'newlat', 'newlon']], how='left', on=['latitude', 'longitude'])

In [None]:
# Rename the coordinates to the newlat/newlon key names used on the pfwd side.
rupyrs = rupyrs.rename(columns={'latitude': 'newlat', 'longitude': 'newlon'})

In [None]:
# Bring in the per-year county attributes by exact match on
# (newlat, newlon, year). 'state' is taken from both sides, and the cell below
# handles the resulting 'state_x'/'state_y' pair.
# NOTE(review): the join keys are float coordinates compared for equality; two
# rupyrs rows sharing the same coordinates and year would duplicate pfwd rows
# -- the row-count comparison further down is the guard for that.
pfwe = pfwd.merge(rupyrs[['state', 'ruc_2013', 'uic_2013', 'pop_est',
                         'land_area', 'water_area', 'total_area', 'newlat', 'newlon', 'year']],
                  how='left', on=['newlat', 'newlon', 'year'])

In [None]:
# Remove the join helpers and the duplicate state column, then restore the
# original 'state' name on the column that came from the left-hand table.
pfwe = (
    pfwe
    .drop(columns=['newlat', 'newlon', 'state_y'])
    .rename(columns={'state_x': 'state'})
)

In [None]:
# Sanity check: the left joins must neither add nor drop rows relative to pfwc.
print(pfwc.shape[0])
print(pfwe.shape[0])
# Enforce the invariant instead of relying on the reader comparing two numbers.
assert pfwe.shape[0] == pfwc.shape[0], (
    "merges changed the row count: pfwc has {} rows, pfwe has {}".format(pfwc.shape[0], pfwe.shape[0])
)

304512
304512


In [None]:
null_summary(pfwe)

Unnamed: 0,null_names,null_counts,null_proportion,null_dtypes


Remove total_area because it is simply the sum of land_area and water_area.

In [None]:
# total_area is redundant: it equals land_area + water_area (up to rounding in
# the source values), so only the two components are kept.
pfwe = pfwe.drop(columns = 'total_area')

Write out the csv.

In [None]:
# Write the merged table to Drive. With the default index=True the row index is
# written as an unnamed first column, which reads back as 'Unnamed: 0' (the
# column this notebook drops/renames when loading pfwc and locs).
pfwe.to_csv("/content/drive/MyDrive/Modules/Module_30_Capstone_4_Final_project/urban_rural_pop/urban_rural_pop.csv")