In [None]:
# Citipy installation
!pip install citipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting citipy
  Downloading citipy-0.0.5.tar.gz (557 kB)
[K     |████████████████████████████████| 557 kB 15.4 MB/s 
[?25hCollecting kdtree>=0.12
  Downloading kdtree-0.16-py2.py3-none-any.whl (7.7 kB)
Building wheels for collected packages: citipy
  Building wheel for citipy (setup.py) ... [?25l[?25hdone
  Created wheel for citipy: filename=citipy-0.0.5-py3-none-any.whl size=559701 sha256=af0ea1132c5453bfd2cac466efb936d07c4c95b602e8233c5eff856e5c73df21
  Stored in directory: /root/.cache/pip/wheels/72/ae/63/5d03cf6dfe5c0e9328fb73f8b30da1948a06b9cceaf2c68d2d
Successfully built citipy
Installing collected packages: kdtree, citipy
Successfully installed citipy-0.0.5 kdtree-0.16


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
from citipy import citipy
from weather_api_key import weather_api_key

## Generate list of cities

In [None]:
# IMPORTANT: DO NOT RUN THIS CELL UNLESS YOU WANT A FRESH DATASET TO RUN THE
# API CALLS IN THE NEXT CELL
# (No random seed is set here, so each run draws different coordinates.)

# Range of latitudes and longitudes
lat_range = (-90, 90)
lng_range = (-180, 180)

# Create a set of random lat and lng combinations
lats = np.random.uniform(lat_range[0], lat_range[1], size=40000)
lngs = np.random.uniform(lng_range[0], lng_range[1], size=40000)
lat_lngs = zip(lats, lngs)

# `cities` keeps the unique city names in first-seen order; `seen_cities`
# mirrors it as a set so the membership check does not rescan the whole list
# for every one of the sampled coordinates.
cities = []
seen_cities = set()

# Identify nearest city for each lat, lng combination
for lat, lng in lat_lngs:
    city = citipy.nearest_city(lat, lng).city_name

    # If the city is unique, then add it to our cities list
    if city not in seen_cities:
        seen_cities.add(city)
        cities.append(city)

# Show the city count to confirm a sufficient count
len(cities)

5298

## Perform API calls

In [None]:
# IMPORTANT: DO --NOT-- RUN THIS CELL AGAIN UNLESS YOU WANT TO CONSTRUCT A NEW
# DATAFRAME. THIS QUERY WILL TAKE OVER 4 -HOURS- TO COMPLETE. LAST REMINDER:
# DO -NOT- RUN THIS UNLESS YOU ABSOLUTELY NEED A NEW SET OF RANDOM DATA.

# Base url to modify later for API calls
base_url = "http://history.openweathermap.org/data/2.5/aggregated/year?"


def _lookup_city_location(city):
    """Return the first geocoding match for ``city``, or ``None``.

    Sends one request to the geocoding endpoint (``limit=1``). ``None`` is
    returned when the response status is not 200 or the JSON body is not a
    non-empty list, so callers can skip the city instead of failing on ``[0]``.
    """
    coord_query = f"http://api.openweathermap.org/geo/1.0/direct?q={city}&limit=1&appid={weather_api_key}"
    coord_response = requests.get(coord_query)
    if coord_response.status_code != 200:
        return None
    matches = coord_response.json()
    if not isinstance(matches, list) or not matches:
        return None
    return matches[0]


def _day_to_record(day, location):
    """Flatten one entry of the weather response's ``"result"`` list into a row.

    ``day`` supplies the month/day and the per-variable statistics;
    ``location`` is a geocoding match supplying ``name``, ``lon`` and ``lat``.
    The keys of the returned dict become the DataFrame column names.
    """
    return {
        "month": day["month"],
        "day": day["day"],
        "city": location["name"],
        "longitude": location["lon"],
        "latitude": location["lat"],
        "min_temp_k": day["temp"]["record_min"],
        "max_temp_k": day["temp"]["record_max"],
        "avgmin_temp_k": day["temp"]["average_min"],
        "avgmax_temp_k": day["temp"]["average_max"],
        "median_temp_k": day["temp"]["median"],
        "mean_temp_k": day["temp"]["mean"],
        "p25_temp_k": day["temp"]["p25"],
        "p75_temp_k": day["temp"]["p75"],
        "stdev_temp_k": day["temp"]["st_dev"],
        "min_pres_hpa": day["pressure"]["min"],
        "max_pres_hpa": day["pressure"]["max"],
        "median_pres_hpa": day["pressure"]["median"],
        "mean_pres_hpa": day["pressure"]["mean"],
        "p25_pres_hpa": day["pressure"]["p25"],
        "p75_pres_hpa": day["pressure"]["p75"],
        "stdev_pres_hpa": day["pressure"]["st_dev"],
        "min_humid_%": day["humidity"]["min"],
        "max_humid_%": day["humidity"]["max"],
        "median_humid_%": day["humidity"]["median"],
        "mean_humid_%": day["humidity"]["mean"],
        "p25_humid_%": day["humidity"]["p25"],
        "p75_humid_%": day["humidity"]["p75"],
        "stdev_humid_%": day["humidity"]["st_dev"],
        "min_wind_mps": day["wind"]["min"],
        "max_wind_mps": day["wind"]["max"],
        "median_wind_mps": day["wind"]["median"],
        "mean_wind_mps": day["wind"]["mean"],
        "p25_wind_mps": day["wind"]["p25"],
        "p75_wind_mps": day["wind"]["p75"],
        "stdev_wind_mps": day["wind"]["st_dev"],
        "min_prec_mm": day["precipitation"]["min"],
        "max_prec_mm": day["precipitation"]["max"],
        "median_prec_mm": day["precipitation"]["median"],
        "mean_prec_mm": day["precipitation"]["mean"],
        "p25_prec_mm": day["precipitation"]["p25"],
        "p75_prec_mm": day["precipitation"]["p75"],
        "st_dev_prec_mm": day["precipitation"]["st_dev"],
        "min_cloud_%": day["clouds"]["min"],
        "max_cloud_%": day["clouds"]["max"],
        "median_cloud_%": day["clouds"]["median"],
        "mean_cloud_%": day["clouds"]["mean"],
        "p25_cloud_%": day["clouds"]["p25"],
        "p75_cloud_%": day["clouds"]["p75"],
        "stdev_cloud_%": day["clouds"]["st_dev"],
    }


# Rows are collected here and turned into a DataFrame once at the end;
# growing a DataFrame row by row copies the whole frame on every append.
weather_records = []

# Begin data retrieval loop
print("Beginning Data Retrieval")
print("------------------------------------------")
try:
    for record_counter, city in enumerate(cities, start=1):
        # Build query URL (no `units` parameter is sent; the temperature
        # columns below are labelled `_k`)
        query = f"{base_url}q={city}&appid={weather_api_key}"

        # Get response for queried city
        response = requests.get(query)

        # Any non-200 response is reported as "City not found" and skipped.
        # Found out the hard way that the skipped query still counts toward the
        # query calls per minute...
        if response.status_code != 200:
            print(f"City not found. Skipping {city}... | {record_counter}/{len(cities)}")
            continue

        print(f"Processing {city}... | {record_counter}/{len(cities)}")

        # Get weather information from response as json object
        days = response.json()["result"]
        if not days:
            continue

        # The coordinates are the same for every day of a city, so the
        # geocoding API is queried once per city rather than once per day.
        location = _lookup_city_location(city)
        if location is None:
            print(f"No coordinates found. Skipping {city}... | {record_counter}/{len(cities)}")
            continue

        weather_records.extend(_day_to_record(day, location) for day in days)
finally:
    # Built in `finally` so that an interrupted or failed run still leaves the
    # rows gathered so far in weather_df.
    weather_df = pd.DataFrame(weather_records)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
City not found. Skipping nuuk... | 299/5298
City not found. Skipping nikolayevka... | 300/5298
City not found. Skipping ardistan... | 301/5298
Processing campbellsville... | 302/5298
City not found. Skipping gonaives... | 303/5298
City not found. Skipping sokoni... | 304/5298
City not found. Skipping sao felix do xingu... | 305/5298
City not found. Skipping el badari... | 306/5298
Processing moberly... | 307/5298
City not found. Skipping codrington... | 308/5298
Processing saint anthony... | 309/5298
City not found. Skipping olavarria... | 310/5298
City not found. Skipping sakakah... | 311/5298
City not found. Skipping boddam... | 312/5298
City not found. Skipping isangel... | 313/5298
City not found. Skipping namibe... | 314/5298
City not found. Skipping adrar... | 315/5298
City not found. Skipping bayburt... | 316/5298
City not found. Skipping kagalnitskaya... | 317/5298
City not found. Skipping ipua... | 318/5298
Proce

In [None]:
# View dataframe
# (a bare last expression is rendered by the notebook as the cell's output)
weather_df

Unnamed: 0,month,day,city,longitude,latitude,min_temp_k,max_temp_k,avgmin_temp_k,avgmax_temp_k,median_temp_k,...,p25_prec_mm,p75_prec_mm,st_dev_prec_mm,min_cloud_%,max_cloud_%,median_cloud_%,mean_cloud_%,p25_cloud_%,p75_cloud_%,stdev_cloud_%
0,1.0,1.0,Fortuna,-124.156034,40.597407,271.88,288.33,276.19,284.41,279.38,...,0.0,0.0,0.16,0.0,90.0,1.0,32.13,1.0,76.0,39.67
1,1.0,2.0,Fortuna,-124.156034,40.597407,272.71,287.15,276.29,284.06,279.78,...,0.0,0.0,0.23,0.0,100.0,1.0,27.79,1.0,75.0,38.28
2,1.0,3.0,Fortuna,-124.156034,40.597407,273.15,289.73,278.65,285.16,281.21,...,0.0,0.0,0.19,0.0,100.0,20.0,40.09,1.0,90.0,39.55
3,1.0,4.0,Fortuna,-124.156034,40.597407,277.15,292.44,280.05,285.61,282.12,...,0.0,0.3,0.35,0.0,100.0,75.0,55.62,2.5,90.0,37.95
4,1.0,5.0,Fortuna,-124.156034,40.597407,271.67,291.44,279.30,285.87,282.50,...,0.0,0.0,0.67,0.0,100.0,73.5,49.83,1.0,90.0,39.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183361,12.0,27.0,Progress,-121.801080,39.063227,265.64,292.42,271.57,280.66,274.16,...,0.0,0.0,0.19,0.0,92.0,31.5,42.43,1.0,90.0,41.84
183362,12.0,28.0,Progress,-121.801080,39.063227,260.97,287.58,272.70,280.75,276.65,...,0.0,0.0,0.33,0.0,97.0,68.0,48.03,1.0,90.0,40.63
183363,12.0,29.0,Progress,-121.801080,39.063227,260.98,289.62,272.62,279.45,275.92,...,0.0,0.0,0.90,0.0,97.0,84.0,62.04,22.0,90.0,35.96
183364,12.0,30.0,Progress,-121.801080,39.063227,262.63,283.24,272.23,277.14,275.14,...,0.0,0.0,0.17,0.0,100.0,75.0,56.03,20.0,90.0,36.48


In [None]:
# Cast the "month" and "day" columns to integers. astype returns a new frame,
# so weather_df itself is left untouched.
df2 = weather_df.astype({column: "int" for column in ("month", "day")})
df2

Unnamed: 0,month,day,city,longitude,latitude,min_temp_k,max_temp_k,avgmin_temp_k,avgmax_temp_k,median_temp_k,...,p25_prec_mm,p75_prec_mm,st_dev_prec_mm,min_cloud_%,max_cloud_%,median_cloud_%,mean_cloud_%,p25_cloud_%,p75_cloud_%,stdev_cloud_%
0,1,1,Fortuna,-124.156034,40.597407,271.88,288.33,276.19,284.41,279.38,...,0.0,0.0,0.16,0.0,90.0,1.0,32.13,1.0,76.0,39.67
1,1,2,Fortuna,-124.156034,40.597407,272.71,287.15,276.29,284.06,279.78,...,0.0,0.0,0.23,0.0,100.0,1.0,27.79,1.0,75.0,38.28
2,1,3,Fortuna,-124.156034,40.597407,273.15,289.73,278.65,285.16,281.21,...,0.0,0.0,0.19,0.0,100.0,20.0,40.09,1.0,90.0,39.55
3,1,4,Fortuna,-124.156034,40.597407,277.15,292.44,280.05,285.61,282.12,...,0.0,0.3,0.35,0.0,100.0,75.0,55.62,2.5,90.0,37.95
4,1,5,Fortuna,-124.156034,40.597407,271.67,291.44,279.30,285.87,282.50,...,0.0,0.0,0.67,0.0,100.0,73.5,49.83,1.0,90.0,39.77
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183361,12,27,Progress,-121.801080,39.063227,265.64,292.42,271.57,280.66,274.16,...,0.0,0.0,0.19,0.0,92.0,31.5,42.43,1.0,90.0,41.84
183362,12,28,Progress,-121.801080,39.063227,260.97,287.58,272.70,280.75,276.65,...,0.0,0.0,0.33,0.0,97.0,68.0,48.03,1.0,90.0,40.63
183363,12,29,Progress,-121.801080,39.063227,260.98,289.62,272.62,279.45,275.92,...,0.0,0.0,0.90,0.0,97.0,84.0,62.04,22.0,90.0,35.96
183364,12,30,Progress,-121.801080,39.063227,262.63,283.24,272.23,277.14,275.14,...,0.0,0.0,0.17,0.0,100.0,75.0,56.03,20.0,90.0,36.48


In [None]:
# To CSV and download off Colab
# The CSV is written first so the file exists even when the Colab-only
# download helper cannot be imported.
df2.to_csv('weather_data.csv', index=False)

try:
    from google.colab import files
except ImportError:
    # Not running on Colab: there is no browser download to trigger, but the
    # file written above remains in the working directory.
    print("google.colab is not available; weather_data.csv was saved locally only.")
else:
    files.download('weather_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>