In [3]:
%matplotlib notebook

#import main python libraries
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [4]:
#read in CSV created during data collection in the WeatherPY_DataGenerationRetrieval.ipynb
cities_df = pd.read_csv("data/cities_in_owm.csv", index_col=[0])
cities_df.head()

Unnamed: 0,CityID,Name,Country,Lat,Lng,Max Temp,Humidity,Cloudiness,Wind Speed (mph),Date_Time
0,4020109,Atuona,PF,-9.8,-139.03,80.02,73,29,14.65,1573883089
1,3534915,Trinidad,CU,21.81,-79.98,73.4,100,43,3.36,1573883089
2,1015776,Bredasdorp,ZA,-34.53,20.04,62.6,93,91,17.22,1573883089
3,3833367,Ushuaia,AR,-54.81,-68.31,42.8,87,40,5.82,1573883089
4,3896218,Castro,CL,-42.48,-73.76,50.0,93,75,9.17,1573883089


In [8]:
#identify bins for latitude bracketing 
max_lat = abs(cities_df["Lat"]).max()
min_lat = abs(cities_df["Lat"]).min()
bins = [0,9,19,29,39,49,59,69,max_lat]
label_names = ["<10","10-19","20-29","30-39","40-49","50-59","60-69","70+"]

#add lat bins to cities_df 
cities_df["Lat Bins"] = pd.cut(abs(cities_df["Lat"]), bins, labels=label_names)
cities_df.head()

Unnamed: 0,CityID,Name,Country,Lat,Lng,Max Temp,Humidity,Cloudiness,Wind Speed (mph),Date_Time,Lat Bins
0,4020109,Atuona,PF,-9.8,-139.03,80.02,73,29,14.65,1573883089,10-19
1,3534915,Trinidad,CU,21.81,-79.98,73.4,100,43,3.36,1573883089,20-29
2,1015776,Bredasdorp,ZA,-34.53,20.04,62.6,93,91,17.22,1573883089,30-39
3,3833367,Ushuaia,AR,-54.81,-68.31,42.8,87,40,5.82,1573883089,50-59
4,3896218,Castro,CL,-42.48,-73.76,50.0,93,75,9.17,1573883089,40-49


In [3]:
cities_in_df = len(cities_df)
cities_in_positive_lats = len(cities_df.where(cities_df["Lat"]>0).dropna())
cities_in_negative_lats = len(cities_df.where(cities_df["Lat"]<0).dropna())

print(f"Cities in dataset: {cities_in_df}")
print(f"Cities in Northern Hemisphere (positive latitudes): {cities_in_positive_lats}")
print(f"Cities in Southern Hemisphere (negative latitudes): {cities_in_negative_lats}")

Cities in dataset: 1244
Cities in Northern Hemisphere (positive latitudes): 881
Cities in Southern Hemisphere (negative latitudes): 354


In [4]:
#This function creates two plots for each set of variables passed into it: 
# 1. Scatter plot showing relationship between city latitude and a specific temperature-related variable
# 2. A second scatter plot containing the same information, 
#     but additionally showing regression lines and line-related statistics.
#     Even though regression isn't the appropriate framework for some of the plots, I thought it was 
#     cool to try and build an automated regression analysis into each one. 
#
# Arguments:
#          data = DF with three columns: "Lat", 
#                                        the column to be plotted against it, 
#                                        the unix timestamp from the moment of colleciton
#          yticks = yticks for current plots
#          y_label = ylabel for the current plots
#          plot_title = title for the current plot
#          filename = "save as" name used to export the current plot to a PNG

def plot_latitude_data(df, y_ticks, y_label, plot_title, filename):
    
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(9, 4))
 
    #get values from passed df 
    x_axis = df["Lat"]
    y_axis = df.iloc[:,1]
    unix_time = int(df["Date_Time"].mean())

    #set default values for different axes objects
    xlabel_fontsize = 8
    plot_bgcolor = "#DBE8F4"
    x_ticks = [-80,-60,-40,-20,0,20,40,60,80,100]
    m_size = 10
    m_color = "#F4DBDC"
    m_edgecolor="#51575B"
    saveas_filename = "plots/" + filename
    
    #get date value and convert to human readable format
    date_val = datetime.utcfromtimestamp(unix_time).strftime('%m-%d-%Y')
    
    #get residuals and create regression line for positive latitudes
    positivelats_df = df.where(df["Lat"]>0).dropna()
    positive_lats = positivelats_df["Lat"]
    positive_lats_maxtemp = positivelats_df.iloc[:,1]
    p_slope, p_int, p_r, p_p, p_std_err = stats.linregress(positive_lats, positive_lats_maxtemp)
    p_fit = p_slope * positive_lats + p_int
    
    #get residuals and create regression line for negative latitudes
    negativelats_df = df.where(df["Lat"]<0).dropna()
    negative_lats = negativelats_df["Lat"]
    negative_lats_maxtemp = negativelats_df.iloc[:,1]
    n_slope, n_int, n_r, n_p, n_std_err = stats.linregress( negative_lats, negative_lats_maxtemp)
    #
    n_fit = n_slope * negative_lats + n_int
        
    slope_properties_text = "Regression Line Statistics:" + "\n" \
                          + "--------------------------" + "\n" \
                          + "Positive Latitudes (black line)" + "\n" \
                          + "--------------------------" + "\n" \
                          + "Slope: " + str(p_slope) + "\n" \
                          + "Y Int: " + str(p_int) +  "\n" \
                          + "R: " + str(p_r) +  "\n" \
                          + "P: " + str(p_p) +  "\n" \
                          + "Std Err: " + str(p_std_err) + "\n" \
                          + "--------------------------" + "\n" \
                          + "Negative Latitudes (red line)" + "\n" \
                          + "--------------------------" + "\n" \
                          + "Slope: " + str(n_slope) + "\n" \
                          + "Y Int: " + str(n_int) +  "\n" \
                          + "R: " + str(n_r) +  "\n" \
                          + "P: " + str(n_p) +  "\n" \
                          + "Std Err: " + str(n_std_err) + "\n" \

    #setup line statistics text box that will be placed to the right of the second scatterplot
    text_box = dict(boxstyle='square', facecolor=plot_bgcolor, alpha=1)
    
    #set title of figure 
    fig.suptitle(f"{plot_title} ({date_val})")
 
    #setup first scatter plot 
    axes[0].scatter(x_axis, y_axis,s=m_size, c=m_color, edgecolors=m_edgecolor,alpha=1.0)

    axes[0].set_facecolor(plot_bgcolor)
    axes[0].set_xticks(x_ticks)
    axes[0].set_xlabel("Latitude", fontsize=xlabel_fontsize)
    axes[0].set_yticks(y_ticks)
    axes[0].set_ylabel(y_label)
    axes[0].grid(color='w', alpha=0.5)

    #setup second scatter plot with reduced alpha and regression line
    axes[1].scatter(x_axis, y_axis,s=m_size, c=m_color, edgecolors=m_edgecolor,alpha=0.25)
    axes[1].plot(positive_lats, p_fit, "k--", linewidth=2)
    axes[1].plot(negative_lats, n_fit, "r--", linewidth=2) 
    axes[1].set_facecolor(plot_bgcolor)
    axes[1].set_xticks(x_ticks)
    axes[1].set_xlabel("Latitude \n\n Marker alpha reduced, \n With regression lines", fontsize=xlabel_fontsize)
    axes[1].set_yticks(y_ticks)
    axes[1].set_yticklabels([])
    axes[1].grid(color='w', alpha=0.5) 
    axes[1].text(x = 1.05, y = .98, 
                 s = slope_properties_text, fontsize=7, 
                 transform=axes[1].transAxes,                  
                 bbox=text_box, va="top")
    

    plt.tight_layout(pad=3, w_pad=0.5, h_pad=0.5)
    plt.savefig(saveas_filename, dpi=300)


In [40]:
def plot_means(df, filename, ylabel):
    df.plot()
    plt.xlabel("Latitudes grouped in degress of 10")
    plt.ylabel(ylabel)
    plt.savefig(filename)

In [63]:
#get means of Max Temperature, broken into Latitude groups of 10 degrees
mean_df = cities_df.groupby(["Lat Bins"]).mean()[["Max Temp"]]
mean_df

Unnamed: 0_level_0,Max Temp
Lat Bins,Unnamed: 1_level_1
<10,79.36641
10-19,76.82545
20-29,74.270455
30-39,59.794155
40-49,41.394181
50-59,23.682244
60-69,17.898403
70+,13.0468


In [5]:
plot_latitude_data(df = cities_df[["Lat","Max Temp","Date_Time"]], 
                   y_ticks = np.arange(-100,200,50), 
                   y_label = "Max Temperature (F)",
                   plot_title = "City Latitude vs. Max Temperature",
                   filename ="latitude_vs_maxtemp.png")

<IPython.core.display.Javascript object>

In [62]:
#plot means
plot_means(df = max_temp_df, 
           ylabel="Mean of Max Temp",
           filename = "plots/max_temp_means.png")

<IPython.core.display.Javascript object>

In [55]:
humidity_df = cities_df.groupby(["Lat Bins"]).mean()[["Humidity"]]
humidity_df

Unnamed: 0_level_0,Humidity
Lat Bins,Unnamed: 1_level_1
<10,76.671795
10-19,65.306878
20-29,61.056818
30-39,61.072464
40-49,67.186441
50-59,79.782051
60-69,82.983193
70+,80.4


In [64]:
plot_means(df = humidity_df, 
           ylabel="Mean of Humidity",
           filename = "plots/humidity_means.png")

<IPython.core.display.Javascript object>

In [6]:
plot_latitude_data(df = cities_df[["Lat","Humidity","Date_Time"]], 
                   y_ticks = np.arange(-20,140,20), 
                   y_label = "Humidity (%)",
                   plot_title = "City Latitude vs. Humidity (%)",
                   filename ="latitude_vs_humidity.png")

<IPython.core.display.Javascript object>

In [65]:
cloudiness_df = cities_df.groupby(["Lat Bins"]).mean()[["Cloudiness"]]
cloudiness_df

Unnamed: 0_level_0,Cloudiness
Lat Bins,Unnamed: 1_level_1
<10,63.374359
10-19,55.306878
20-29,35.585227
30-39,40.550725
40-49,50.581921
50-59,67.967949
60-69,84.0
70+,57.4


In [66]:
plot_means(df = cloudiness_df, 
           ylabel="Mean of Cloudiness",
           filename = "plots/cloudiness_means.png")

<IPython.core.display.Javascript object>

In [7]:
plot_latitude_data(df = cities_df[["Lat","Cloudiness","Date_Time"]], 
                   y_ticks = np.arange(-20,140,20), 
                   y_label = "Cloudiness (%)",
                   plot_title = "City Latitude vs. Cloudiness (%)",
                   filename ="latitude_vs_cloudiness.png")

<IPython.core.display.Javascript object>

In [67]:
windspeed_df = cities_df.groupby(["Lat Bins"]).mean()[["Wind Speed (mph)"]]
windspeed_df

Unnamed: 0_level_0,Wind Speed (mph)
Lat Bins,Unnamed: 1_level_1
<10,5.920615
10-19,7.250688
20-29,8.20125
30-39,7.633671
40-49,10.012316
50-59,8.414295
60-69,10.560252
70+,7.0952


In [68]:
plot_means(df = windspeed_df, 
           ylabel="Mean of Wind Speed",
           filename = "plots/winddspeed_means.png")

<IPython.core.display.Javascript object>

In [6]:
plot_latitude_data(df = cities_df[["Lat","Wind Speed (mph)","Date_Time"]], 
                   y_ticks = np.arange(-5,45,5), 
                   y_label = "Wind Speed (mph)",
                   plot_title = "City Latitude vs. Wind Speed (mph)",
                   filename ="latitude_vs_windspeed.png")

<IPython.core.display.Javascript object>