In [1]:
# Importing the libraries
import warnings
from getpass import getpass
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Supressing warnings with code found here:  https://stackoverflow.com/questions/48828824/disable-warnings-in-jupyter-notebook
warnings.filterwarnings('ignore')

In [3]:
# Loading the weather data from the databases
# Reading the data from our database
# Debugging with syntax from here:  https://stackoverflow.com/questions/23839656/sqlalchemy-no-password-supplied-error
password = getpass('Enter database password')

# Percent-encoding the password before placing it in the connection URL, so that characters
# such as '@', ':', '/' or '%' in the password cannot be misread as URL delimiters
# Building the URL once so both tables are read with the same connection string
database_url = f"postgresql://postgres:{quote(password, safe='')}@localhost/Final_Project_Travel"

london_weather_df = pd.read_sql_table('london_weather_yyyy_mm', database_url)
nyc_weather_df = pd.read_sql_table('nyc_weather_yyyy_mm', database_url)

Enter database password········


In [4]:
# Removing the 'index' column from each weather DataFrame
# (columns= already names the axis, so no separate axis argument is needed)
london_weather_df = london_weather_df.drop(columns='index')
nyc_weather_df = nyc_weather_df.drop(columns='index')

# Linear regression model avg high temp vs. year

In [5]:
# Fitting one linear regression (avg high temp vs. year) per city and per calendar month,
# scoring each fit on a held-out test split
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Pairing each city name with its weather DataFrame so the loop needs no if/else
weather_by_city = {'london': london_weather_df, 'nyc': nyc_weather_df}

# Creating empty list to hold the outputs of our model
high_temp_model_outputs = []

for city in cities:
    city_weather_df = weather_by_city[city]

    for i in range(1, 13):
        # Keeping only the rows for this calendar month
        city_month_weather_df = city_weather_df[city_weather_df['month_num'] == i]

        # Year is the single feature (reshaped into one column); avg high temp is the target
        X = city_month_weather_df['year'].values.reshape(-1, 1)
        y = city_month_weather_df['avg_high_temp_f']

        # Holding out a test split, then fitting on the training rows only
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77)
        model = LinearRegression().fit(X_train, y_train)

        # Predicting the held-out rows so the fit can be scored
        y_pred = model.predict(X_test)

        # Recording the fitted line and its test-set error metrics
        high_temp_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'high_temp',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': metrics.mean_absolute_error(y_test, y_pred),
            'mse': metrics.mean_squared_error(y_test, y_pred),
            'r2': r2_score(y_test, y_pred)
        })

# Creating a DataFrame from our results
high_temp_model_df = pd.DataFrame(high_temp_model_outputs)

# Evaluating each fitted line at year 2022 to get the high temp prediction
high_temp_model_df['2022_prediction_F'] = high_temp_model_df['intercept'] + high_temp_model_df['coef'] * 2022

In [6]:
# Displaying the high temp model results: one row per city and month with the fitted
# coef/intercept, the test-set MAE, MSE and R2, and the 2022 prediction
high_temp_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,r2,2022_prediction_F
0,london,1,high_temp,0.055595,-64.667518,2.511201,9.064916,0.198946,47.745771
1,london,2,high_temp,0.070913,-93.901723,2.8704,10.303177,-0.031886,49.485265
2,london,3,high_temp,0.068267,-83.651503,2.510225,11.540598,-0.071905,54.38461
3,london,4,high_temp,0.08439,-110.554051,1.907322,4.950743,0.301717,60.082036
4,london,5,high_temp,0.054758,-44.928559,2.009374,6.375997,0.234187,65.791692
5,london,6,high_temp,0.050247,-29.844378,2.305965,7.625064,-0.031636,71.755036
6,london,7,high_temp,0.071121,-67.847268,3.57261,18.53194,0.176176,75.959983
7,london,8,high_temp,0.064212,-55.22249,2.254411,8.481454,0.038528,74.613772
8,london,9,high_temp,0.034023,-0.31886,1.682671,4.04738,-0.073563,68.476023
9,london,10,high_temp,0.009538,40.869639,1.947352,6.156941,-0.123835,60.154595


In [7]:
# Saving the high temp model results as a CSV file
# index=True is the pandas default, spelled out here: the row index is written as the first column
high_temp_csv_path = '../Resources/high_temp_model_with_r2.csv'
high_temp_model_df.to_csv(high_temp_csv_path, index=True)

# Linear regression model total precip/rainfall vs. year

In [8]:
# Fitting one linear regression (total precip/rainfall vs. year) per city and per calendar month,
# scoring each fit on a held-out test split
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Each city has its own weather DataFrame and its own name for the precipitation column
rainfall_sources = {
    'london': (london_weather_df, 'total_rainfall_inches'),
    'nyc': (nyc_weather_df, 'avg_total_precipitation_inches'),
}

# Creating empty list to hold the outputs of our model
total_rainfall_model_outputs = []

for city in cities:
    city_weather_df, target_column = rainfall_sources[city]

    for i in range(1, 13):
        # Keeping only the rows for this calendar month
        city_month_weather_df = city_weather_df[city_weather_df['month_num'] == i]

        # Year is the single feature (reshaped into one column); the city's precipitation column is the target
        X = city_month_weather_df['year'].values.reshape(-1, 1)
        y = city_month_weather_df[target_column]

        # Holding out a test split, then fitting on the training rows only
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77)
        model = LinearRegression().fit(X_train, y_train)

        # Predicting the held-out rows so the fit can be scored
        y_pred = model.predict(X_test)

        # Recording the fitted line and its test-set error metrics
        total_rainfall_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'rainfall',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': metrics.mean_absolute_error(y_test, y_pred),
            'mse': metrics.mean_squared_error(y_test, y_pred),
            'r2': r2_score(y_test, y_pred)
        })

# Creating a DataFrame from our results
total_rainfall_model_df = pd.DataFrame(total_rainfall_model_outputs)

# Evaluating each fitted line at year 2022 to get the rainfall prediction
total_rainfall_model_df['2022_prediction_inches'] = (total_rainfall_model_df['intercept']
                                                     + total_rainfall_model_df['coef'] * 2022)

In [9]:
# Displaying the rainfall model results: one row per city and month with the fitted
# coef/intercept, the test-set MAE, MSE and R2, and the 2022 prediction
total_rainfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,r2,2022_prediction_inches
0,london,1,rainfall,0.012769,-23.07,0.945556,1.299402,-0.19466,2.748152
1,london,2,rainfall,0.010685,-19.639099,0.8156,0.930291,0.027168,1.966814
2,london,3,rainfall,-0.003241,8.106947,0.634483,0.591444,-0.015035,1.553366
3,london,4,rainfall,-0.004042,9.602574,0.796679,1.282317,-0.060409,1.429421
4,london,5,rainfall,0.006767,-11.486851,0.901333,1.134292,-0.165432,2.196136
5,london,6,rainfall,0.002068,-2.218262,1.111741,1.717365,-0.036882,1.963975
6,london,7,rainfall,-0.003373,8.51063,0.904318,1.082374,-0.041946,1.691105
7,london,8,rainfall,0.001732,-1.300431,0.857973,1.279114,-0.050495,2.202638
8,london,9,rainfall,-0.010682,23.28527,0.701964,0.790705,-0.050744,1.6862
9,london,10,rainfall,0.012965,-23.348448,1.326355,3.101383,-0.053433,2.867499


In [10]:
# Saving the rainfall model results as a CSV file
# index=True is the pandas default, spelled out here: the row index is written as the first column
total_rainfall_csv_path = '../Resources/total_rainfall_model_with_r2.csv'
total_rainfall_model_df.to_csv(total_rainfall_csv_path, index=True)

# Linear regression model NYC snowfall vs. Year

In [11]:
# Fitting one linear regression (NYC snowfall vs. year) per calendar month,
# scoring each fit on a held-out test split
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_snowfall_model_outputs = []

for i in range(1, 13):
    # Keeping only the NYC rows for this calendar month
    city_month_weather_df = nyc_weather_df[nyc_weather_df['month_num'] == i]

    # Year is the single feature (reshaped into one column); snowfall is the target
    X = city_month_weather_df['year'].values.reshape(-1, 1)
    y = city_month_weather_df['avg_total_snowfall_inches']

    # Holding out a test split, then fitting on the training rows only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77)
    model = LinearRegression().fit(X_train, y_train)

    # Predicting the held-out rows so the fit can be scored
    y_pred = model.predict(X_test)

    # Recording the fitted line and its test-set error metrics
    total_snowfall_model_outputs.append({
        'city': 'NYC',
        'month_num': i,
        'weather_factor': 'snowfall',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': metrics.mean_absolute_error(y_test, y_pred),
        'mse': metrics.mean_squared_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred)
    })

# Creating a DataFrame from our results
total_snowfall_model_df = pd.DataFrame(total_snowfall_model_outputs)

# Evaluating each fitted line at year 2022 to get the snowfall prediction
total_snowfall_model_df['2022_prediction_inches'] = (total_snowfall_model_df['intercept']
                                                     + total_snowfall_model_df['coef'] * 2022)

In [12]:
# Displaying the NYC snowfall model results: one row per month with the fitted
# coef/intercept, the test-set MAE, MSE and R2, and the 2022 prediction
total_snowfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,r2,2022_prediction_inches
0,NYC,1,snowfall,0.048111,-88.689927,7.022397,75.773701,-0.146722,8.590144
1,NYC,2,snowfall,0.104866,-199.200691,7.420587,74.026763,-0.131289,12.838135
2,NYC,3,snowfall,-0.023835,51.450265,4.957437,42.241818,-0.068098,3.256561
3,NYC,4,snowfall,-0.003261,6.688058,1.189853,7.049914,-0.141285,0.093879
4,NYC,5,snowfall,-0.0,0.0,0.0,0.0,1.0,0.0
5,NYC,6,snowfall,-0.0,0.0,0.0,0.0,1.0,0.0
6,NYC,7,snowfall,-0.0,0.0,0.0,0.0,1.0,0.0
7,NYC,8,snowfall,-0.0,0.0,0.0,0.0,1.0,0.0
8,NYC,9,snowfall,0.0,0.0,0.0,0.0,1.0,0.0
9,NYC,10,snowfall,0.003829,-7.514607,0.138631,0.0236,0.0,0.227465


In [13]:
# Saving the snowfall model results as a CSV file
# index=True is the pandas default, spelled out here: the row index is written as the first column
total_snowfall_csv_path = '../Resources/total_snowfall_model_with_r2.csv'
total_snowfall_model_df.to_csv(total_snowfall_csv_path, index=True)

# Linear regression model London sunshine hours vs. Year

In [14]:
# Fitting one linear regression (London sunshine hours vs. year) per calendar month,
# scoring each fit on a held-out test split
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_sunshine_model_outputs = []

for i in range(1, 13):
    # Keeping only the London rows for this calendar month
    city_month_weather_df = london_weather_df[london_weather_df['month_num'] == i]

    # Year is the single feature (reshaped into one column); sunshine duration is the target
    X = city_month_weather_df['year'].values.reshape(-1, 1)
    y = city_month_weather_df['total_sunshine_duration_hours']

    # Holding out a test split, then fitting on the training rows only
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77)
    model = LinearRegression().fit(X_train, y_train)

    # Predicting the held-out rows so the fit can be scored
    y_pred = model.predict(X_test)

    # Recording the fitted line and its test-set error metrics
    total_sunshine_model_outputs.append({
        'city': 'London',
        'month_num': i,
        'weather_factor': 'sunshine',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': metrics.mean_absolute_error(y_test, y_pred),
        'mse': metrics.mean_squared_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred)
    })

# Creating a DataFrame from our results
total_sunshine_model_df = pd.DataFrame(total_sunshine_model_outputs)

# Evaluating each fitted line at year 2022 to get the sunshine prediction
total_sunshine_model_df['2022_prediction_hours'] = (total_sunshine_model_df['intercept']
                                                    + total_sunshine_model_df['coef'] * 2022)

In [15]:
# Displaying the London sunshine model results: one row per month with the fitted
# coef/intercept, the test-set MAE, MSE and R2, and the 2022 prediction
total_sunshine_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,r2,2022_prediction_hours
0,London,1,sunshine,0.193598,-331.284701,12.147622,241.626963,-0.107433,60.171431
1,London,2,sunshine,0.246556,-417.65541,17.933069,496.979841,0.034376,80.881445
2,London,3,sunshine,0.235475,-355.677778,31.877115,1439.606326,0.010355,120.452854
3,London,4,sunshine,0.798859,-1429.547694,23.297925,1013.926968,0.145636,185.745801
4,London,5,sunshine,0.010946,175.027661,36.264322,2046.500808,-0.039277,197.161412
5,London,6,sunshine,-0.477385,1150.506203,30.428182,1387.25838,0.016446,185.23331
6,London,7,sunshine,0.190744,-180.868937,40.166941,1975.555113,0.031555,204.816058
7,London,8,sunshine,0.188955,-191.413324,32.110273,1902.30742,-0.067964,190.65392
8,London,9,sunshine,0.360865,-569.888916,22.159481,664.248347,-0.27364,159.780269
9,London,10,sunshine,0.026334,56.048657,20.616834,669.439077,-0.004266,109.295063


In [16]:
# Saving the sunshine model results as a CSV file
# index=True is the pandas default, spelled out here: the row index is written as the first column
total_sunshine_csv_path = '../Resources/total_sunshine_model_with_r2.csv'
total_sunshine_model_df.to_csv(total_sunshine_csv_path, index=True)

# Combining models into prediction outputs

In [17]:
# Creating a DataFrame to hold the predicted ranges
# Taking an explicit copy: later cells add columns to predictions_df, and writing into a
# selection of high_temp_model_df is the case pandas flags with SettingWithCopyWarning
predictions_df = high_temp_model_df[['city', 'month_num', '2022_prediction_F']].copy()

In [18]:
# High Temp Predictions
# Turning each 2022 high temp prediction into a "min-max" range of prediction -/+ MAE
predictions_df = (
    predictions_df
    # Raw range bounds: prediction minus / plus the model's mean absolute error
    .assign(
        min_pred_2022_F=predictions_df['2022_prediction_F'] - high_temp_model_df['mae'],
        max_pred_2022_F=predictions_df['2022_prediction_F'] + high_temp_model_df['mae'],
    )
    # Rounding the min down and the max up, formatting both as whole numbers, and joining with '-'
    .assign(**{
        'Predicted avg high temp (F)': lambda df: (
            np.floor(df['min_pred_2022_F']).map('{:.0f}'.format)
            + '-'
            + np.ceil(df['max_pred_2022_F']).map('{:.0f}'.format)
        )
    })
    # Dropping the unneeded columns
    .drop(columns=['2022_prediction_F', 'min_pred_2022_F', 'max_pred_2022_F'])
)

In [19]:
# Total precip predictions
# Turning each 2022 rainfall prediction into a "min-max" range of prediction -/+ MAE

rain_pred_2022 = total_rainfall_model_df['2022_prediction_inches']
rain_mae = total_rainfall_model_df['mae']

predictions_df = (
    predictions_df
    # Range bounds rounded to the nearest tenth; values are matched to predictions_df rows by index label
    .assign(
        min_precip_2022=(rain_pred_2022 - rain_mae).round(1),
        max_precip_2022=(rain_pred_2022 + rain_mae).round(1),
    )
    # Concatenating into the prediction column
    # Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
    .assign(**{
        'Predicted total rainfall (inches)': lambda df: (
            df['min_precip_2022'].map(str) + '-' + df['max_precip_2022'].map(str)
        )
    })
    # Dropping the unneeded columns
    .drop(columns=['min_precip_2022', 'max_precip_2022'])
)

In [20]:
# Separating prediction DF by city
# Taking explicit copies: later cells add columns to both per-city DataFrames, and writing
# into a row selection of predictions_df is the case pandas flags with SettingWithCopyWarning
london_predictions_df = predictions_df.loc[predictions_df['city'] == 'london'].copy()
nyc_predictions_df = predictions_df.loc[predictions_df['city'] == 'nyc'].copy()

In [21]:
# Debugging error creating nyc snowfall predictions with workaround
# Computing the snowfall range bounds (prediction -/+ MAE) as standalone Series
snow_pred_2022 = total_snowfall_model_df['2022_prediction_inches']
snow_mae = total_snowfall_model_df['mae']

min_snow_2022 = snow_pred_2022.sub(snow_mae)
max_snow_2022 = snow_pred_2022.add(snow_mae)

In [22]:
# Total snowfall

# Adding columns to hold the prediction ranges
# The bounds are attached by position (as plain lists) rather than by index label, so this
# relies on both DataFrames listing the months in the same order
# The lower bound is floored at 0 with clip(), since a negative snowfall makes no sense; this
# replaces a chained assignment (df[col][mask] = 0), which pandas does not guarantee will
# write back to the DataFrame
# Both bounds are rounded to the nearest tenth
nyc_predictions_df = nyc_predictions_df.assign(
    min_snow_2022=min_snow_2022.clip(lower=0).round(1).tolist(),
    max_snow_2022=max_snow_2022.round(1).tolist(),
)

# Concatenating into prediction columns
# Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
nyc_predictions_df['Predicted total snowfall (inches)'] = (nyc_predictions_df['min_snow_2022'].apply(str) + '-'
                                                               + nyc_predictions_df['max_snow_2022'].apply(str))

# Dropping the unneeded columns
nyc_predictions_df = nyc_predictions_df.drop(columns=['min_snow_2022', 'max_snow_2022'])

In [23]:
# Creating the month_dict: month number (1-12) -> three-letter month abbreviation
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_dict = dict(enumerate(month_abbreviations, start=1))

In [24]:
# Building the final NYC table: month name first, then the three prediction ranges
nyc_predictions_df = (
    nyc_predictions_df
    # Adding the month name looked up from month_dict
    .assign(Month=[month_dict[month_num] for month_num in nyc_predictions_df['month_num']])
    # Dropping the month num column
    .drop(columns='month_num')
    # Rearranging the columns (any column not listed here is left out)
    .loc[:, ['Month', 'Predicted avg high temp (F)',
             'Predicted total rainfall (inches)',
             'Predicted total snowfall (inches)']]
)

# Displaying the updated DataFrame
nyc_predictions_df

Unnamed: 0,Month,Predicted avg high temp (F),Predicted total rainfall (inches),Predicted total snowfall (inches)
12,Jan,35-43,1.9-5.1,1.6-15.6
13,Feb,38-46,2.6-4.7,5.4-20.3
14,Mar,48-55,3.6-6.7,0.0-8.2
15,Apr,60-67,3.5-6.5,0.0-1.3
16,May,69-75,3.2-6.5,0.0-0.0
17,Jun,78-83,2.6-7.0,0.0-0.0
18,Jul,82-88,2.8-7.3,0.0-0.0
19,Aug,82-86,3.8-7.7,0.0-0.0
20,Sep,74-79,2.6-7.5,0.0-0.0
21,Oct,62-67,3.9-7.3,0.1-0.4


In [25]:
# Total sunshine
# Turning each 2022 sunshine prediction into a "min-max" range of prediction -/+ MAE

sun_pred_2022 = total_sunshine_model_df['2022_prediction_hours']
sun_mae = total_sunshine_model_df['mae']

london_predictions_df = (
    london_predictions_df
    # Raw range bounds; values are matched to the London rows by index label
    .assign(
        min_sun_2022=sun_pred_2022 - sun_mae,
        max_sun_2022=sun_pred_2022 + sun_mae,
    )
    # Rounding the min down and the max up, formatting both as whole numbers, and joining with '-'
    .assign(**{
        'Predicted total sunshine (hours)': lambda df: (
            np.floor(df['min_sun_2022']).map('{:.0f}'.format)
            + '-'
            + np.ceil(df['max_sun_2022']).map('{:.0f}'.format)
        )
    })
    # Dropping the unneeded columns
    .drop(columns=['min_sun_2022', 'max_sun_2022'])
)

In [26]:
# Adding month name to the DataFrame
london_predictions_df = london_predictions_df.assign(
    Month=london_predictions_df['month_num'].map(lambda month_num: month_dict[month_num])
)

# Dropping the unneeded columns
# drop() returns a new DataFrame, so the result has to be assigned back; calling it without
# assignment leaves the DataFrame unchanged
london_predictions_df = london_predictions_df.drop(columns=['city', 'month_num'])

# Rearranging the columns
london_predictions_df = london_predictions_df[['Month', 'Predicted avg high temp (F)',
                                             'Predicted total rainfall (inches)',
                                             'Predicted total sunshine (hours)']]

# Displaying the updated DataFrame
london_predictions_df

Unnamed: 0,Month,Predicted avg high temp (F),Predicted total rainfall (inches),Predicted total sunshine (hours)
0,Jan,45-51,1.8-3.7,48-73
1,Feb,46-53,1.2-2.8,62-99
2,Mar,51-57,0.9-2.2,88-153
3,Apr,58-62,0.6-2.2,162-210
4,May,63-68,1.3-3.1,160-234
5,Jun,69-75,0.9-3.1,154-216
6,Jul,72-80,0.8-2.6,164-245
7,Aug,72-77,1.3-3.1,158-223
8,Sep,66-71,1.0-2.4,137-182
9,Oct,58-63,1.5-4.2,88-130


In [27]:
# Saving the Predictions DataFrames as CSVs
# NOTE(review): both exports below are commented out, so running this cell writes nothing.
# NOTE(review): these paths start with 'Resources/' while the other to_csv calls in this
# notebook write to '../Resources/' - confirm the intended folder before re-enabling.
#nyc_predictions_df.to_csv('Resources/nyc_predictions_final_with_r_squared.csv', index=False)
#london_predictions_df.to_csv('Resources/london_predictions_final_with_r_squared.csv', index=False)