In [1]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from getpass import getpass
import numpy as np


In [2]:
# Loading the weather data from the databases
# Reading the data from our database
# Debugging with syntax from here:  https://stackoverflow.com/questions/23839656/sqlalchemy-no-password-supplied-error
password = getpass('Enter database password')
london_weather_df = pd.read_sql_table('london_weather_yyyy_mm', 
                                      f'postgresql://postgres:{password}@localhost/Final_Project_Travel')
nyc_weather_df = pd.read_sql_table('nyc_weather_yyyy_mm',
                                  f'postgresql://postgres:{password}@localhost/Final_Project_Travel')

Enter database password········


In [3]:
# Looking at the London weather
london_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,total_rainfall_inches,days_of_air_frost,total_sunshine_duration_hours
0,108,1957,1,47.66,36.86,1.555118,5,53.0
1,109,1957,2,48.20,37.22,2.748031,5,64.9
2,110,1957,3,57.02,42.26,1.000000,2,96.7
3,111,1957,4,57.56,41.36,0.224409,1,169.6
4,112,1957,5,61.16,43.70,0.838583,0,195.0
...,...,...,...,...,...,...,...,...
771,879,2021,4,55.58,37.22,0.283465,5,202.6
772,880,2021,5,61.70,44.96,3.330709,0,131.9
773,881,2021,6,72.50,55.94,3.472441,0,159.6
774,882,2021,7,75.56,58.82,2.409449,0,171.1


In [4]:
# Looking at the NYC weather
nyc_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,avg_total_precipitation_inches,avg_total_snowfall_inches
0,0,1948,1,31.2,19.6,4.74,15.3
1,1,1948,2,37.8,23.6,2.52,13.6
2,2,1948,3,50.6,33.5,3.51,4.8
3,3,1948,4,58.9,43.1,3.26,0.0
4,4,1948,5,67.6,52.8,7.58,0.0
...,...,...,...,...,...,...,...
879,879,2021,4,63.7,45.5,2.69,0.0
880,880,2021,5,71.7,54.0,4.36,0.0
881,881,2021,6,82.5,66.0,2.62,0.0
882,882,2021,7,83.0,69.0,11.09,0.0


In [5]:
# Dropping the index columns
london_weather_df = london_weather_df.drop(columns=['index'], axis=1)
nyc_weather_df = nyc_weather_df.drop(columns=['index'], axis=1)

In [6]:
# Checking for null values
print(london_weather_df.isnull().sum())
print(nyc_weather_df.isnull().sum())

year                             0
month_num                        0
avg_high_temp_f                  0
avg_low_temp_f                   0
total_rainfall_inches            0
days_of_air_frost                0
total_sunshine_duration_hours    0
dtype: int64
year                              0
month_num                         0
avg_high_temp_f                   0
avg_low_temp_f                    0
avg_total_precipitation_inches    0
avg_total_snowfall_inches         0
dtype: int64


In [7]:
# Checking our datatypes
print(london_weather_df.dtypes)
print(nyc_weather_df.dtypes)

year                               int64
month_num                          int64
avg_high_temp_f                  float64
avg_low_temp_f                   float64
total_rainfall_inches            float64
days_of_air_frost                  int64
total_sunshine_duration_hours    float64
dtype: object
year                                int64
month_num                           int64
avg_high_temp_f                   float64
avg_low_temp_f                    float64
avg_total_precipitation_inches    float64
avg_total_snowfall_inches         float64
dtype: object


# Linear regression model avg high temp vs. year

In [8]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
high_temp_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        y = city_month_weather_df.avg_high_temp_f

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('avg high temp (F)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
        else:
            plt.title('NYC ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        high_temp_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'high_temp',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })

# Creating a DataFrame from our results
high_temp_model_df = pd.DataFrame(high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
high_temp_model_df['2022_prediction_F'] = (2022 * high_temp_model_df['coef']) + high_temp_model_df['intercept']
high_temp_model_df['2023_prediction_F'] = (2023 * high_temp_model_df['coef']) + high_temp_model_df['intercept']

# # Formatting the columns
# high_temp_model_df['coef'] = high_temp_model_df['coef'].map('{:.2f}'.format)
# high_temp_model_df['intercept'] = high_temp_model_df['intercept'].map('{:.2f}'.format)
# high_temp_model_df['mae'] = high_temp_model_df['mae'].map('{:.2f}'.format)
# #high_temp_model_df['mse'] = high_temp_model_df['mse'].map('{:.2f}'.format)
# high_temp_model_df['rmse'] = high_temp_model_df['rmse'].map('{:.2f}'.format)
# high_temp_model_df['2022_prediction_F'] = high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
# high_temp_model_df['2023_prediction_F'] = high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
high_temp_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_F,2023_prediction_F
0,london,1,high_temp,0.078719,-110.645782,2.512167,10.347568,3.216764,48.523325,48.602043
1,london,2,high_temp,0.070187,-93.111515,2.983101,12.021301,3.467175,48.806905,48.877092
2,london,3,high_temp,0.040479,-28.91252,2.680401,9.647796,3.10609,52.935615,52.976094
3,london,4,high_temp,0.067774,-77.895542,2.295838,11.00335,3.31713,59.142864,59.210638
4,london,5,high_temp,0.061667,-58.53646,2.330613,7.378301,2.716303,66.153754,66.21542
5,london,6,high_temp,0.05407,-38.060673,3.522316,17.746072,4.212609,71.269258,71.323328
6,london,7,high_temp,0.074656,-75.219138,2.882128,13.843549,3.720692,75.735474,75.81013
7,london,8,high_temp,0.072898,-72.063103,2.419441,8.07404,2.841485,75.336449,75.409347
8,london,9,high_temp,0.035326,-3.024401,2.546457,8.896534,2.982706,68.403827,68.439152
9,london,10,high_temp,0.033376,-6.139362,1.854925,4.729352,2.174707,61.347483,61.380859


<Figure size 432x288 with 0 Axes>

# Linear regression model total precip/rainfall vs. year

In [9]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        if city == 'london':
            y = city_month_weather_df.total_rainfall_inches
        else:
            y = city_month_weather_df.avg_total_precipitation_inches

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('total rainfall (inches)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
        else:
            plt.title('NYC ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        total_rainfall_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'rainfall',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })
        
# Creating a DataFrame from our results
total_rainfall_model_df = pd.DataFrame(total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# # Formatting the columns
# total_rainfall_model_df['coef'] = total_rainfall_model_df['coef'].map('{:.2f}'.format)
# total_rainfall_model_df['intercept'] = total_rainfall_model_df['intercept'].map('{:.2f}'.format)
# total_rainfall_model_df['mae'] = total_rainfall_model_df['mae'].map('{:.2f}'.format)
# #total_rainfall_model_df['mse'] = total_rainfall_model_df['mse'].map('{:.2f}'.format)
# total_rainfall_model_df['rmse'] = total_rainfall_model_df['rmse'].map('{:.2f}'.format)
# total_rainfall_model_df['2022_prediction_inches'] = total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
# total_rainfall_model_df['2023_prediction_inches'] = total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_rainfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,london,1,rainfall,0.011198,-20.083539,1.130057,2.205978,1.485253,2.559426,2.570624
1,london,2,rainfall,0.015013,-28.109443,0.974048,1.131454,1.063698,2.24627,2.261283
2,london,3,rainfall,0.003578,-5.524264,0.69977,0.843726,0.918546,1.710361,1.713939
3,london,4,rainfall,-0.009558,20.662694,0.768128,0.800862,0.894909,1.336293,1.326735
4,london,5,rainfall,2e-06,1.954846,0.848353,0.918035,0.958141,1.957977,1.957978
5,london,6,rainfall,0.003628,-5.217171,1.162709,1.867116,1.366424,2.119307,2.122935
6,london,7,rainfall,-0.009103,19.967124,0.913954,1.19207,1.091819,1.560955,1.551852
7,london,8,rainfall,-0.002045,6.009389,0.950924,1.89335,1.37599,1.873881,1.871836
8,london,9,rainfall,-0.007318,16.314244,1.223875,2.55033,1.596975,1.517954,1.510637
9,london,10,rainfall,0.013024,-23.55814,1.630457,4.532011,2.128852,2.776592,2.789616


<Figure size 432x288 with 0 Axes>

# Linear regression model NYC snowfall vs. Year

In [10]:
# Looping through the months for NYC, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_snowfall_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for NYC for each month
    city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.avg_total_snowfall_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total snowfall (inches)')
    plt.legend()
    plt.title('NYC ' + str(i) + ' total snowfall vs. model')
    plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_snowfall_model_outputs.append({
        'city': 'NYC',
        'month_num': i,
        'weather_factor': 'snowfall',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_snowfall_model_df = pd.DataFrame(total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# # Formatting the columns
# total_snowfall_model_df['coef'] = total_snowfall_model_df['coef'].map('{:.2f}'.format)
# total_snowfall_model_df['intercept'] = total_snowfall_model_df['intercept'].map('{:.2f}'.format)
# total_snowfall_model_df['mae'] = total_snowfall_model_df['mae'].map('{:.2f}'.format)
# #total_snowfall_model_df['mse'] = total_snowfall_model_df['mse'].map('{:.2f}'.format)
# total_snowfall_model_df['rmse'] = total_snowfall_model_df['rmse'].map('{:.2f}'.format)
# total_snowfall_model_df['2022_prediction_inches'] = total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
# total_snowfall_model_df['2023_prediction_inches'] = total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_snowfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,NYC,1,snowfall,0.06412,-119.909356,5.586495,50.173509,7.083326,9.741007,9.805127
1,NYC,2,snowfall,0.084814,-159.342996,8.28199,89.102712,9.439423,12.151041,12.235855
2,NYC,3,snowfall,-0.024267,52.658439,4.190032,26.426875,5.140708,3.59034,3.566073
3,NYC,4,snowfall,-0.005299,10.803873,1.073365,6.260733,2.502146,0.089305,0.084006
4,NYC,5,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NYC,6,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,NYC,7,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NYC,8,snowfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,NYC,9,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,NYC,10,snowfall,0.00289,-5.67047,0.076374,0.008934,0.094522,0.173431,0.176321


<Figure size 432x288 with 0 Axes>

# Linear regression model London sunshine hours vs. Year

In [11]:
# Looping through the months for London, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_sunshine_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for London for each month
    city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.total_sunshine_duration_hours

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total sunshine (hours)')
    plt.legend()
    plt.title('London ' + str(i) + ' total sunshine vs. model')
    plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_sunshine_model_outputs.append({
        'city': 'London',
        'month_num': i,
        'weather_factor': 'sunshine',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_sunshine_model_df = pd.DataFrame(total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
total_sunshine_model_df['2022_prediction_hours'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
total_sunshine_model_df['2023_prediction_hours'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# # Formatting the columns
# total_sunshine_model_df['coef'] = total_sunshine_model_df['coef'].map('{:.2f}'.format)
# total_sunshine_model_df['intercept'] = total_sunshine_model_df['intercept'].map('{:.2f}'.format)
# total_sunshine_model_df['mae'] = total_sunshine_model_df['mae'].map('{:.2f}'.format)
# #total_sunshine_model_df['mse'] = total_sunshine_model_df['mse'].map('{:.2f}'.format)
# total_sunshine_model_df['rmse'] = total_sunshine_model_df['rmse'].map('{:.2f}'.format)
# total_sunshine_model_df['2022_prediction_hours'] = total_sunshine_model_df['2022_prediction_hours'].map('{:.2f}'.format)
# total_sunshine_model_df['2023_prediction_hours'] = total_sunshine_model_df['2023_prediction_hours'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_sunshine_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_hours,2023_prediction_hours
0,London,1,sunshine,0.212028,-367.325158,14.390139,313.603399,17.708851,61.395574,61.607602
1,London,2,sunshine,0.453142,-831.620415,16.588596,373.476254,19.325534,84.631924,85.085065
2,London,3,sunshine,0.068619,-23.458831,29.116957,1263.52712,35.546127,115.288242,115.356861
3,London,4,sunshine,0.717026,-1270.04412,29.625189,1341.000449,36.619673,179.782563,180.499589
4,London,5,sunshine,0.270412,-341.290783,25.916445,781.356047,27.952747,205.481836,205.752248
5,London,6,sunshine,-0.218581,631.691522,41.876638,2456.244195,49.56051,189.721725,189.503144
6,London,7,sunshine,0.159906,-120.397123,42.055263,2119.622107,46.039354,202.932414,203.09232
7,London,8,sunshine,-0.036317,258.409459,35.421667,2003.001313,44.754903,184.97693,184.940613
8,London,9,sunshine,-0.021151,186.270961,20.54892,695.629137,26.374782,143.503742,143.482591
9,London,10,sunshine,-0.016142,141.497834,11.511464,237.082654,15.397489,108.85886,108.842718


<Figure size 432x288 with 0 Axes>

In [12]:
# One high_temp model for each city with all months

# Creating empty list to hold the outputs of our model
combined_high_temp_model_outputs = []

# Looping through each city
for city in cities:
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df
        
    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    y = city_month_weather_df.avg_high_temp_f

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('avg high temp (F)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
#     else:
#         plt.title('NYC ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_high_temp_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'high_temp',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
combined_high_temp_model_df = pd.DataFrame(combined_high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
#combined_high_temp_model_df['2022_prediction_F'] = (2022 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']
#combined_high_temp_model_df['2023_prediction_F'] = (2023 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']

# Formatting the columns
combined_high_temp_model_df['coef_year'] = combined_high_temp_model_df['coef_year'].map('{:.2f}'.format)
combined_high_temp_model_df['coef_month_num'] = combined_high_temp_model_df['coef_month_num'].map('{:.2f}'.format)                                                                                                        
combined_high_temp_model_df['intercept'] = combined_high_temp_model_df['intercept'].map('{:.2f}'.format)
combined_high_temp_model_df['mae'] = combined_high_temp_model_df['mae'].map('{:.2f}'.format)
combined_high_temp_model_df['mse'] = combined_high_temp_model_df['mse'].map('{:.2f}'.format)
combined_high_temp_model_df['rmse'] = combined_high_temp_model_df['rmse'].map('{:.2f}'.format)
#combined_high_temp_model_df['2022_prediction_F'] = combined_high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
#combined_high_temp_model_df['2023_prediction_F'] = combined_high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_high_temp_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,high_temp,0.06,0.68,-60.43,9.06,106.91,10.34
1,nyc,high_temp,0.01,1.41,31.91,13.73,250.92,15.84


In [13]:
# One rainfall model for each city for all months
# Looks like this model performs slightly better or about the same overall vs. the model above where rainfall data is broken out by months

# Creating empty list to hold the outputs of our model
combined_total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Splitting out the weather for that city
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    if city == 'london':
        y = city_month_weather_df.total_rainfall_inches
    else:
        y = city_month_weather_df.avg_total_precipitation_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('total rainfall (inches)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
#     else:
#         plt.title('NYC ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_total_rainfall_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'rainfall',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })
        
# Creating a DataFrame from our results
combined_total_rainfall_model_df = pd.DataFrame(combined_total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
#total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# Formatting the columns
combined_total_rainfall_model_df['coef_year'] = combined_total_rainfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_rainfall_model_df['coef_month_num'] = combined_total_rainfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_rainfall_model_df['intercept'] = combined_total_rainfall_model_df['intercept'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['mae'] = combined_total_rainfall_model_df['mae'].map('{:.2f}'.format)
combined_total_rainfall_model_df['mse'] = combined_total_rainfall_model_df['mse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['rmse'] = combined_total_rainfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2022_prediction_inches'] = combined_total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2023_prediction_inches'] = combined_total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_rainfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,rainfall,0.0,0.05,-2.47,0.933314,1.37,1.169198
1,nyc,rainfall,0.02,0.04,-28.06,1.706981,5.02,2.239992


In [14]:
# One model for all months snowfall NYC
# Model above with one for each month performs better

# Creating empty list to hold the outputs of our model
combined_total_snowfall_model_outputs = []

# Splitting out the weather for NYC for each month
city_month_weather_df = nyc_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.avg_total_snowfall_inches

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total snowfall (inches)')
# plt.legend()
# plt.title('NYC ' + str(i) + ' total snowfall vs. model')
# plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_snowfall_model_outputs.append({
    'city': 'NYC',
    #'month_num': i,
    'weather_factor': 'snowfall',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_snowfall_model_df = pd.DataFrame(combined_total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
#total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# Formatting the columns
combined_total_snowfall_model_df['coef_year'] = combined_total_snowfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_snowfall_model_df['coef_month_num'] = combined_total_snowfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_snowfall_model_df['intercept'] = combined_total_snowfall_model_df['intercept'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mae'] = combined_total_snowfall_model_df['mae'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mse'] = combined_total_snowfall_model_df['mse'].map('{:.2f}'.format)
combined_total_snowfall_model_df['rmse'] = combined_total_snowfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2022_prediction_inches'] = combined_total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2023_prediction_inches'] = combined_total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_snowfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,NYC,snowfall,0.01,-0.51,-19.03,2.89,18.19,4.26


In [15]:
# One model for London sunshine data for all months
# Model above broken out by months performs better than combined model

# Creating empty list to hold the outputs of our model
combined_total_sunshine_model_outputs = []

city_month_weather_df = london_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.total_sunshine_duration_hours

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total sunshine (hours)')
# plt.legend()
# plt.title('London ' + str(i) + ' total sunshine vs. model')
# plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_sunshine_model_outputs.append({
    'city': 'London',
    #'month_num': i,
    'weather_factor': 'sunshine',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_sunshine_model_df = pd.DataFrame(combined_total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_sunshine_model_df['2022_prediction_inches'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
#total_sunshine_model_df['2023_prediction_inches'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# Formatting the columns
combined_total_sunshine_model_df['coef_year'] = combined_total_sunshine_model_df['coef_year'].map('{:.2f}'.format)
combined_total_sunshine_model_df['coef_month_num'] = combined_total_sunshine_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_sunshine_model_df['intercept'] = combined_total_sunshine_model_df['intercept'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mae'] = combined_total_sunshine_model_df['mae'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mse'] = combined_total_sunshine_model_df['mse'].map('{:.2f}'.format)
combined_total_sunshine_model_df['rmse'] = combined_total_sunshine_model_df['rmse'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2022_prediction_inches'] = combined_total_sunshine_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2023_prediction_inches'] = combined_total_sunshine_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_sunshine_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,London,sunshine,0.18,-0.56,-230.98,59.99,4684.32,68.44


In [16]:
# Calculating mean MSE from sunshine model above
total_sunshine_model_df.groupby(['city']).mean()['mse']

city
London    1011.513498
Name: mse, dtype: float64

In [17]:
# Calculating mean MSE from high temp model above
high_temp_model_df.groupby(['city']).mean()['mse']

city
london     9.410103
nyc       11.023716
Name: mse, dtype: float64

In [18]:
# Calculating mean MSE from total precip model above
total_rainfall_model_df.groupby(['city']).mean()['mse']

city
london    1.748637
nyc       5.813902
Name: mse, dtype: float64

In [19]:
# Calculating mean MSE from total snowfall model
total_snowfall_model_df.groupby(['city']).mean()['mse']

city
NYC    17.966214
Name: mse, dtype: float64

In [20]:
# Creating a DataFrame to hold the predicted ranges
predictions_df = high_temp_model_df[['city', 'month_num', '2022_prediction_F', '2023_prediction_F']]
predictions_df

Unnamed: 0,city,month_num,2022_prediction_F,2023_prediction_F
0,london,1,48.523325,48.602043
1,london,2,48.806905,48.877092
2,london,3,52.935615,52.976094
3,london,4,59.142864,59.210638
4,london,5,66.153754,66.21542
5,london,6,71.269258,71.323328
6,london,7,75.735474,75.81013
7,london,8,75.336449,75.409347
8,london,9,68.403827,68.439152
9,london,10,61.347483,61.380859


In [21]:
high_temp_model_df.columns

Index(['city', 'month_num', 'weather_factor', 'coef', 'intercept', 'mae',
       'mse', 'rmse', '2022_prediction_F', '2023_prediction_F'],
      dtype='object')

In [22]:
# High Temp Predictions
# Adding columns to hold the prediction ranges
predictions_df['min_pred_2022_F'] = (predictions_df['2022_prediction_F'] - high_temp_model_df['mae'])
predictions_df['max_pred_2022_F'] = (predictions_df['2022_prediction_F'] + high_temp_model_df['mae'])
predictions_df['min_pred_2023_F'] = (predictions_df['2023_prediction_F'] - high_temp_model_df['mae'])
predictions_df['max_pred_2023_F'] = (predictions_df['2023_prediction_F'] + high_temp_model_df['mae'])

# Rounding the min predictions down and the max predictions up
predictions_df['min_pred_2022_F'] = np.floor(predictions_df['min_pred_2022_F'])
predictions_df['min_pred_2023_F'] = np.floor(predictions_df['min_pred_2023_F'])
predictions_df['max_pred_2023_F'] = np.ceil(predictions_df['max_pred_2023_F'])
predictions_df['max_pred_2022_F'] = np.ceil(predictions_df['max_pred_2022_F'])

# Formatting the columns
predictions_df['min_pred_2022_F'] = predictions_df['min_pred_2022_F'].map('{:.0f}'.format)
predictions_df['min_pred_2023_F'] = predictions_df['min_pred_2023_F'].map('{:.0f}'.format)
predictions_df['max_pred_2022_F'] = predictions_df['max_pred_2022_F'].map('{:.0f}'.format)
predictions_df['max_pred_2023_F'] = predictions_df['max_pred_2023_F'].map('{:.0f}'.format)

# Concatenating into prediction columns
predictions_df['Predicted avg high temp 2022 (F)'] = (predictions_df['min_pred_2022_F'] + '-' 
                                                      + predictions_df['max_pred_2022_F'])
predictions_df['Predicted avg high temp 2023 (F)'] = (predictions_df['min_pred_2023_F'] + '-' 
                                                      + predictions_df['max_pred_2023_F'])

# Dropping the unneeded columns
predictions_df = predictions_df.drop(columns=['2022_prediction_F', '2023_prediction_F', 'min_pred_2022_F', 'max_pred_2022_F',
                            'min_pred_2023_F', 'max_pred_2023_F'], axis=1)

predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F)
0,london,1,46-52,46-52
1,london,2,45-52,45-52
2,london,3,50-56,50-56
3,london,4,56-62,56-62
4,london,5,63-69,63-69
5,london,6,67-75,67-75
6,london,7,72-79,72-79
7,london,8,72-78,72-78
8,london,9,65-71,65-71
9,london,10,59-64,59-64


In [23]:
# Total precip predictions

# Adding columns to hold the prediction ranges
predictions_df['min_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] - total_rainfall_model_df['mae'])
predictions_df['max_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] + total_rainfall_model_df['mae'])
predictions_df['min_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] - total_rainfall_model_df['mae'])
predictions_df['max_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] + total_rainfall_model_df['mae'])

# # Rounding the predictions
predictions_df['min_precip_2022'] = predictions_df['min_precip_2022'].round(1)
predictions_df['max_precip_2022'] = predictions_df['max_precip_2022'].round(1)
predictions_df['min_precip_2023'] = predictions_df['min_precip_2023'].round(1)
predictions_df['max_precip_2023'] = predictions_df['max_precip_2023'].round(1)

# # Concatenating into prediction columns
# Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
predictions_df['Predicted total rainfall 2022 (inches)'] = (predictions_df['min_precip_2022'].apply(str) + '-' 
                                                            + predictions_df['max_precip_2022'].apply(str))
predictions_df['Predicted total rainfall 2023 (inches)'] = (predictions_df['min_precip_2023'].apply(str) + '-' 
                                                            + predictions_df['max_precip_2023'].apply(str))

# # Dropping the unneeded columns
predictions_df = predictions_df.drop(columns=['min_precip_2022', 'max_precip_2022', 
                                              'min_precip_2023', 'max_precip_2023'], axis=1)

predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches)
0,london,1,46-52,46-52,1.4-3.7,1.4-3.7
1,london,2,45-52,45-52,1.3-3.2,1.3-3.2
2,london,3,50-56,50-56,1.0-2.4,1.0-2.4
3,london,4,56-62,56-62,0.6-2.1,0.6-2.1
4,london,5,63-69,63-69,1.1-2.8,1.1-2.8
5,london,6,67-75,67-75,1.0-3.3,1.0-3.3
6,london,7,72-79,72-79,0.6-2.5,0.6-2.5
7,london,8,72-78,72-78,0.9-2.8,0.9-2.8
8,london,9,65-71,65-71,0.3-2.7,0.3-2.7
9,london,10,59-64,59-64,1.1-4.4,1.2-4.4


In [24]:
# Separating prediction DF by city
london_predictions_df = predictions_df.loc[(predictions_df['city'] == 'london')]
nyc_predictions_df = predictions_df.loc[(predictions_df['city'] == 'nyc')]

In [25]:
# Debugging error creating nyc snowfall predictions with workaround
min_snow_2022 = total_snowfall_model_df['2022_prediction_inches'] - total_snowfall_model_df['mae']
max_snow_2022 = total_snowfall_model_df['2022_prediction_inches'] + total_snowfall_model_df['mae']
min_snow_2023 = total_snowfall_model_df['2023_prediction_inches'] - total_snowfall_model_df['mae']
max_snow_2023 = total_snowfall_model_df['2023_prediction_inches'] + total_snowfall_model_df['mae']

In [26]:
# Total snowfall

nyc_predictions_df['min_snow_2022'] = min_snow_2022
nyc_predictions_df['max_snow_2022'] = max_snow_2022
nyc_predictions_df['min_snow_2023'] = min_snow_2023
nyc_predictions_df['max_snow_2023'] = max_snow_2023

nyc_predictions_df




# # Adding columns to hold the prediction ranges
# predictions_df['min_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] - total_rainfall_model_df['mae'])
# predictions_df['max_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] + total_rainfall_model_df['mae'])
# predictions_df['min_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] - total_rainfall_model_df['mae'])
# predictions_df['max_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] + total_rainfall_model_df['mae'])

# # # Rounding the min predictions down and the max predictions up
# predictions_df['min_precip_2022'] = predictions_df['min_precip_2022'].round(1)
# predictions_df['max_precip_2022'] = predictions_df['max_precip_2022'].round(1)
# predictions_df['min_precip_2023'] = predictions_df['min_precip_2023'].round(1)
# predictions_df['max_precip_2023'] = predictions_df['max_precip_2023'].round(1)

# # # Concatenating into prediction columns
# # Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
# predictions_df['Predicted total rainfall 2022 (inches)'] = (predictions_df['min_precip_2022'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2022'].apply(str))
# predictions_df['Predicted total rainfall 2023 (inches)'] = (predictions_df['min_precip_2023'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2023'].apply(str))

# # # Dropping the unneeded columns
# predictions_df = predictions_df.drop(columns=['min_precip_2022', 'max_precip_2022', 
#                                               'min_precip_2023', 'max_precip_2023'], axis=1)

# predictions_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches),min_snow_2022,max_snow_2022,min_snow_2023,max_snow_2023
12,nyc,1,35-44,35-44,2.8-5.4,2.8-5.4,,,,
13,nyc,2,39-49,40-49,2.4-4.2,2.4-4.2,,,,
14,nyc,3,47-54,47-54,3.0-5.6,3.0-5.6,,,,
15,nyc,4,59-65,60-65,2.1-6.3,2.1-6.3,,,,
16,nyc,5,69-75,69-75,2.8-6.1,2.8-6.1,,,,
17,nyc,6,78-82,78-82,3.0-6.8,3.1-6.9,,,,
18,nyc,7,82-87,82-87,4.1-7.0,4.1-7.0,,,,
19,nyc,8,81-86,81-86,3.5-8.0,3.5-8.1,,,,
20,nyc,9,75-79,75-79,2.2-6.9,2.2-6.9,,,,
21,nyc,10,61-67,61-67,2.6-6.6,2.7-6.6,,,,


In [27]:
nyc_predictions_df.dtypes

city                                       object
month_num                                   int64
Predicted avg high temp 2022 (F)           object
Predicted avg high temp 2023 (F)           object
Predicted total rainfall 2022 (inches)     object
Predicted total rainfall 2023 (inches)     object
min_snow_2022                             float64
max_snow_2022                             float64
min_snow_2023                             float64
max_snow_2023                             float64
dtype: object

In [28]:
# Total sunshine

# Adding the prediction ranges
london_predictions_df['min_sun_2022'] = (total_sunshine_model_df['2022_prediction_hours'] - total_sunshine_model_df['mae'])
london_predictions_df['max_sun_2022'] = (total_sunshine_model_df['2022_prediction_hours'] + total_sunshine_model_df['mae'])
london_predictions_df['min_sun_2023'] = (total_sunshine_model_df['2023_prediction_hours'] - total_sunshine_model_df['mae'])
london_predictions_df['max_sun_2023'] = (total_sunshine_model_df['2023_prediction_hours'] + total_sunshine_model_df['mae'])

# Rounding the min predictions down and the max predictions up
london_predictions_df['min_sun_2022'] = np.floor(london_predictions_df['min_sun_2022'])
london_predictions_df['max_sun_2022'] = np.ceil(london_predictions_df['max_sun_2022'])
london_predictions_df['min_sun_2023'] = np.floor(london_predictions_df['min_sun_2023'])
london_predictions_df['max_sun_2023'] = np.ceil(london_predictions_df['max_sun_2023'])

# Formatting the columns
london_predictions_df['min_sun_2022'] = london_predictions_df['min_sun_2022'].map('{:.0f}'.format)
london_predictions_df['max_sun_2022'] = london_predictions_df['max_sun_2022'].map('{:.0f}'.format)
london_predictions_df['min_sun_2023'] = london_predictions_df['min_sun_2023'].map('{:.0f}'.format)
london_predictions_df['max_sun_2023'] = london_predictions_df['max_sun_2023'].map('{:.0f}'.format)

# Concatenating the predictions
london_predictions_df['Predicted total sunshine 2022 (hours)'] = (london_predictions_df['min_sun_2022'].apply(str) + '-' 
                                                            + london_predictions_df['max_sun_2022'].apply(str))
london_predictions_df['Predicted total sunshine 2023 (hours)'] = (london_predictions_df['min_sun_2023'].apply(str) + '-' 
                                                            + london_predictions_df['max_sun_2023'].apply(str))

# Dropping the unneeded columns
london_predictions_df = london_predictions_df.drop(columns=['min_sun_2022', 'max_sun_2022', 'min_sun_2023', 'max_sun_2023'])

london_predictions_df

# # # Rounding the min predictions down and the max predictions up
# predictions_df['min_precip_2022'] = predictions_df['min_precip_2022'].round(1)
# predictions_df['max_precip_2022'] = predictions_df['max_precip_2022'].round(1)
# predictions_df['min_precip_2023'] = predictions_df['min_precip_2023'].round(1)
# predictions_df['max_precip_2023'] = predictions_df['max_precip_2023'].round(1)

# # # Concatenating into prediction columns
# # Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
# predictions_df['Predicted total rainfall 2022 (inches)'] = (predictions_df['min_precip_2022'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2022'].apply(str))
# predictions_df['Predicted total rainfall 2023 (inches)'] = (predictions_df['min_precip_2023'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2023'].apply(str))

# # # Dropping the unneeded columns
# predictions_df = predictions_df.drop(columns=['min_precip_2022', 'max_precip_2022', 
#                                               'min_precip_2023', 'max_precip_2023'], axis=1)

# predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches),Predicted total sunshine 2022 (hours),Predicted total sunshine 2023 (hours)
0,london,1,46-52,46-52,1.4-3.7,1.4-3.7,47-76,47-76
1,london,2,45-52,45-52,1.3-3.2,1.3-3.2,68-102,68-102
2,london,3,50-56,50-56,1.0-2.4,1.0-2.4,86-145,86-145
3,london,4,56-62,56-62,0.6-2.1,0.6-2.1,150-210,150-211
4,london,5,63-69,63-69,1.1-2.8,1.1-2.8,179-232,179-232
5,london,6,67-75,67-75,1.0-3.3,1.0-3.3,147-232,147-232
6,london,7,72-79,72-79,0.6-2.5,0.6-2.5,160-245,161-246
7,london,8,72-78,72-78,0.9-2.8,0.9-2.8,149-221,149-221
8,london,9,65-71,65-71,0.3-2.7,0.3-2.7,122-165,122-165
9,london,10,59-64,59-64,1.1-4.4,1.2-4.4,97-121,97-121


In [29]:
# Dropping the London 2023 columns since they're nearly identical to the 2022 columns
london_predictions_df = london_predictions_df.drop(columns=['Predicted avg high temp 2023 (F)', 
                                                            'Predicted total rainfall 2023 (inches)',
                                                            'Predicted total sunshine 2023 (hours)'])

london_predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,london,1,46-52,1.4-3.7,47-76
1,london,2,45-52,1.3-3.2,68-102
2,london,3,50-56,1.0-2.4,86-145
3,london,4,56-62,0.6-2.1,150-210
4,london,5,63-69,1.1-2.8,179-232
5,london,6,67-75,1.0-3.3,147-232
6,london,7,72-79,0.6-2.5,160-245
7,london,8,72-78,0.9-2.8,149-221
8,london,9,65-71,0.3-2.7,122-165
9,london,10,59-64,1.1-4.4,97-121


In [30]:
# Renaming the columns
london_predictions_df.rename(columns={'Predicted avg high temp 2022 (F)': 'Predicted avg high temp 2022-23 (F)',
                                     'Predicted total rainfall 2022 (inches)': 'Predicted total rainfall 2022-23 (inches)',
                                     'Predicted total sunshine 2022 (hours)': 'Predicted total rainfall 2022-23 (hours)'})

london_predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,london,1,46-52,1.4-3.7,47-76
1,london,2,45-52,1.3-3.2,68-102
2,london,3,50-56,1.0-2.4,86-145
3,london,4,56-62,0.6-2.1,150-210
4,london,5,63-69,1.1-2.8,179-232
5,london,6,67-75,1.0-3.3,147-232
6,london,7,72-79,0.6-2.5,160-245
7,london,8,72-78,0.9-2.8,149-221
8,london,9,65-71,0.3-2.7,122-165
9,london,10,59-64,1.1-4.4,97-121


In [31]:
# Creating the month_dict
month_dict = {1: 'Jan',
             2: 'Feb',
             3: 'Mar',
             4: 'Apr',
             5: 'May',
             6: 'Jun',
             7: 'Jul',
             8: 'Aug',
             9: 'Sep',
             10: 'Oct',
             11: 'Nov',
             12: 'Dec'}

In [33]:
# Adding month name to the DataFrame
london_predictions_df['Month'] = london_predictions_df['month_num'].apply(lambda x:month_dict[x])

# Dropping the unneeded columns
london_predictions_df.drop(columns=['city', 'month_num'], axis=1)

# Rearranging the columns
london_predictions_df = london_predictions_df[['Month', 'Predicted avg high temp 2022 (F)', 
                                             'Predicted total rainfall 2022 (inches)',
                                             'Predicted total sunshine 2022 (hours)']]
                                             
london_predictions_df

Unnamed: 0,Month,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,Jan,46-52,1.4-3.7,47-76
1,Feb,45-52,1.3-3.2,68-102
2,Mar,50-56,1.0-2.4,86-145
3,Apr,56-62,0.6-2.1,150-210
4,May,63-69,1.1-2.8,179-232
5,Jun,67-75,1.0-3.3,147-232
6,Jul,72-79,0.6-2.5,160-245
7,Aug,72-78,0.9-2.8,149-221
8,Sep,65-71,0.3-2.7,122-165
9,Oct,59-64,1.1-4.4,97-121
