In [1]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from getpass import getpass
import numpy as np

In [2]:
# Loading the weather data from the databases
# Reading the data from our database
# Debugging with syntax from here:  https://stackoverflow.com/questions/23839656/sqlalchemy-no-password-supplied-error
password = getpass('Enter database password')
london_weather_df = pd.read_sql_table('london_weather_yyyy_mm', 
                                      f'postgresql://postgres:{password}@localhost/Final_Project_Travel')
nyc_weather_df = pd.read_sql_table('nyc_weather_yyyy_mm',
                                  f'postgresql://postgres:{password}@localhost/Final_Project_Travel')

Enter database password········


In [3]:
# Looking at the London weather
london_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,total_rainfall_inches,days_of_air_frost,total_sunshine_duration_hours
0,108,1957,1,47.66,36.86,1.555118,5,53.0
1,109,1957,2,48.20,37.22,2.748031,5,64.9
2,110,1957,3,57.02,42.26,1.000000,2,96.7
3,111,1957,4,57.56,41.36,0.224409,1,169.6
4,112,1957,5,61.16,43.70,0.838583,0,195.0
...,...,...,...,...,...,...,...,...
771,879,2021,4,55.58,37.22,0.283465,5,202.6
772,880,2021,5,61.70,44.96,3.330709,0,131.9
773,881,2021,6,72.50,55.94,3.472441,0,159.6
774,882,2021,7,75.56,58.82,2.409449,0,171.1


In [4]:
# Looking at the NYC weather
nyc_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,avg_total_precipitation_inches,avg_total_snowfall_inches
0,0,1948,1,31.2,19.6,4.74,15.3
1,1,1948,2,37.8,23.6,2.52,13.6
2,2,1948,3,50.6,33.5,3.51,4.8
3,3,1948,4,58.9,43.1,3.26,0.0
4,4,1948,5,67.6,52.8,7.58,0.0
...,...,...,...,...,...,...,...
879,879,2021,4,63.7,45.5,2.69,0.0
880,880,2021,5,71.7,54.0,4.36,0.0
881,881,2021,6,82.5,66.0,2.62,0.0
882,882,2021,7,83.0,69.0,11.09,0.0


In [5]:
# Dropping the index columns
london_weather_df = london_weather_df.drop(columns=['index'], axis=1)
nyc_weather_df = nyc_weather_df.drop(columns=['index'], axis=1)

In [6]:
# Checking for null values
print(london_weather_df.isnull().sum())
print(nyc_weather_df.isnull().sum())

year                             0
month_num                        0
avg_high_temp_f                  0
avg_low_temp_f                   0
total_rainfall_inches            0
days_of_air_frost                0
total_sunshine_duration_hours    0
dtype: int64
year                              0
month_num                         0
avg_high_temp_f                   0
avg_low_temp_f                    0
avg_total_precipitation_inches    0
avg_total_snowfall_inches         0
dtype: int64


In [7]:
# Checking our datatypes
print(london_weather_df.dtypes)
print(nyc_weather_df.dtypes)

year                               int64
month_num                          int64
avg_high_temp_f                  float64
avg_low_temp_f                   float64
total_rainfall_inches            float64
days_of_air_frost                  int64
total_sunshine_duration_hours    float64
dtype: object
year                                int64
month_num                           int64
avg_high_temp_f                   float64
avg_low_temp_f                    float64
avg_total_precipitation_inches    float64
avg_total_snowfall_inches         float64
dtype: object


# Linear regression model avg high temp vs. year

In [8]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
high_temp_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        y = city_month_weather_df.avg_high_temp_f

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('avg high temp (F)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
        else:
            plt.title('NYC ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        high_temp_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'high_temp',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })

# Creating a DataFrame from our results
high_temp_model_df = pd.DataFrame(high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
high_temp_model_df['2022_prediction_F'] = (2022 * high_temp_model_df['coef']) + high_temp_model_df['intercept']
high_temp_model_df['2023_prediction_F'] = (2023 * high_temp_model_df['coef']) + high_temp_model_df['intercept']

# # Formatting the columns
# high_temp_model_df['coef'] = high_temp_model_df['coef'].map('{:.2f}'.format)
# high_temp_model_df['intercept'] = high_temp_model_df['intercept'].map('{:.2f}'.format)
# high_temp_model_df['mae'] = high_temp_model_df['mae'].map('{:.2f}'.format)
# #high_temp_model_df['mse'] = high_temp_model_df['mse'].map('{:.2f}'.format)
# high_temp_model_df['rmse'] = high_temp_model_df['rmse'].map('{:.2f}'.format)
# high_temp_model_df['2022_prediction_F'] = high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
# high_temp_model_df['2023_prediction_F'] = high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
high_temp_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_F,2023_prediction_F
0,london,1,high_temp,0.077882,-108.617683,2.439848,10.614476,3.257987,48.859843,48.937725
1,london,2,high_temp,0.059482,-71.135702,3.38425,20.496457,4.527301,49.136665,49.196147
2,london,3,high_temp,0.08959,-126.25912,2.761359,13.564013,3.682935,54.892227,54.981818
3,london,4,high_temp,0.078705,-99.101521,1.895744,6.938179,2.634042,60.040322,60.119027
4,london,5,high_temp,0.06634,-68.075585,2.323695,7.752625,2.784354,66.063862,66.130201
5,london,6,high_temp,0.055237,-39.846396,2.658678,10.062551,3.172152,71.842317,71.897553
6,london,7,high_temp,0.095562,-116.048628,2.629484,10.343325,3.216104,77.177292,77.272854
7,london,8,high_temp,0.08088,-88.332918,2.927608,13.78047,3.712206,75.207197,75.288077
8,london,9,high_temp,0.028896,10.162671,1.810771,4.740027,2.17716,68.59023,68.619125
9,london,10,high_temp,0.033718,-7.112465,2.229176,7.210723,2.685279,61.065707,61.099426


<Figure size 432x288 with 0 Axes>

# Linear regression model total precip/rainfall vs. year

In [9]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        if city == 'london':
            y = city_month_weather_df.total_rainfall_inches
        else:
            y = city_month_weather_df.avg_total_precipitation_inches

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('total rainfall (inches)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
        else:
            plt.title('NYC ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        total_rainfall_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'rainfall',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })
        
# Creating a DataFrame from our results
total_rainfall_model_df = pd.DataFrame(total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# # Formatting the columns
# total_rainfall_model_df['coef'] = total_rainfall_model_df['coef'].map('{:.2f}'.format)
# total_rainfall_model_df['intercept'] = total_rainfall_model_df['intercept'].map('{:.2f}'.format)
# total_rainfall_model_df['mae'] = total_rainfall_model_df['mae'].map('{:.2f}'.format)
# #total_rainfall_model_df['mse'] = total_rainfall_model_df['mse'].map('{:.2f}'.format)
# total_rainfall_model_df['rmse'] = total_rainfall_model_df['rmse'].map('{:.2f}'.format)
# total_rainfall_model_df['2022_prediction_inches'] = total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
# total_rainfall_model_df['2023_prediction_inches'] = total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_rainfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,london,1,rainfall,0.011534,-20.856366,0.831153,1.126234,1.061242,2.464805,2.476339
1,london,2,rainfall,0.006339,-11.070961,0.821508,1.040862,1.020226,1.746833,1.753172
2,london,3,rainfall,-0.004419,10.531658,0.781575,0.923165,0.960815,1.596798,1.592379
3,london,4,rainfall,-0.006533,14.629194,0.89681,1.193769,1.092597,1.419928,1.413395
4,london,5,rainfall,-0.00433,10.482137,0.987959,1.180913,1.086698,1.726934,1.722604
5,london,6,rainfall,-0.001083,3.987372,1.139728,1.879246,1.370856,1.797374,1.796291
6,london,7,rainfall,-0.008209,18.229366,0.740048,0.699958,0.836635,1.63091,1.622701
7,london,8,rainfall,0.010175,-18.106238,0.897344,1.606723,1.267566,2.466913,2.477087
8,london,9,rainfall,-0.01214,26.115232,0.902864,1.278726,1.130808,1.568707,1.556567
9,london,10,rainfall,0.003239,-3.902302,1.174408,2.17305,1.474127,2.646314,2.649553


<Figure size 432x288 with 0 Axes>

# Linear regression model NYC snowfall vs. Year

In [10]:
# Looping through the months for NYC, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_snowfall_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for NYC for each month
    city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.avg_total_snowfall_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total snowfall (inches)')
    plt.legend()
    plt.title('NYC ' + str(i) + ' total snowfall vs. model')
    plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_snowfall_model_outputs.append({
        'city': 'NYC',
        'month_num': i,
        'weather_factor': 'snowfall',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_snowfall_model_df = pd.DataFrame(total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# # Formatting the columns
# total_snowfall_model_df['coef'] = total_snowfall_model_df['coef'].map('{:.2f}'.format)
# total_snowfall_model_df['intercept'] = total_snowfall_model_df['intercept'].map('{:.2f}'.format)
# total_snowfall_model_df['mae'] = total_snowfall_model_df['mae'].map('{:.2f}'.format)
# #total_snowfall_model_df['mse'] = total_snowfall_model_df['mse'].map('{:.2f}'.format)
# total_snowfall_model_df['rmse'] = total_snowfall_model_df['rmse'].map('{:.2f}'.format)
# total_snowfall_model_df['2022_prediction_inches'] = total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
# total_snowfall_model_df['2023_prediction_inches'] = total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_snowfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,NYC,1,snowfall,0.009335,-12.192067,7.695613,102.934137,10.145646,6.684203,6.693539
1,NYC,2,snowfall,0.099996,-189.464914,7.583716,72.952799,8.541241,12.727133,12.827129
2,NYC,3,snowfall,-0.01443,32.605908,5.183339,46.225698,6.798948,3.427959,3.413529
3,NYC,4,snowfall,-0.009632,19.478666,0.899917,2.691582,1.640604,0.00286,-0.006772
4,NYC,5,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NYC,6,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,NYC,7,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NYC,8,snowfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,NYC,9,snowfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,NYC,10,snowfall,0.002381,-4.659171,0.070372,0.006307,0.079418,0.15498,0.157361


<Figure size 432x288 with 0 Axes>

# Linear regression model London sunshine hours vs. Year

In [11]:
# Looping through the months for London, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_sunshine_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for London for each month
    city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.total_sunshine_duration_hours

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total sunshine (hours)')
    plt.legend()
    plt.title('London ' + str(i) + ' total sunshine vs. model')
    plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_sunshine_model_outputs.append({
        'city': 'London',
        'month_num': i,
        'weather_factor': 'sunshine',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_sunshine_model_df = pd.DataFrame(total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
total_sunshine_model_df['2022_prediction_hours'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
total_sunshine_model_df['2023_prediction_hours'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# # Formatting the columns
# total_sunshine_model_df['coef'] = total_sunshine_model_df['coef'].map('{:.2f}'.format)
# total_sunshine_model_df['intercept'] = total_sunshine_model_df['intercept'].map('{:.2f}'.format)
# total_sunshine_model_df['mae'] = total_sunshine_model_df['mae'].map('{:.2f}'.format)
# #total_sunshine_model_df['mse'] = total_sunshine_model_df['mse'].map('{:.2f}'.format)
# total_sunshine_model_df['rmse'] = total_sunshine_model_df['rmse'].map('{:.2f}'.format)
# total_sunshine_model_df['2022_prediction_hours'] = total_sunshine_model_df['2022_prediction_hours'].map('{:.2f}'.format)
# total_sunshine_model_df['2023_prediction_hours'] = total_sunshine_model_df['2023_prediction_hours'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_sunshine_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_hours,2023_prediction_hours
0,London,1,sunshine,0.120274,-182.463104,13.86391,300.151419,17.324879,60.730067,60.85034
1,London,2,sunshine,0.429017,-781.51221,20.229841,582.872008,24.142742,85.959585,86.388601
2,London,3,sunshine,0.235691,-353.82653,24.396756,984.474283,31.376333,122.740339,122.976029
3,London,4,sunshine,0.726119,-1288.537194,29.665234,1192.214361,34.528457,179.675959,180.402079
4,London,5,sunshine,-0.192385,576.437234,33.16449,2010.393592,44.837413,187.434991,187.242606
5,London,6,sunshine,-0.511877,1219.196198,34.210826,1444.028519,38.000375,184.181536,183.669659
6,London,7,sunshine,0.187017,-168.955672,29.828688,1168.631578,34.185254,209.191693,209.37871
7,London,8,sunshine,-0.056244,298.725857,31.266808,1730.371328,41.597732,185.001116,184.944872
8,London,9,sunshine,0.266564,-377.208205,28.177941,963.61121,31.042088,161.784591,162.051155
9,London,10,sunshine,0.00123,105.034957,19.510241,646.581267,25.427962,107.522392,107.523622


<Figure size 432x288 with 0 Axes>

In [12]:
# One high_temp model for each city with all months

# Creating empty list to hold the outputs of our model
combined_high_temp_model_outputs = []

# Looping through each city
for city in cities:
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df
        
    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    y = city_month_weather_df.avg_high_temp_f

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('avg high temp (F)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
#     else:
#         plt.title('NYC ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_high_temp_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'high_temp',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
combined_high_temp_model_df = pd.DataFrame(combined_high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
#combined_high_temp_model_df['2022_prediction_F'] = (2022 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']
#combined_high_temp_model_df['2023_prediction_F'] = (2023 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']

# Formatting the columns
combined_high_temp_model_df['coef_year'] = combined_high_temp_model_df['coef_year'].map('{:.2f}'.format)
combined_high_temp_model_df['coef_month_num'] = combined_high_temp_model_df['coef_month_num'].map('{:.2f}'.format)                                                                                                        
combined_high_temp_model_df['intercept'] = combined_high_temp_model_df['intercept'].map('{:.2f}'.format)
combined_high_temp_model_df['mae'] = combined_high_temp_model_df['mae'].map('{:.2f}'.format)
combined_high_temp_model_df['mse'] = combined_high_temp_model_df['mse'].map('{:.2f}'.format)
combined_high_temp_model_df['rmse'] = combined_high_temp_model_df['rmse'].map('{:.2f}'.format)
#combined_high_temp_model_df['2022_prediction_F'] = combined_high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
#combined_high_temp_model_df['2023_prediction_F'] = combined_high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_high_temp_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,high_temp,0.07,0.61,-78.53,9.07,107.43,10.36
1,nyc,high_temp,-0.01,1.19,67.98,14.05,253.32,15.92


In [13]:
# One rainfall model for each city for all months
# Looks like this model performs slightly better or about the same overall vs. the model above where rainfall data is broken out by months

# Creating empty list to hold the outputs of our model
combined_total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Splitting out the weather for that city
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    if city == 'london':
        y = city_month_weather_df.total_rainfall_inches
    else:
        y = city_month_weather_df.avg_total_precipitation_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('total rainfall (inches)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
#     else:
#         plt.title('NYC ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_total_rainfall_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'rainfall',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })
        
# Creating a DataFrame from our results
combined_total_rainfall_model_df = pd.DataFrame(combined_total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
#total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# Formatting the columns
combined_total_rainfall_model_df['coef_year'] = combined_total_rainfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_rainfall_model_df['coef_month_num'] = combined_total_rainfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_rainfall_model_df['intercept'] = combined_total_rainfall_model_df['intercept'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['mae'] = combined_total_rainfall_model_df['mae'].map('{:.2f}'.format)
combined_total_rainfall_model_df['mse'] = combined_total_rainfall_model_df['mse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['rmse'] = combined_total_rainfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2022_prediction_inches'] = combined_total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2023_prediction_inches'] = combined_total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_rainfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,rainfall,-0.0,0.06,5.5,0.966488,1.52,1.234033
1,nyc,rainfall,0.02,0.05,-28.41,1.666281,4.46,2.112967


In [14]:
# One model for all months snowfall NYC
# Model above with one for each month performs better

# Creating empty list to hold the outputs of our model
combined_total_snowfall_model_outputs = []

# Splitting out the weather for NYC for each month
city_month_weather_df = nyc_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.avg_total_snowfall_inches

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total snowfall (inches)')
# plt.legend()
# plt.title('NYC ' + str(i) + ' total snowfall vs. model')
# plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_snowfall_model_outputs.append({
    'city': 'NYC',
    #'month_num': i,
    'weather_factor': 'snowfall',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_snowfall_model_df = pd.DataFrame(combined_total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
#total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# Formatting the columns
combined_total_snowfall_model_df['coef_year'] = combined_total_snowfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_snowfall_model_df['coef_month_num'] = combined_total_snowfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_snowfall_model_df['intercept'] = combined_total_snowfall_model_df['intercept'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mae'] = combined_total_snowfall_model_df['mae'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mse'] = combined_total_snowfall_model_df['mse'].map('{:.2f}'.format)
combined_total_snowfall_model_df['rmse'] = combined_total_snowfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2022_prediction_inches'] = combined_total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2023_prediction_inches'] = combined_total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_snowfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,NYC,snowfall,0.01,-0.52,-8.27,2.96,20.42,4.52


In [15]:
# One model for London sunshine data for all months
# Model above broken out by months performs better than combined model

# Creating empty list to hold the outputs of our model
combined_total_sunshine_model_outputs = []

city_month_weather_df = london_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.total_sunshine_duration_hours

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total sunshine (hours)')
# plt.legend()
# plt.title('London ' + str(i) + ' total sunshine vs. model')
# plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_sunshine_model_outputs.append({
    'city': 'London',
    #'month_num': i,
    'weather_factor': 'sunshine',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_sunshine_model_df = pd.DataFrame(combined_total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_sunshine_model_df['2022_prediction_inches'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
#total_sunshine_model_df['2023_prediction_inches'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# Formatting the columns
combined_total_sunshine_model_df['coef_year'] = combined_total_sunshine_model_df['coef_year'].map('{:.2f}'.format)
combined_total_sunshine_model_df['coef_month_num'] = combined_total_sunshine_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_sunshine_model_df['intercept'] = combined_total_sunshine_model_df['intercept'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mae'] = combined_total_sunshine_model_df['mae'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mse'] = combined_total_sunshine_model_df['mse'].map('{:.2f}'.format)
combined_total_sunshine_model_df['rmse'] = combined_total_sunshine_model_df['rmse'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2022_prediction_inches'] = combined_total_sunshine_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2023_prediction_inches'] = combined_total_sunshine_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_sunshine_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,London,sunshine,0.16,-0.05,-193.32,51.76,3883.34,62.32


In [16]:
# Calculating mean MSE from sunshine model above
total_sunshine_model_df.groupby(['city']).mean()['mse']

city
London    955.605627
Name: mse, dtype: float64

In [17]:
# Calculating mean MSE from high temp model above
high_temp_model_df.groupby(['city']).mean()['mse']

city
london    10.181108
nyc        9.909646
Name: mse, dtype: float64

In [18]:
# Calculating mean MSE from total precip model above
total_rainfall_model_df.groupby(['city']).mean()['mse']

city
london    1.282176
nyc       5.062090
Name: mse, dtype: float64

In [19]:
# Calculating mean MSE from total snowfall model
total_snowfall_model_df.groupby(['city']).mean()['mse']

city
NYC    20.584886
Name: mse, dtype: float64

In [20]:
# Creating a DataFrame to hold the predicted ranges
predictions_df = high_temp_model_df[['city', 'month_num', '2022_prediction_F', '2023_prediction_F']]
predictions_df

Unnamed: 0,city,month_num,2022_prediction_F,2023_prediction_F
0,london,1,48.859843,48.937725
1,london,2,49.136665,49.196147
2,london,3,54.892227,54.981818
3,london,4,60.040322,60.119027
4,london,5,66.063862,66.130201
5,london,6,71.842317,71.897553
6,london,7,77.177292,77.272854
7,london,8,75.207197,75.288077
8,london,9,68.59023,68.619125
9,london,10,61.065707,61.099426


In [21]:
high_temp_model_df.columns

Index(['city', 'month_num', 'weather_factor', 'coef', 'intercept', 'mae',
       'mse', 'rmse', '2022_prediction_F', '2023_prediction_F'],
      dtype='object')

In [22]:
# High Temp Predictions
# Adding columns to hold the prediction ranges
predictions_df['min_pred_2022_F'] = (predictions_df['2022_prediction_F'] - high_temp_model_df['mae'])
predictions_df['max_pred_2022_F'] = (predictions_df['2022_prediction_F'] + high_temp_model_df['mae'])
predictions_df['min_pred_2023_F'] = (predictions_df['2023_prediction_F'] - high_temp_model_df['mae'])
predictions_df['max_pred_2023_F'] = (predictions_df['2023_prediction_F'] + high_temp_model_df['mae'])

# Rounding the min predictions down and the max predictions up
predictions_df['min_pred_2022_F'] = np.floor(predictions_df['min_pred_2022_F'])
predictions_df['min_pred_2023_F'] = np.floor(predictions_df['min_pred_2023_F'])
predictions_df['max_pred_2023_F'] = np.ceil(predictions_df['max_pred_2023_F'])
predictions_df['max_pred_2022_F'] = np.ceil(predictions_df['max_pred_2022_F'])

# Formatting the columns
predictions_df['min_pred_2022_F'] = predictions_df['min_pred_2022_F'].map('{:.0f}'.format)
predictions_df['min_pred_2023_F'] = predictions_df['min_pred_2023_F'].map('{:.0f}'.format)
predictions_df['max_pred_2022_F'] = predictions_df['max_pred_2022_F'].map('{:.0f}'.format)
predictions_df['max_pred_2023_F'] = predictions_df['max_pred_2023_F'].map('{:.0f}'.format)

# Concatenating into prediction columns
predictions_df['Predicted avg high temp 2022 (F)'] = (predictions_df['min_pred_2022_F'] + '-' 
                                                      + predictions_df['max_pred_2022_F'])
predictions_df['Predicted avg high temp 2023 (F)'] = (predictions_df['min_pred_2023_F'] + '-' 
                                                      + predictions_df['max_pred_2023_F'])

# Dropping the unneeded columns
predictions_df = predictions_df.drop(columns=['2022_prediction_F', '2023_prediction_F', 'min_pred_2022_F', 'max_pred_2022_F',
                            'min_pred_2023_F', 'max_pred_2023_F'], axis=1)

predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F)
0,london,1,46-52,46-52
1,london,2,45-53,45-53
2,london,3,52-58,52-58
3,london,4,58-62,58-63
4,london,5,63-69,63-69
5,london,6,69-75,69-75
6,london,7,74-80,74-80
7,london,8,72-79,72-79
8,london,9,66-71,66-71
9,london,10,58-64,58-64


In [23]:
# Total precip predictions

# Adding columns to hold the prediction ranges
predictions_df['min_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] - total_rainfall_model_df['mae'])
predictions_df['max_precip_2022'] = (total_rainfall_model_df['2022_prediction_inches'] + total_rainfall_model_df['mae'])
predictions_df['min_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] - total_rainfall_model_df['mae'])
predictions_df['max_precip_2023'] = (total_rainfall_model_df['2023_prediction_inches'] + total_rainfall_model_df['mae'])

# # Rounding the predictions
predictions_df['min_precip_2022'] = predictions_df['min_precip_2022'].round(1)
predictions_df['max_precip_2022'] = predictions_df['max_precip_2022'].round(1)
predictions_df['min_precip_2023'] = predictions_df['min_precip_2023'].round(1)
predictions_df['max_precip_2023'] = predictions_df['max_precip_2023'].round(1)

# # Concatenating into prediction columns
# Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
predictions_df['Predicted total rainfall 2022 (inches)'] = (predictions_df['min_precip_2022'].apply(str) + '-' 
                                                            + predictions_df['max_precip_2022'].apply(str))
predictions_df['Predicted total rainfall 2023 (inches)'] = (predictions_df['min_precip_2023'].apply(str) + '-' 
                                                            + predictions_df['max_precip_2023'].apply(str))

# # Dropping the unneeded columns
predictions_df = predictions_df.drop(columns=['min_precip_2022', 'max_precip_2022', 
                                              'min_precip_2023', 'max_precip_2023'], axis=1)

predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches)
0,london,1,46-52,46-52,1.6-3.3,1.6-3.3
1,london,2,45-53,45-53,0.9-2.6,0.9-2.6
2,london,3,52-58,52-58,0.8-2.4,0.8-2.4
3,london,4,58-62,58-63,0.5-2.3,0.5-2.3
4,london,5,63-69,63-69,0.7-2.7,0.7-2.7
5,london,6,69-75,69-75,0.7-2.9,0.7-2.9
6,london,7,74-80,74-80,0.9-2.4,0.9-2.4
7,london,8,72-79,72-79,1.6-3.4,1.6-3.4
8,london,9,66-71,66-71,0.7-2.5,0.7-2.5
9,london,10,58-64,58-64,1.5-3.8,1.5-3.8


In [24]:
# Separating prediction DF by city
london_predictions_df = predictions_df.loc[(predictions_df['city'] == 'london')]
nyc_predictions_df = predictions_df.loc[(predictions_df['city'] == 'nyc')]

In [25]:
# Debugging error creating nyc snowfall predictions with workaround
min_snow_2022 = total_snowfall_model_df['2022_prediction_inches'] - total_snowfall_model_df['mae']
max_snow_2022 = total_snowfall_model_df['2022_prediction_inches'] + total_snowfall_model_df['mae']
min_snow_2023 = total_snowfall_model_df['2023_prediction_inches'] - total_snowfall_model_df['mae']
max_snow_2023 = total_snowfall_model_df['2023_prediction_inches'] + total_snowfall_model_df['mae']

In [26]:
# Replacing negative values in min snow series with 0
min_snow_2022

0    -1.011410
1     5.143416
2    -1.755379
3    -0.897056
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     0.000000
9     0.084607
10   -0.571349
11    0.313945
dtype: float64

In [27]:
# Total snowfall

# Adding columns to hold the prediction ranges
nyc_predictions_df['min_snow_2022'] = min_snow_2022.tolist()
nyc_predictions_df['max_snow_2022'] = max_snow_2022.tolist()
nyc_predictions_df['min_snow_2023'] = min_snow_2023.tolist()
nyc_predictions_df['max_snow_2023'] = max_snow_2023.tolist()

# Replacing negative numbers in min snow columns with 0
# Adapting code to replace negative numbers with 0, found here:  https://stackoverflow.com/questions/49681363/replace-negative-values-in-single-dataframe-column
nyc_predictions_df['min_snow_2022'][nyc_predictions_df['min_snow_2022'] < 0] = 0
nyc_predictions_df['min_snow_2023'][nyc_predictions_df['min_snow_2023'] < 0] = 0

# Rounding the predictions to the nearest tenth
nyc_predictions_df['min_snow_2022'] = nyc_predictions_df['min_snow_2022'].round(1) 
nyc_predictions_df['max_snow_2022'] = nyc_predictions_df['max_snow_2022'].round(1) 
nyc_predictions_df['min_snow_2023'] = nyc_predictions_df['min_snow_2023'].round(1) 
nyc_predictions_df['max_snow_2023'] = nyc_predictions_df['max_snow_2023'].round(1) 

# Concatenating into prediction columns
# Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
nyc_predictions_df['Predicted total snowfall 2022 (inches)'] = (nyc_predictions_df['min_snow_2022'].apply(str) + '-'
                                                               + nyc_predictions_df['max_snow_2022'].apply(str))
nyc_predictions_df['Predicted total snowfall 2023(inches)'] = (nyc_predictions_df['min_snow_2023'].apply(str) + '-'
                                                               + nyc_predictions_df['max_snow_2023'].apply(str))

# Dropping the unneeded columns
nyc_predictions_df = nyc_predictions_df.drop(columns=['min_snow_2022', 'max_snow_2022', 'min_snow_2023', 'max_snow_2023'])

# Displaying the updted DataFrame
nyc_predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches),Predicted total snowfall 2022 (inches),Predicted total snowfall 2023(inches)
12,nyc,1,36-43,36-43,2.5-4.8,2.5-4.8,0.0-14.4,0.0-14.4
13,nyc,2,39-46,39-46,1.8-4.5,1.8-4.5,5.1-20.3,5.2-20.4
14,nyc,3,48-55,48-55,3.5-5.8,3.5-5.8,0.0-8.6,0.0-8.6
15,nyc,4,60-65,60-65,2.8-5.8,2.9-5.8,0.0-0.9,0.0-0.9
16,nyc,5,69-75,69-75,2.9-6.7,2.9-6.7,0.0-0.0,0.0-0.0
17,nyc,6,77-83,77-83,2.8-6.5,2.9-6.5,0.0-0.0,0.0-0.0
18,nyc,7,82-89,82-89,3.0-7.6,3.0-7.7,0.0-0.0,0.0-0.0
19,nyc,8,82-86,82-86,2.9-7.1,2.9-7.1,0.0-0.0,0.0-0.0
20,nyc,9,74-79,74-79,2.9-6.6,2.9-6.6,0.0-0.0,0.0-0.0
21,nyc,10,61-67,61-67,3.7-6.2,3.7-6.3,0.1-0.2,0.1-0.2


In [28]:
# Creating the month_dict
month_dict = {1: 'Jan',
             2: 'Feb',
             3: 'Mar',
             4: 'Apr',
             5: 'May',
             6: 'Jun',
             7: 'Jul',
             8: 'Aug',
             9: 'Sep',
             10: 'Oct',
             11: 'Nov',
             12: 'Dec'}

In [29]:
# Dropping the 2023 columns from the NYC DataFrame since the values are identical for nearly all values
nyc_predictions_df = nyc_predictions_df.drop(columns=['Predicted avg high temp 2023 (F)', 
                                                      'Predicted total rainfall 2023 (inches)', 
                                                      'Predicted total snowfall 2023(inches)'], axis=1)

# Adding the month name to the DataFrame
nyc_predictions_df['Month'] = nyc_predictions_df['month_num'].apply(lambda x:month_dict[x])

# Dropping the month num column
nyc_predictions_df = nyc_predictions_df.drop(columns=['month_num'], axis=1)

# Rearranging the columns
nyc_predictions_df = nyc_predictions_df[['Month', 'Predicted avg high temp 2022 (F)',
                                        'Predicted total rainfall 2022 (inches)', 
                                        'Predicted total snowfall 2022 (inches)']]

# Displaying the updated DataFrame
nyc_predictions_df

Unnamed: 0,Month,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total snowfall 2022 (inches)
12,Jan,36-43,2.5-4.8,0.0-14.4
13,Feb,39-46,1.8-4.5,5.1-20.3
14,Mar,48-55,3.5-5.8,0.0-8.6
15,Apr,60-65,2.8-5.8,0.0-0.9
16,May,69-75,2.9-6.7,0.0-0.0
17,Jun,77-83,2.8-6.5,0.0-0.0
18,Jul,82-89,3.0-7.6,0.0-0.0
19,Aug,82-86,2.9-7.1,0.0-0.0
20,Sep,74-79,2.9-6.6,0.0-0.0
21,Oct,61-67,3.7-6.2,0.1-0.2


In [30]:
# Total sunshine

# Adding the prediction ranges
london_predictions_df['min_sun_2022'] = (total_sunshine_model_df['2022_prediction_hours'] - total_sunshine_model_df['mae'])
london_predictions_df['max_sun_2022'] = (total_sunshine_model_df['2022_prediction_hours'] + total_sunshine_model_df['mae'])
london_predictions_df['min_sun_2023'] = (total_sunshine_model_df['2023_prediction_hours'] - total_sunshine_model_df['mae'])
london_predictions_df['max_sun_2023'] = (total_sunshine_model_df['2023_prediction_hours'] + total_sunshine_model_df['mae'])

# Rounding the min predictions down and the max predictions up
london_predictions_df['min_sun_2022'] = np.floor(london_predictions_df['min_sun_2022'])
london_predictions_df['max_sun_2022'] = np.ceil(london_predictions_df['max_sun_2022'])
london_predictions_df['min_sun_2023'] = np.floor(london_predictions_df['min_sun_2023'])
london_predictions_df['max_sun_2023'] = np.ceil(london_predictions_df['max_sun_2023'])

# Formatting the columns
london_predictions_df['min_sun_2022'] = london_predictions_df['min_sun_2022'].map('{:.0f}'.format)
london_predictions_df['max_sun_2022'] = london_predictions_df['max_sun_2022'].map('{:.0f}'.format)
london_predictions_df['min_sun_2023'] = london_predictions_df['min_sun_2023'].map('{:.0f}'.format)
london_predictions_df['max_sun_2023'] = london_predictions_df['max_sun_2023'].map('{:.0f}'.format)

# Concatenating the predictions
london_predictions_df['Predicted total sunshine 2022 (hours)'] = (london_predictions_df['min_sun_2022'].apply(str) + '-' 
                                                            + london_predictions_df['max_sun_2022'].apply(str))
london_predictions_df['Predicted total sunshine 2023 (hours)'] = (london_predictions_df['min_sun_2023'].apply(str) + '-' 
                                                            + london_predictions_df['max_sun_2023'].apply(str))

# Dropping the unneeded columns
london_predictions_df = london_predictions_df.drop(columns=['min_sun_2022', 'max_sun_2022', 'min_sun_2023', 'max_sun_2023'])

london_predictions_df

# # # Rounding the min predictions down and the max predictions up
# predictions_df['min_precip_2022'] = predictions_df['min_precip_2022'].round(1)
# predictions_df['max_precip_2022'] = predictions_df['max_precip_2022'].round(1)
# predictions_df['min_precip_2023'] = predictions_df['min_precip_2023'].round(1)
# predictions_df['max_precip_2023'] = predictions_df['max_precip_2023'].round(1)

# # # Concatenating into prediction columns
# # Debugging by adapting code found here:  https://stackoverflow.com/questions/44527956/python-ufunc-add-did-not-contain-a-loop-with-signature-matching-types-dtype
# predictions_df['Predicted total rainfall 2022 (inches)'] = (predictions_df['min_precip_2022'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2022'].apply(str))
# predictions_df['Predicted total rainfall 2023 (inches)'] = (predictions_df['min_precip_2023'].apply(str) + '-' 
#                                                             + predictions_df['max_precip_2023'].apply(str))

# # # Dropping the unneeded columns
# predictions_df = predictions_df.drop(columns=['min_precip_2022', 'max_precip_2022', 
#                                               'min_precip_2023', 'max_precip_2023'], axis=1)

# predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted avg high temp 2023 (F),Predicted total rainfall 2022 (inches),Predicted total rainfall 2023 (inches),Predicted total sunshine 2022 (hours),Predicted total sunshine 2023 (hours)
0,london,1,46-52,46-52,1.6-3.3,1.6-3.3,46-75,46-75
1,london,2,45-53,45-53,0.9-2.6,0.9-2.6,65-107,66-107
2,london,3,52-58,52-58,0.8-2.4,0.8-2.4,98-148,98-148
3,london,4,58-62,58-63,0.5-2.3,0.5-2.3,150-210,150-211
4,london,5,63-69,63-69,0.7-2.7,0.7-2.7,154-221,154-221
5,london,6,69-75,69-75,0.7-2.9,0.7-2.9,149-219,149-218
6,london,7,74-80,74-80,0.9-2.4,0.9-2.4,179-240,179-240
7,london,8,72-79,72-79,1.6-3.4,1.6-3.4,153-217,153-217
8,london,9,66-71,66-71,0.7-2.5,0.7-2.5,133-190,133-191
9,london,10,58-64,58-64,1.5-3.8,1.5-3.8,88-128,88-128


In [31]:
# Dropping the London 2023 columns since they're nearly identical to the 2022 columns
london_predictions_df = london_predictions_df.drop(columns=['Predicted avg high temp 2023 (F)', 
                                                            'Predicted total rainfall 2023 (inches)',
                                                            'Predicted total sunshine 2023 (hours)'])

london_predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,london,1,46-52,1.6-3.3,46-75
1,london,2,45-53,0.9-2.6,65-107
2,london,3,52-58,0.8-2.4,98-148
3,london,4,58-62,0.5-2.3,150-210
4,london,5,63-69,0.7-2.7,154-221
5,london,6,69-75,0.7-2.9,149-219
6,london,7,74-80,0.9-2.4,179-240
7,london,8,72-79,1.6-3.4,153-217
8,london,9,66-71,0.7-2.5,133-190
9,london,10,58-64,1.5-3.8,88-128


In [32]:
# Renaming the columns
london_predictions_df.rename(columns={'Predicted avg high temp 2022 (F)': 'Predicted avg high temp 2022-23 (F)',
                                     'Predicted total rainfall 2022 (inches)': 'Predicted total rainfall 2022-23 (inches)',
                                     'Predicted total sunshine 2022 (hours)': 'Predicted total rainfall 2022-23 (hours)'})

london_predictions_df

Unnamed: 0,city,month_num,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,london,1,46-52,1.6-3.3,46-75
1,london,2,45-53,0.9-2.6,65-107
2,london,3,52-58,0.8-2.4,98-148
3,london,4,58-62,0.5-2.3,150-210
4,london,5,63-69,0.7-2.7,154-221
5,london,6,69-75,0.7-2.9,149-219
6,london,7,74-80,0.9-2.4,179-240
7,london,8,72-79,1.6-3.4,153-217
8,london,9,66-71,0.7-2.5,133-190
9,london,10,58-64,1.5-3.8,88-128


In [33]:
# Adding month name to the DataFrame
london_predictions_df['Month'] = london_predictions_df['month_num'].apply(lambda x:month_dict[x])

# Dropping the unneeded columns
london_predictions_df.drop(columns=['city', 'month_num'], axis=1)

# Rearranging the columns
london_predictions_df = london_predictions_df[['Month', 'Predicted avg high temp 2022 (F)', 
                                             'Predicted total rainfall 2022 (inches)',
                                             'Predicted total sunshine 2022 (hours)']]

# Displaying the updated DataFrame
london_predictions_df

Unnamed: 0,Month,Predicted avg high temp 2022 (F),Predicted total rainfall 2022 (inches),Predicted total sunshine 2022 (hours)
0,Jan,46-52,1.6-3.3,46-75
1,Feb,45-53,0.9-2.6,65-107
2,Mar,52-58,0.8-2.4,98-148
3,Apr,58-62,0.5-2.3,150-210
4,May,63-69,0.7-2.7,154-221
5,Jun,69-75,0.7-2.9,149-219
6,Jul,74-80,0.9-2.4,179-240
7,Aug,72-79,1.6-3.4,153-217
8,Sep,66-71,0.7-2.5,133-190
9,Oct,58-64,1.5-3.8,88-128


In [34]:
# Saving the Predictions DataFrames as CSVs
nyc_predictions_df.to_csv('Output/nyc_predictions.csv', index=False)
london_predictions_df.to_csv('Output/london_predictions.csv', index=False)