In [1]:
# Importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from getpass import getpass
import numpy as np

In [2]:
# Loading the weather data from the databases
# Reading the data from our database
# Debugging with syntax from here:  https://stackoverflow.com/questions/23839656/sqlalchemy-no-password-supplied-error
password = getpass('Enter database password')
london_weather_df = pd.read_sql_table('london_weather_yyyy_mm', 
                                      f'postgresql://postgres:{password}@localhost/Final_Project_Travel')
nyc_weather_df = pd.read_sql_table('nyc_weather_yyyy_mm',
                                  f'postgresql://postgres:{password}@localhost/Final_Project_Travel')

Enter database password········


In [3]:
# Looking at the London weather
london_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,total_rainfall_inches,days_of_air_frost,total_sunshine_duration_hours
0,108,1957,1,47.66,36.86,1.555118,5,53.0
1,109,1957,2,48.20,37.22,2.748031,5,64.9
2,110,1957,3,57.02,42.26,1.000000,2,96.7
3,111,1957,4,57.56,41.36,0.224409,1,169.6
4,112,1957,5,61.16,43.70,0.838583,0,195.0
...,...,...,...,...,...,...,...,...
771,879,2021,4,55.58,37.22,0.283465,5,202.6
772,880,2021,5,61.70,44.96,3.330709,0,131.9
773,881,2021,6,72.50,55.94,3.472441,0,159.6
774,882,2021,7,75.56,58.82,2.409449,0,171.1


In [4]:
# Looking at the NYC weather
nyc_weather_df

Unnamed: 0,index,year,month_num,avg_high_temp_f,avg_low_temp_f,avg_total_precipitation_inches,avg_total_snowfall_inches
0,0,1948,1,31.2,19.6,4.74,15.3
1,1,1948,2,37.8,23.6,2.52,13.6
2,2,1948,3,50.6,33.5,3.51,4.8
3,3,1948,4,58.9,43.1,3.26,0.0
4,4,1948,5,67.6,52.8,7.58,0.0
...,...,...,...,...,...,...,...
879,879,2021,4,63.7,45.5,2.69,0.0
880,880,2021,5,71.7,54.0,4.36,0.0
881,881,2021,6,82.5,66.0,2.62,0.0
882,882,2021,7,83.0,69.0,11.09,0.0


In [5]:
# Dropping the index columns
london_weather_df = london_weather_df.drop(columns=['index'], axis=1)
nyc_weather_df = nyc_weather_df.drop(columns=['index'], axis=1)

In [6]:
# Checking for null values
print(london_weather_df.isnull().sum())
print(nyc_weather_df.isnull().sum())

year                             0
month_num                        0
avg_high_temp_f                  0
avg_low_temp_f                   0
total_rainfall_inches            0
days_of_air_frost                0
total_sunshine_duration_hours    0
dtype: int64
year                              0
month_num                         0
avg_high_temp_f                   0
avg_low_temp_f                    0
avg_total_precipitation_inches    0
avg_total_snowfall_inches         0
dtype: int64


In [7]:
# Checking our datatypes
print(london_weather_df.dtypes)
print(nyc_weather_df.dtypes)

year                               int64
month_num                          int64
avg_high_temp_f                  float64
avg_low_temp_f                   float64
total_rainfall_inches            float64
days_of_air_frost                  int64
total_sunshine_duration_hours    float64
dtype: object
year                                int64
month_num                           int64
avg_high_temp_f                   float64
avg_low_temp_f                    float64
avg_total_precipitation_inches    float64
avg_total_snowfall_inches         float64
dtype: object


# Linear regression model avg high temp vs. year

In [8]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
high_temp_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        y = city_month_weather_df.avg_high_temp_f

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('avg high temp (F)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
        else:
            plt.title('NYC ' + str(i) + ' avg high temps vs. model')
            plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        high_temp_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'high_temp',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })

# Creating a DataFrame from our results
high_temp_model_df = pd.DataFrame(high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
high_temp_model_df['2022_prediction_F'] = (2022 * high_temp_model_df['coef']) + high_temp_model_df['intercept']
high_temp_model_df['2023_prediction_F'] = (2023 * high_temp_model_df['coef']) + high_temp_model_df['intercept']

# Formatting the columns
high_temp_model_df['coef'] = high_temp_model_df['coef'].map('{:.2f}'.format)
high_temp_model_df['intercept'] = high_temp_model_df['intercept'].map('{:.2f}'.format)
high_temp_model_df['mae'] = high_temp_model_df['mae'].map('{:.2f}'.format)
high_temp_model_df['mse'] = high_temp_model_df['mse'].map('{:.2f}'.format)
high_temp_model_df['rmse'] = high_temp_model_df['rmse'].map('{:.2f}'.format)
high_temp_model_df['2022_prediction_F'] = high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
high_temp_model_df['2023_prediction_F'] = high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
high_temp_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_F,2023_prediction_F
0,london,1,high_temp,0.05,-45.45,2.95,17.36,4.17,47.89,47.93
1,london,2,high_temp,0.04,-37.41,3.33,21.09,4.59,48.73,48.77
2,london,3,high_temp,0.05,-48.99,1.93,6.47,2.54,53.6,53.65
3,london,4,high_temp,0.06,-63.63,2.46,11.04,3.32,59.18,59.24
4,london,5,high_temp,0.06,-54.57,1.81,6.94,2.63,65.55,65.61
5,london,6,high_temp,0.04,-18.11,2.15,6.59,2.57,71.57,71.62
6,london,7,high_temp,0.09,-96.21,2.87,12.66,3.56,76.45,76.53
7,london,8,high_temp,0.07,-73.55,2.48,10.33,3.21,75.04,75.12
8,london,9,high_temp,0.04,-15.73,1.96,6.0,2.45,69.02,69.06
9,london,10,high_temp,0.02,10.9,1.94,5.65,2.38,60.95,60.97


<Figure size 432x288 with 0 Axes>

# Linear regression model total precip/rainfall vs. year

In [9]:
# Looping through the months for both cities, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating cities list
cities = ['london', 'nyc']

# Creating empty list to hold the outputs of our model
total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Iterating through the months
    for i in range(1,13):
        # Splitting out the weather for that city and that month
        if city == 'london':
            city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]
        else:
            city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

        # Preparing the data for Scikit-learn library
        X = city_month_weather_df.year.values.reshape(-1,1)

        # Assigning the target variable
        if city == 'london':
            y = city_month_weather_df.total_rainfall_inches
        else:
            y = city_month_weather_df.avg_total_precipitation_inches

        # Creating the model from the class
        model = LinearRegression()
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Training the model
        model.fit(X_train, y_train)

        # Generating the predictions
        y_pred = model.predict(X_test)
        
        # Evaluating the performance
        mae = metrics.mean_absolute_error(y_test, y_pred)
        mse = metrics.mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

        # Plotting and saving the predictions vs. the datapoints
        plt.scatter(X_train, y_train, color='blue', label='train')
        plt.scatter(X_test, y_test, color='green', label='test')
        plt.plot(X_test, y_pred, color='red', label='predict')
        plt.xlabel('year')
        plt.ylabel('total rainfall (inches)')
        plt.legend()
        if city == 'london':
            plt.title('London ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
        else:
            plt.title('NYC ' + str(i) + ' total rainfall vs. model')
            plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
        # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
        plt.clf()

        # Viewing our coefficient and intercept
        #print(model.coef_)
        #print(model.intercept_)
        #print(f'{city} month = {i}')
        #print('---')

        # Adding the values to the list
        total_rainfall_model_outputs.append({
            'city': city,
            'month_num': i,
            'weather_factor': 'rainfall',
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'mae': mae,
            'mse': mse,
            'rmse': rmse
        })
        
# Creating a DataFrame from our results
total_rainfall_model_df = pd.DataFrame(total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# Formatting the columns
total_rainfall_model_df['coef'] = total_rainfall_model_df['coef'].map('{:.2f}'.format)
total_rainfall_model_df['intercept'] = total_rainfall_model_df['intercept'].map('{:.2f}'.format)
total_rainfall_model_df['mae'] = total_rainfall_model_df['mae'].map('{:.2f}'.format)
total_rainfall_model_df['mse'] = total_rainfall_model_df['mse'].map('{:.2f}'.format)
total_rainfall_model_df['rmse'] = total_rainfall_model_df['rmse'].map('{:.2f}'.format)
total_rainfall_model_df['2022_prediction_inches'] = total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
total_rainfall_model_df['2023_prediction_inches'] = total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_rainfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,london,1,rainfall,0.02,-28.28,0.75,0.76,0.87,2.68,2.7
1,london,2,rainfall,0.01,-18.5,0.71,0.67,0.82,2.03,2.04
2,london,3,rainfall,0.0,0.01,0.79,1.05,1.03,1.58,1.58
3,london,4,rainfall,0.0,-8.14,1.04,1.38,1.18,1.79,1.79
4,london,5,rainfall,0.01,-9.83,1.01,1.44,1.2,2.18,2.19
5,london,6,rainfall,-0.0,3.49,1.19,1.85,1.36,1.99,1.99
6,london,7,rainfall,-0.01,27.76,0.96,1.36,1.16,1.5,1.48
7,london,8,rainfall,0.0,-7.43,0.99,1.76,1.33,2.11,2.11
8,london,9,rainfall,-0.01,23.24,1.18,1.89,1.37,1.61,1.6
9,london,10,rainfall,0.01,-21.3,1.07,2.15,1.47,2.72,2.73


<Figure size 432x288 with 0 Axes>

# Linear regression model NYC snowfall vs. Year

In [10]:
# Looping through the months for NYC, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_snowfall_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for NYC for each month
    city_month_weather_df = nyc_weather_df.loc[(nyc_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.avg_total_snowfall_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total snowfall (inches)')
    plt.legend()
    plt.title('NYC ' + str(i) + ' total snowfall vs. model')
    plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_snowfall_model_outputs.append({
        'city': 'NYC',
        'month_num': i,
        'weather_factor': 'snowfall',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_snowfall_model_df = pd.DataFrame(total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# Formatting the columns
total_snowfall_model_df['coef'] = total_snowfall_model_df['coef'].map('{:.2f}'.format)
total_snowfall_model_df['intercept'] = total_snowfall_model_df['intercept'].map('{:.2f}'.format)
total_snowfall_model_df['mae'] = total_snowfall_model_df['mae'].map('{:.2f}'.format)
total_snowfall_model_df['mse'] = total_snowfall_model_df['mse'].map('{:.2f}'.format)
total_snowfall_model_df['rmse'] = total_snowfall_model_df['rmse'].map('{:.2f}'.format)
total_snowfall_model_df['2022_prediction_inches'] = total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
total_snowfall_model_df['2023_prediction_inches'] = total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_snowfall_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_inches,2023_prediction_inches
0,NYC,1,snowfall,0.07,-133.11,5.36,37.39,6.11,11.25,11.32
1,NYC,2,snowfall,0.1,-182.49,6.05,47.41,6.89,12.9,12.99
2,NYC,3,snowfall,-0.01,30.41,3.57,15.14,3.89,4.81,4.79
3,NYC,4,snowfall,-0.0,0.41,1.27,7.3,2.7,0.19,0.19
4,NYC,5,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NYC,6,snowfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,NYC,7,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NYC,8,snowfall,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,NYC,9,snowfall,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,NYC,10,snowfall,-0.0,0.0,0.18,0.46,0.68,0.0,0.0


<Figure size 432x288 with 0 Axes>

# Linear regression model London sunshine hours vs. Year

In [11]:
# Looping through the months for London, splitting into training and testing data, and evaluating each model
# Learned about evaluating and syntax on evaluating linear regression from here:  https://towardsdatascience.com/linear-regression-in-python-9a1f5f000606

# Creating empty list to hold the outputs of our model
total_sunshine_model_outputs = []

# Iterating through the months
for i in range(1,13):
    # Splitting out the weather for London for each month
    city_month_weather_df = london_weather_df.loc[(london_weather_df['month_num'] == i)]

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df.year.values.reshape(-1,1)

    # Assigning the target variable
    y = city_month_weather_df.total_sunshine_duration_hours

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

    # Plotting and saving the predictions vs. the datapoints
    plt.scatter(X_train, y_train, color='blue', label='train')
    plt.scatter(X_test, y_test, color='green', label='test')
    plt.plot(X_test, y_pred, color='red', label='predict')
    plt.xlabel('year')
    plt.ylabel('total sunshine (hours)')
    plt.legend()
    plt.title('London ' + str(i) + ' total sunshine vs. model')
    plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
    # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
    plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    total_sunshine_model_outputs.append({
        'city': 'London',
        'month_num': i,
        'weather_factor': 'sunshine',
        'coef': model.coef_[0],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
total_sunshine_model_df = pd.DataFrame(total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
total_sunshine_model_df['2022_prediction_hours'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
total_sunshine_model_df['2023_prediction_hours'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# Formatting the columns
total_sunshine_model_df['coef'] = total_sunshine_model_df['coef'].map('{:.2f}'.format)
total_sunshine_model_df['intercept'] = total_sunshine_model_df['intercept'].map('{:.2f}'.format)
total_sunshine_model_df['mae'] = total_sunshine_model_df['mae'].map('{:.2f}'.format)
total_sunshine_model_df['mse'] = total_sunshine_model_df['mse'].map('{:.2f}'.format)
total_sunshine_model_df['rmse'] = total_sunshine_model_df['rmse'].map('{:.2f}'.format)
total_sunshine_model_df['2022_prediction_hours'] = total_sunshine_model_df['2022_prediction_hours'].map('{:.2f}'.format)
total_sunshine_model_df['2023_prediction_hours'] = total_sunshine_model_df['2023_prediction_hours'].map('{:.2f}'.format)

# Displaying the updated DataFrame
total_sunshine_model_df

Unnamed: 0,city,month_num,weather_factor,coef,intercept,mae,mse,rmse,2022_prediction_hours,2023_prediction_hours
0,London,1,sunshine,0.09,-117.76,10.71,214.42,14.64,57.48,57.57
1,London,2,sunshine,0.31,-542.69,18.84,531.27,23.05,78.83,79.14
2,London,3,sunshine,0.38,-638.7,27.61,997.58,31.58,127.66,128.04
3,London,4,sunshine,0.68,-1189.5,32.58,1639.42,40.49,180.51,181.19
4,London,5,sunshine,0.16,-130.76,32.54,1714.57,41.41,199.62,199.78
5,London,6,sunshine,-0.32,832.45,39.32,2018.33,44.93,190.8,190.48
6,London,7,sunshine,0.27,-339.66,30.29,1341.04,36.62,205.54,205.81
7,London,8,sunshine,0.2,-214.87,29.7,1625.2,40.31,198.73,198.93
8,London,9,sunshine,-0.22,579.73,28.27,1186.05,34.44,137.76,137.54
9,London,10,sunshine,0.11,-104.89,23.89,923.04,30.38,109.57,109.68


<Figure size 432x288 with 0 Axes>

In [12]:
# One high_temp model for each city with all months

# Creating empty list to hold the outputs of our model
combined_high_temp_model_outputs = []

# Looping through each city
for city in cities:
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df
        
    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    y = city_month_weather_df.avg_high_temp_f

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('avg high temp (F)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/london_'+ str(i) + '_avg_high_temps.png')
#     else:
#         plt.title('NYC ' + str(i) + ' avg high temps vs. model')
#         plt.savefig('images/high_temp/nyc_'+ str(i) + '_avg_high_temps.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_high_temp_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'high_temp',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })

# Creating a DataFrame from our results
combined_high_temp_model_df = pd.DataFrame(combined_high_temp_model_outputs)

# Adding the predictions to the high temp DataFrame
#combined_high_temp_model_df['2022_prediction_F'] = (2022 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']
#combined_high_temp_model_df['2023_prediction_F'] = (2023 * combined_high_temp_model_df['coef_year']) + (combined_high_temp_model_df['month_num'] * combined_high_temp_model_df['coef_month_num']) + combined_high_temp_model_df['intercept']

# Formatting the columns
combined_high_temp_model_df['coef_year'] = combined_high_temp_model_df['coef_year'].map('{:.2f}'.format)
combined_high_temp_model_df['coef_month_num'] = combined_high_temp_model_df['coef_month_num'].map('{:.2f}'.format)                                                                                                        
combined_high_temp_model_df['intercept'] = combined_high_temp_model_df['intercept'].map('{:.2f}'.format)
combined_high_temp_model_df['mae'] = combined_high_temp_model_df['mae'].map('{:.2f}'.format)
combined_high_temp_model_df['mse'] = combined_high_temp_model_df['mse'].map('{:.2f}'.format)
combined_high_temp_model_df['rmse'] = combined_high_temp_model_df['rmse'].map('{:.2f}'.format)
#combined_high_temp_model_df['2022_prediction_F'] = combined_high_temp_model_df['2022_prediction_F'].map('{:.2f}'.format)
#combined_high_temp_model_df['2023_prediction_F'] = combined_high_temp_model_df['2023_prediction_F'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_high_temp_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,high_temp,0.05,0.69,-46.42,8.41,99.18,9.96
1,nyc,high_temp,0.0,1.16,46.07,13.6,236.51,15.38


In [13]:
# One rainfall model for each city for all months
# Looks like this model performs slightly better or about the same overall vs. the model above where rainfall data is broken out by months

# Creating empty list to hold the outputs of our model
combined_total_rainfall_model_outputs = []

# Looping through each city
for city in cities:
    # Splitting out the weather for that city
    if city == 'london':
        city_month_weather_df = london_weather_df
    else:
        city_month_weather_df = nyc_weather_df

    # Preparing the data for Scikit-learn library
    X = city_month_weather_df[['year', 'month_num']]

    # Assigning the target variable
    if city == 'london':
        y = city_month_weather_df.total_rainfall_inches
    else:
        y = city_month_weather_df.avg_total_precipitation_inches

    # Creating the model from the class
    model = LinearRegression()

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Training the model
    model.fit(X_train, y_train)

    # Generating the predictions
    y_pred = model.predict(X_test)

    # Evaluating the performance
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

#     # Plotting and saving the predictions vs. the datapoints
#     plt.scatter(X_train, y_train, color='blue', label='train')
#     plt.scatter(X_test, y_test, color='green', label='test')
#     plt.plot(X_test, y_pred, color='red', label='predict')
#     plt.xlabel('year')
#     plt.ylabel('total rainfall (inches)')
#     plt.legend()
#     if city == 'london':
#         plt.title('London ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/london_'+ str(i) + '_total_rainfall.png')
#     else:
#         plt.title('NYC ' + str(i) + ' total rainfall vs. model')
#         plt.savefig('images/rainfall/nyc_'+ str(i) + '_total_rainfall.png')
#     # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
#     plt.clf()

    # Viewing our coefficient and intercept
    #print(model.coef_)
    #print(model.intercept_)
    #print(f'{city} month = {i}')
    #print('---')

    # Adding the values to the list
    combined_total_rainfall_model_outputs.append({
        'city': city,
        #'month_num': i,
        'weather_factor': 'rainfall',
        'coef_year': model.coef_[0],
        'coef_month_num': model.coef_[1],
        'intercept': model.intercept_,
        'mae': mae,
        'mse': mse,
        'rmse': rmse
    })
        
# Creating a DataFrame from our results
combined_total_rainfall_model_df = pd.DataFrame(combined_total_rainfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_rainfall_model_df['2022_prediction_inches'] = (2022 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']
#total_rainfall_model_df['2023_prediction_inches'] = (2023 * total_rainfall_model_df['coef']) + total_rainfall_model_df['intercept']

# Formatting the columns
combined_total_rainfall_model_df['coef_year'] = combined_total_rainfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_rainfall_model_df['coef_month_num'] = combined_total_rainfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_rainfall_model_df['intercept'] = combined_total_rainfall_model_df['intercept'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['mae'] = combined_total_rainfall_model_df['mae'].map('{:.2f}'.format)
combined_total_rainfall_model_df['mse'] = combined_total_rainfall_model_df['mse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['rmse'] = combined_total_rainfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2022_prediction_inches'] = combined_total_rainfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_rainfall_model_df['2023_prediction_inches'] = combined_total_rainfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_rainfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,london,rainfall,0.0,0.05,-2.71,0.932439,1.43,1.195357
1,nyc,rainfall,0.01,0.03,-25.8,1.592751,4.72,2.172619


In [14]:
# One model for all months snowfall NYC
# Model above with one for each month performs better

# Creating empty list to hold the outputs of our model
combined_total_snowfall_model_outputs = []

# Splitting out the weather for NYC for each month
city_month_weather_df = nyc_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.avg_total_snowfall_inches

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total snowfall (inches)')
# plt.legend()
# plt.title('NYC ' + str(i) + ' total snowfall vs. model')
# plt.savefig('images/snowfall/nyc_'+ str(i) + '_total_snowfall.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_snowfall_model_outputs.append({
    'city': 'NYC',
    #'month_num': i,
    'weather_factor': 'snowfall',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_snowfall_model_df = pd.DataFrame(combined_total_snowfall_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_snowfall_model_df['2022_prediction_inches'] = (2022 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']
#total_snowfall_model_df['2023_prediction_inches'] = (2023 * total_snowfall_model_df['coef']) + total_snowfall_model_df['intercept']

# Formatting the columns
combined_total_snowfall_model_df['coef_year'] = combined_total_snowfall_model_df['coef_year'].map('{:.2f}'.format)
combined_total_snowfall_model_df['coef_month_num'] = combined_total_snowfall_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_snowfall_model_df['intercept'] = combined_total_snowfall_model_df['intercept'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mae'] = combined_total_snowfall_model_df['mae'].map('{:.2f}'.format)
combined_total_snowfall_model_df['mse'] = combined_total_snowfall_model_df['mse'].map('{:.2f}'.format)
combined_total_snowfall_model_df['rmse'] = combined_total_snowfall_model_df['rmse'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2022_prediction_inches'] = combined_total_snowfall_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_snowfall_model_df['2023_prediction_inches'] = combined_total_snowfall_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_snowfall_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,NYC,snowfall,0.01,-0.53,-17.42,3.06,26.02,5.1


In [15]:
# One model for London sunshine data for all months
# Model above broken out by months performs better than combined model

# Creating empty list to hold the outputs of our model
combined_total_sunshine_model_outputs = []

city_month_weather_df = london_weather_df

# Preparing the data for Scikit-learn library
X = city_month_weather_df[['year', 'month_num']]

# Assigning the target variable
y = city_month_weather_df.total_sunshine_duration_hours

# Creating the model from the class
model = LinearRegression()

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Training the model
model.fit(X_train, y_train)

# Generating the predictions
y_pred = model.predict(X_test)

# Evaluating the performance
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# # Plotting and saving the predictions vs. the datapoints
# plt.scatter(X_train, y_train, color='blue', label='train')
# plt.scatter(X_test, y_test, color='green', label='test')
# plt.plot(X_test, y_pred, color='red', label='predict')
# plt.xlabel('year')
# plt.ylabel('total sunshine (hours)')
# plt.legend()
# plt.title('London ' + str(i) + ' total sunshine vs. model')
# plt.savefig('images/sunshine/london_'+ str(i) + '_total_sunshine.png')
# # Clearing figure after saving with syntax found here:  https://www.tutorialspoint.com/how-to-clear-the-memory-completely-of-all-matplotlib-plots
# plt.clf()

# Viewing our coefficient and intercept
#print(model.coef_)
#print(model.intercept_)
#print(f'{city} month = {i}')
#print('---')

# Adding the values to the list
combined_total_sunshine_model_outputs.append({
    'city': 'London',
    #'month_num': i,
    'weather_factor': 'sunshine',
    'coef_year': model.coef_[0],
    'coef_month_num': model.coef_[1],
    'intercept': model.intercept_,
    'mae': mae,
    'mse': mse,
    'rmse': rmse
})

# Creating a DataFrame from our results
combined_total_sunshine_model_df = pd.DataFrame(combined_total_sunshine_model_outputs)

# Adding the predictions to the high temp DataFrame
#total_sunshine_model_df['2022_prediction_inches'] = (2022 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']
#total_sunshine_model_df['2023_prediction_inches'] = (2023 * total_sunshine_model_df['coef']) + total_sunshine_model_df['intercept']

# Formatting the columns
combined_total_sunshine_model_df['coef_year'] = combined_total_sunshine_model_df['coef_year'].map('{:.2f}'.format)
combined_total_sunshine_model_df['coef_month_num'] = combined_total_sunshine_model_df['coef_month_num'].map('{:.2f}'.format)
combined_total_sunshine_model_df['intercept'] = combined_total_sunshine_model_df['intercept'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mae'] = combined_total_sunshine_model_df['mae'].map('{:.2f}'.format)
combined_total_sunshine_model_df['mse'] = combined_total_sunshine_model_df['mse'].map('{:.2f}'.format)
combined_total_sunshine_model_df['rmse'] = combined_total_sunshine_model_df['rmse'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2022_prediction_inches'] = combined_total_sunshine_model_df['2022_prediction_inches'].map('{:.2f}'.format)
#combined_total_sunshine_model_df['2023_prediction_inches'] = combined_total_sunshine_model_df['2023_prediction_inches'].map('{:.2f}'.format)

# Displaying the updated DataFrame
combined_total_sunshine_model_df

Unnamed: 0,city,weather_factor,coef_year,coef_month_num,intercept,mae,mse,rmse
0,London,sunshine,0.21,-1.03,-286.96,52.27,3788.95,61.55
