# Exercise 2.7 of Microsoft Learn ML course
Modifications compared to the original:
* scaled data
* two different cost functions - MSD (Mean Square Difference) and MAD (Mean Absolute Difference)

In [1]:
from datetime import datetime
import pandas
import graphing # Custom graphing code. See our GitHub repository

# Load a file that contains weather data for Seattle
data = pandas.read_csv('seattleWeather_1948-2017.csv', parse_dates=['date'])

# Keep only January through June (everything from July 1 onward is dropped),
# because we have to plant onions before summer begins.
# The .dt accessor filters the whole column at once instead of looping
# over every row in Python.
data = data[data["date"].dt.month < 7].copy()


# Convert the dates into numbers so we can use them in our models
# We make a year column that can contain fractions. For example,
# 1948.5 is halfway through the year 1948
data["year"] = data["date"].dt.year + data["date"].dt.dayofyear / 365.25

# Let's take a quick look at our data
print("Visual Check:")
graphing.scatter_2D(data, 
                    label_x="year", 
                    label_y="min_temperature",
                    title="Temperatures over time (°F)")

Visual Check:


### Scaling data

In [2]:
def _center_and_scale(column):
    '''
    Shifts a column by the midpoint of its range and divides by the width
    of that range, so the smallest value maps to -0.5 and the largest to 0.5
    '''
    return (column - (column.max() + column.min())/2)/(column.max() - column.min())

# Apply the same scaling to the input (year) and the target (min temperature)
data["year_scaled"] = _center_and_scale(data["year"])
data["min_temp_scaled"] = _center_and_scale(data["min_temperature"])
graphing.scatter_2D(data, label_x= "year_scaled", label_y= "min_temp_scaled")

### Check for MSD (aka OLS)

In [3]:
import statsmodels.formula.api as smf

# Perform linear regression to fit a line to our data
# NB OLS uses the sum or mean of squared differences as a cost function,
# which we're familiar with from our last exercise
model = smf.ols(formula = "min_temp_scaled ~ year_scaled", data = data).fit()

# Print the model
# model.params is a pandas Series indexed by term name, so look the
# coefficients up by label. Integer keys such as params[0] only work through
# a positional fallback that pandas has deprecated.
intercept = model.params["Intercept"]
slope = model.params["year_scaled"]

print(f"The model is: y = {slope:0.5f} * X + {intercept:0.5f}")

The model is: y = 0.06528 * X + 0.12273


### Define model

In [4]:
class MyModel:
    '''
    A straight-line model of the form y = slope * x + intercept
    '''

    def __init__(self):
        '''
        Builds a model whose slope and intercept both start at zero
        '''
        # slope: how steeply the line rises or falls
        # intercept: shifts the whole line up or down
        self.slope, self.intercept = 0, 0

    def predict(self, date):
        '''
        Returns the temperature the line gives for the supplied date
        '''
        scaled_date = date * self.slope
        return scaled_date + self.intercept

    def get_summary(self):
        '''
        Describes the current line as a human-readable equation
        '''
        return "y = {} * x + {}".format(self.slope, self.intercept)

print("Model class ready")

Model class ready


### Implement gradient for MSD and MAD

In [5]:
import numpy as np

# Module-level training data. The gradient and cost functions defined below
# read these two names directly instead of taking them as arguments.
# x: the scaled year column (model input)
x = data["year_scaled"]
# temperature_true: the scaled minimum temperature column (target values)
temperature_true = data["min_temp_scaled"]

def calculate_gradient_MAD(temperature_estimate):
    """
    Gradient of the Mean Absolute Difference cost for a linear regression.

    temperature_estimate: the model's predictions; they are compared with
    the module-level temperature_true, and x supplies the inputs

    Returns a tuple (gradient for the intercept, gradient for the slope)
    """

    # Only the sign of each residual matters for MAD:
    # +1 where we over-estimate, -1 where we under-estimate, 0 where exact
    residual_sign = np.sign(temperature_estimate - temperature_true)

    # Intercept: average sign. Slope: average of each sign weighted by its x
    return np.mean(residual_sign), (x * residual_sign).mean()

def calculate_gradient_SSD(temperature_estimate):
    """
    Gradient of the mean squared difference cost for a linear regression.

    The function name says SSD, but the values are averaged (np.mean / .mean),
    so this is the gradient of the mean, not the sum, of squared differences.

    temperature_estimate: the model's predictions; they are compared with
    the module-level temperature_true, and x supplies the inputs

    Returns a tuple (gradient for the intercept, gradient for the slope)
    """

    # Differentiating the squared residual gives twice the residual,
    # additionally multiplied by x for the slope
    residual = temperature_estimate - temperature_true
    d_intercept = 2 * np.mean(residual)
    d_slope = 2 * (x * residual).mean()

    return d_intercept, d_slope

# Cost function number one: the mean of absolute differences
def cost_function_MAD(temperature_estimate):
    """
    Scores a temperature estimate against the module-level temperature_true.
    The score is the average size of the differences, ignoring their sign
    """
    # np.abs removes the sign of every difference before averaging
    absolute_difference = np.abs(temperature_true - temperature_estimate)
    return np.mean(absolute_difference)

# Cost function number two: the mean of squared differences
def cost_function_MSD(temperature_estimate):
    """
    Scores a temperature estimate against the module-level temperature_true.
    The score is the average of the squared differences
    """
    # Squaring makes every difference non-negative and penalises large misses more
    difference = temperature_true - temperature_estimate
    return np.mean(difference ** 2)
print("Function is ready!")

Function is ready!


## Graph of cost function

In [19]:
def predict_and_calc_cost_MAD(intercept, slope):
    '''
    Evaluates the MAD cost of the straight line given by intercept and slope,
    using the module-level x as the model input
    '''
    # Build the line's predictions and hand them straight to the cost function
    return cost_function_MAD(x * slope + intercept)

def predict_and_calc_cost_MSD(intercept, slope):
    '''
    Evaluates the MSD cost of the straight line given by intercept and slope,
    using the module-level x as the model input
    '''
    # Build the line's predictions and hand them straight to the cost function
    return cost_function_MSD(x * slope + intercept)


In [23]:
# Choose a grid of candidate parameters: 10 evenly spaced values
# from -1 to 1 for the intercept, and the same for the slope
intercepts = np.linspace(-1,1,10)
slopes = np.linspace(-1,1,10)
# Call the graphing method to draw the MSD cost over that grid.
# The cost function (defined above) is passed in as calc_z.
# NOTE(review): presumably graphing.surface calls calc_z(intercept, slope)
# for every grid point - confirm in this project's GitHub repository

graphing.surface(x_values=intercepts, 
                y_values=slopes, 
                calc_z=predict_and_calc_cost_MSD, 
                title="Cost for Different Model Parameters",
                axis_title_x="Model intercept",
                axis_title_y="Model slope",
                axis_title_z="Cost")

## Perform gradient descent for MAD and MSD

### Define functions

In [6]:
def _run_gradient_descent(learning_rate, number_of_iterations,
                          calculate_gradient, cost_function, log_every):
    """
    Shared training loop behind gradient_descent_MAD and gradient_descent_MSD.

    Fits the two parameters (slope and intercept) of a MyModel to the
    module-level x by repeatedly stepping against the gradient.

    learning_rate: Larger numbers follow the gradient more aggressively
    number_of_iterations: The number of update steps to perform
    calculate_gradient: Maps predictions to (grad_intercept, grad_slope)
    cost_function: Maps predictions to the cost that is printed in the log
    log_every: Print progress every this many iterations; 0 or None
               silences the progress lines

    Returns the trained MyModel
    """

    # Our starting guess is random. random_sample() lies in [0, 1),
    # so each parameter starts somewhere in [-1, 0).
    # The intercept is drawn first, then the slope.
    model = MyModel()
    model.intercept = np.random.random_sample() - 1
    model.slope = np.random.random_sample() - 1

    for i in range(number_of_iterations):
        # Calculate the predicted values
        predicted_temperature = model.predict(x)

        # == OPTIMIZER ===
        # Calculate the gradient
        grad_intercept, grad_slope = calculate_gradient(predicted_temperature)
        # Update the estimation of the line
        model.slope -= learning_rate * grad_slope
        model.intercept -= learning_rate * grad_intercept

        # Report the cost of the line as it stands after this update,
        # which is why the prediction is recomputed here
        if log_every and i % log_every == 0:
            cost = cost_function(model.predict(x))
            print(f"Estimate number {i}:", model.get_summary(), f"Cost: {cost}")

    # Print the final model
    print("Final estimate:", model.get_summary())
    return model


def gradient_descent_MAD(learning_rate, number_of_iterations, log_every=1000):
    """
    Performs gradient descent on the slope and intercept of a straight line,
    using the Mean Absolute Difference cost.

    learning_rate: Larger numbers follow the gradient more aggressively
    number_of_iterations: The number of update steps to perform
    log_every: Print the current estimate and cost every this many
               iterations (default 1000); 0 or None silences them

    Returns the trained MyModel. The starting guess is random, so results
    differ between calls.
    """
    return _run_gradient_descent(learning_rate, number_of_iterations,
                                 calculate_gradient_MAD, cost_function_MAD,
                                 log_every)


def gradient_descent_MSD(learning_rate, number_of_iterations, log_every=1000):
    """
    Performs gradient descent on the slope and intercept of a straight line,
    using the Mean Square Difference cost.

    learning_rate: Larger numbers follow the gradient more aggressively
    number_of_iterations: The number of update steps to perform
    log_every: Print the current estimate and cost every this many
               iterations (default 1000); 0 or None silences them

    Returns the trained MyModel. The starting guess is random, so results
    differ between calls.
    """
    return _run_gradient_descent(learning_rate, number_of_iterations,
                                 calculate_gradient_SSD, cost_function_MSD,
                                 log_every)
    

### Run gradient descent for MAD

In [7]:
# Train with the MAD cost: learning rate 0.001 for 100,000 iterations.
# The starting guess is random, so the printed estimates differ between runs.
model_trained_MAD = gradient_descent_MAD(learning_rate=1E-3, number_of_iterations=10**5)

Estimate number 0: y = -0.12204358865855476 * x + -0.9478688679161219 Cost: 1.0706002637728889
Estimate number 1000: y = -0.11335520742268505 * x + 0.014997304845545395 Cost: 0.13682557824801247
Estimate number 2000: y = -0.04070049709398922 * x + 0.12419560244958051 Cost: 0.09541330959741758
Estimate number 3000: y = 0.003540739638334446 * x + 0.1261383830296562 Cost: 0.09334601991563368
Estimate number 4000: y = 0.02770585817499473 * x + 0.12239358479510218 Cost: 0.092739352124972
Estimate number 5000: y = 0.043002401457275366 * x + 0.12327914595525363 Cost: 0.09249343875968345
Estimate number 6000: y = 0.05119970916529484 * x + 0.12220411442940313 Cost: 0.09242343883296206
Estimate number 7000: y = 0.05577383463925647 * x + 0.12248091140292097 Cost: 0.0924011464808045
Estimate number 8000: y = 0.05833076321069955 * x + 0.12255767685058809 Cost: 0.09239428178627093
Estimate number 9000: y = 0.059937727623163115 * x + 0.12284913334491518 Cost: 0.09239155222918279
Estimate number 10000

### Run gradient descent for MSD

In [8]:
# Train with the MSD cost, using the same learning rate and iteration count
# as the MAD run. The starting guess is random, so the printed estimates
# differ between runs.
model_trained_MSD = gradient_descent_MSD(learning_rate=1E-3, number_of_iterations=10**5)

Estimate number 0: y = -0.13329986361940893 * x + -0.25838280235822614 Cost: 0.16201860737150195
Estimate number 1000: y = -0.10240514882561567 * x + 0.07125643041135145 Cost: 0.018463574741844853
Estimate number 2000: y = -0.07631584376391205 * x + 0.11577911856497632 Cost: 0.015180008450800302
Estimate number 3000: y = -0.05428543581196834 * x + 0.1217926577056496 Cost: 0.014646098416521772
Estimate number 4000: y = -0.035682579056639185 * x + 0.12260496091815387 Cost: 0.01429837126386098
Estimate number 5000: y = -0.01997402262355663 * x + 0.12271474808274327 Cost: 0.014051029830538969
Estimate number 6000: y = -0.006709462995590481 * x + 0.12272963876775492 Cost: 0.013874676793208435
Estimate number 7000: y = 0.0044913458134717525 * x + 0.1227317026069007 Cost: 0.013748930455982456
Estimate number 8000: y = 0.013949489991846864 * x + 0.12273202580494269 Cost: 0.013659268457011766
Estimate number 9000: y = 0.021936099955536293 * x + 0.12273210698895476 Cost: 0.013595335983463894
Est

## Plot comparison

In [9]:
# Plot the scaled data and pass both trained models' predict methods as trendlines.
# NOTE(review): assumes graphing.scatter_2D accepts a tuple of callables for
# trendline - confirm in the graphing helper
graphing.scatter_2D(data, label_x="year_scaled", label_y="min_temp_scaled", trendline= (model_trained_MAD.predict, model_trained_MSD.predict))

# TO DO
- Reverse scaling - using function
- Make plot of cost function for rescaled data 


In [26]:
# Summary statistics for every column, including the scaled ones,
# to sanity-check that the scaled columns span -0.5 to 0.5
data.describe()

Unnamed: 0,date,amount_of_precipitation,max_temperature,min_temperature,year,year_scaled,min_temp_scaled
count,12688,12686.0,12688.0,12688.0,12688.0,12688.0,12688.0
mean,1982-09-30 08:52:03.329129856,0.108184,56.352853,41.723045,1982.748789,-5.121099e-06,0.122732
min,1948-01-01 00:00:00,0.0,16.0,0.0,1948.002738,-0.5,-0.5
25%,1965-03-31 18:00:00,0.0,49.0,37.0,1965.24846,-0.2518345,0.052239
50%,1982-09-30 12:00:00,0.0,55.0,42.0,1982.749144,1.635844e-15,0.126866
75%,2000-03-31 06:00:00,0.12,63.0,47.0,2000.249829,0.2518345,0.201493
max,2017-06-30 00:00:00,3.06,96.0,67.0,2017.495551,0.5,0.5
std,,0.226398,11.039084,7.870117,20.207288,0.2907824,0.117464


In [16]:
# Convert the line fitted on scaled data back to the original units
# (year on the x axis, min_temperature on the y axis)
year_min, year_max = data["year"].min(), data["year"].max()
temp_min, temp_max = data["min_temperature"].min(), data["min_temperature"].max()
year_range = year_max - year_min
temp_range = temp_max - temp_min

# Slope: rescale by the ratio of the two ranges
true_slope_MSD = temp_range / year_range * model_trained_MSD.slope

# Intercept: undo the centring of the year axis, then the scaling and
# centring of the temperature axis
scaled_year_offset = model_trained_MSD.slope * (year_max + year_min) / 2 / year_range
true_intercept_MSD = temp_range * (model_trained_MSD.intercept - scaled_year_offset) + (temp_max + temp_min) / 2
print(f"{true_slope_MSD}*x + {true_intercept_MSD}")

0.06294085786633466*x + -83.07286428704538


# Conclusion

* Because the average temperature rises very slowly, this dataset poses some problems
    * The intercept plays a much more important role than the slope (even for scaled data)
* With random starting parameters, many (~10^5) iterations are required
* MSD and MAD give very similar results
* Calculating MSD is faster than calculating MAD