# Random tests with pythons library

In [None]:
import pandas as pd
import numpy as np

In [38]:
data = pd.read_csv(
    "https://www.data.gouv.fr/fr/datasets/r/3004168d-bec4-44d9-a781-ef16f41856a2",
    sep="|",
    decimal=","
    # dtype={'Valeur fonciere': np.float64}
)


Columns (18,23,24,26,28,41) have mixed types.Specify dtype option on import or set low_memory=False.



In [39]:
print(data["Valeur fonciere"][:10])
print(data["Surface reelle bati"][:10])

0     37220.0
1    185100.0
2    185100.0
3    209000.0
4    134900.0
5    192000.0
6     45000.0
7     45000.0
8     65000.0
9     65000.0
Name: Valeur fonciere, dtype: float64
0     20.0
1     62.0
2      0.0
3     90.0
4    101.0
5     88.0
6     39.0
7      NaN
8      0.0
9     69.0
Name: Surface reelle bati, dtype: float64


In [73]:
data

#print(data[:1])
print(data
      .filter(items=['Valeur fonciere', 'Surface reelle bati', 'Type local'])
      .groupby(by="Type local")
      .mean()
)

                                          Valeur fonciere  Surface reelle bati
Type local                                                                    
Appartement                                  2.503463e+06            56.097625
Dépendance                                   1.429113e+06             0.000000
Local industriel. commercial ou assimilé     2.398568e+06           445.772179
Maison                                       3.688464e+05           102.968822


In [81]:
refinedData = data[(data["Type local"] == "Appartement") | (data["Type local"] == "Maison")]

In [82]:
print(refinedData[:10]
      .filter(items=['Valeur fonciere', 'Surface reelle bati', 'Type local'])
      .groupby(by="Type local")
      .mean()
)

             Valeur fonciere  Surface reelle bati
Type local                                       
Appartement     81080.000000            47.500000
Maison         160483.333333            88.833333


In [89]:
import plotly.express as px

fig = px.scatter(
    x=refinedData["Surface reelle bati"][:300],
    y=refinedData["Valeur fonciere"][:300]
)

fig.show()

In [126]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=refinedData["Surface reelle bati"][:300],
    y=refinedData["Valeur fonciere"][:300],
    mode='markers'
))

def f(x):
    return 1600 * x + 10000

x = np.linspace(0, 500, 300)
y = np.vectorize(f)(x)

print(x[:3])
print(y[:3])

fig.add_trace(go.Scatter(
    x=x,
    y=y,
    mode='lines'
))

fig.show()

[0.         1.6722408  3.34448161]
[10000.         12675.58528428 15351.17056856]


In [None]:
trainingSet = refinedData.filter(items=[
    'Valeur fonciere',
    'Surface reelle bati'
])

# Defining some helpers function to plot things

In [28]:
import plotly.graph_objects as go
import numpy as np

class Graph:
    figure = None

    def __init__(self):
        self.figure = go.Figure()

    def plot2DArray(self, array, name, start = 0, end = 100, steps = 100):
         self.figure.add_trace(
            go.Scatter(
                x = array[0],
                y = array[1],
                mode = 'markers',
                name = name
            )
         )

    def plotFunction(self, function, name, start = 0, end = 100, steps = 100):
         x = np.linspace(start, end, steps)
         y = np.vectorize(function)(x) # This is slow, check out https://stackoverflow.com/questions/35215161/most-efficient-way-to-map-function-over-numpy-array

         self.figure.add_trace(
            go.Scatter(x = x, y = y, mode = 'lines', name = name)
         )

    def render(self):
         self.figure.show()

# Step 1: Simple linear regression (univariate linear regression)


## Hypothesis

The hypothesis is the function that for each $x$ in the training set will give us the best estimation of each $y$.

Simply put, the hypothesis is the function that "solve" the problem at hands.

In the case of a univariate (one variable) linear regression the hypothesis function is a simple linear function:

$h_\theta(x) = \theta_0 + \theta_1x$

Our goal then is to find $\theta_0$ and $\theta_1$ so that $h_\theta(x)$ is close to $y$ for each of our training examples $(x, y)$

In [25]:
class Hypothesis:
    theta0 = None
    theta1 = None

    def __init__(self, theta0, theta1):
        self.theta0 = theta0
        self.theta1 = theta1

    def compute(self, x):
        return self.theta0 + (self.theta1 * x)

hypo1 = Hypothesis(10, 2)
hypo2 = Hypothesis(100, 0)

graph = Graph()
graph.plotFunction(hypo1.compute, "hypo1")
graph.plotFunction(hypo2.compute, "hypo2")
graph.render()

## Cost function

In order to find the best values for $\theta_0$ and $\theta_1$ we define the cost function as follow (m being the number of training example in our dataset):

$ J_{(\theta_0, \theta_1)} = \frac{1}{2m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^i)^2 $

And the problem become minimizing this function over $\theta_0$ and $\theta_1$.

Let's unpack the formula:

- $ h_\theta(x^{(i)}) $

Is our hypothesis function as defined last chapter applied to the $i^{th}$ example of our training data.

- $ (h_\theta(x^{(i)}) - y^i)^2 $

Is the "squared error function".

It represents the "error", how much apart from the right result $y^i$ the result of our hypothesis $h_\theta(x^{(i)})$ is.

And we take the square of that error supposedly because the square function accentuate the difference between large and small error (an "error" of 5 becomes 25 while an error of 2 become 4 making the error of 5 appear even worse)

The "squared error function" is one example of an error function, there are others. But the squared error function is one that is generally quite good.

- $ \frac{1}{2m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^i)^2 $

This $\frac{1}{2m} \sum_{i=1}^m$ part is the notation to take the average. (Sum of each elements divided by the number of elements)

So the formulas is taking the average of the error of our hypothesis over our entire dataset.

In [44]:
class CostFunction:
    hypothesis = None

    def __init__(self, hypothesis):
        self.hypothesis = hypothesis

    def squaredError(self, dataset = [[]]):
        m = len(dataset[0]) # we assume that both dimension are equal
        squaredErrorSum = 0

        for i in range(1, m):
            hx = self.hypothesis.compute(dataset[0][i])
            y = dataset[1][i]
            squaredErrorSum += (hx - y) ** 2

        return (1 / (2 * m)) * squaredErrorSum

dataset = [
    [ 0,  1,  2,  3],
    [10, 14, 18, 22]
]

# This is the right hypothesis so the squared error should be 0
hypo1 = Hypothesis(10, 4)
cf1 = CostFunction(hypo1)

# This is a wrong hypothesis so the squared error should be !== 0
hypo2 = Hypothesis(20, 1)
cf2 = CostFunction(hypo2)

# Graph that plot the 2 hypothesis and the dataset to show that hypo1 is indeed right
graph = Graph()
graph.plot2DArray(dataset, "dataset")
graph.plotFunction(hypo1.compute, "hypo1", 0, 4, 4)
graph.plotFunction(hypo2.compute, "hypo2", 0, 4, 4)
graph.render()

print(cf1.squaredError(dataset), " == 0")
print(cf2.squaredError(dataset), " != 0")


0.0  == 0
8.25  != 0


# Questions:

1) In $ \frac{1}{2m} \sum_{i=1}^m (h_\theta(x^{(i)}) - y^i)^2 $ why $ \frac{1}{2m} $ and not simply $ \frac{1}{m} $ like a regular average ?

A youtube comments says: we are dividing by m to get the mean (average) and 2 is for simplification of the derivative (a^2 =2a)

What does that mean ? Is $ \frac{1}{2m} $ simpler to derive than $ \frac{1}{m} $ ?
