In [71]:
from os.path import exists
import requests
from typing import *

import math 
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats   # statistics package
from tqdm.notebook import trange, tqdm

# Homework 2: Probability and Linear Regression

1. Answer each question.
2. Feel free to leave in testing code and other visualization code.

### Normal Distribution

The famous Normal distribution or Gaussian distribution.

$$
X \sim \mathcal{N}(\mu, \sigma^2)
$$

means thats $X$ is a random variable with normal distribution with mean $\mu$ and scale $\sigma$.

### Bayes Rule: Continuous Version

Suppose $X$ and $Y$ are continuous random variables with joint density $p_{X, Y}(x, y)$.
1. Then
$$
p_{X, Y}(x, y) = p_{X|Y}(x | y) p(y)
$$
when $p_{X|Y}(x | y)$ exists.
2. Then
$$
p_{X, Y}(x, y) = p_{Y|X}(y | x) p(x)
$$
when $p_{Y|X}(y | x)$ exists.

### Joint Density

1. Like before we would like to talk about multiple continuous random variables.

2. The **joint probability density function** (when it exists) of two random variables $X$ and $Y$ is written
$$
p_{X, Y}(x, y)
$$
and has the property that
$$
\int_{\infty}^\infty \int_{\infty}^\infty p_{X, Y}(x, y) dx dy = 1 \,.
$$

3. The marginal probability density functions can be obtained by integration
    - $p_X(x) = \int_{-\infty}^{\infty} p_{X, Y}(x, y) dy$
    - $p_Y(y) = \int_{-\infty}^{\infty} p_{X, Y}(x, y) dx$

## Problem 1 (40 pts): A Strange Regression Model

Suppose we are fitting a regression model to a dataset $(x_i, y_i)_{1 \leq i \leq N}$
\begin{align*}
p(y^i|x^i; \theta) & = \mathcal{N}(f_1(x_1^i) \theta_1 + f_2(x_2^i) \theta_2 + \theta_3, 1) \\
p(y |x; \theta) & = \prod_{i=1}^N p(y^i|x^i; \theta)
\end{align*}
where
1. the inputs $x^i \in \mathbb{R}^2$ are 2-D vectors
2. $f_1, f_2, f_3: \mathbb{R} \rightarrow \mathbb{R}$ are arbitrary functions.
3. $\theta = (\theta_1, \theta_2, \theta_3)$ is a vector of weights

### Problem 1a (10 pts)

Implement the conditional density given below.

In [106]:
def construct_distribution(theta, x,scale):
    x = np.concatenate([x, np.ones((3, 1))], axis=1)
    normals = [
        sp.stats.norm(loc=np.dot(theta, x[0]), scale=scale),
        sp.stats.norm(loc=np.dot(theta, x[1]), scale=scale),
        sp.stats.norm(loc=np.dot(theta,x[2]), scale=scale)
    ]
    return normals


In [107]:
def model1_density(f1: Callable[[float], float], f2: Callable[[float], float], theta: np.ndarray, x: np.ndarray) -> Callable[[float], float]:
    # f1 is function f_1
    # f2 is function f_2
    # theta is some choice of weights
    # x is an input
    # return a function from y to density
    def y_toDensity(y:np.ndarray):
        scale = 1
        f1v = np.vectorize(f1)
        f2v = np.vectorize(f2)
        x[0] = f1v(x[0])
        x[1] = f2v(x[1])
        normals = construct_distribution(theta, x,scale)
        res = 1
        yIdx = 0
        for norm in normals:
            res *= norm.pdf(y[yIdx])
            yIdx += 1
        return res
    return y_toDensity

### Problem 1b (15 pts)

Write a function that solves for the weights by finding the approximate minimum of the conditional density, i.e., solve
$$
\operatorname{argmin}_{\theta} -p(y | x; \theta)
$$
1. The input $X^{N \times 2}$ is a $N \times 2$ matrix where row $i$ of the matrix contains the input $x^i$.
2. The input $Y^N$ is a length $N$ vector where each entry contains the corresponding value $y^i$.

In [108]:
def solve_for_weights1(f1: Callable[[float], float], f2: Callable[[float], float], X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    thetas1 = np.linspace(-50., 50.,num=200)
    thetas2 = np.linspace(-50., 50., num=200)
    thetas3 = np.linspace(-50., 50., num=200)
    best_theta = np.array([2., 4., -28.])
    best_val = model1_density(f1, f2, best_theta, x)(Y)-1
    for theta3 in thetas3:
        curr_theta = np.array([best_theta[0], best_theta[1], theta3])
        curr_val = model1_density(f1,f2, curr_theta, X)(Y)-1
        if curr_val < best_val:
            best_val = curr_val
            best_theta[2] = theta3
    for theta1 in thetas1:
        for theta2 in thetas2:
            curr_theta = np.array([theta1, theta2, best_theta[2]])
            curr_val = model1_density(f1,f2, curr_theta, X)(Y)*-1
            if curr_val < best_val:
                best_val = curr_val
                best_theta[0] = theta1
                best_theta[1] = theta2


    # We are doing an argmin
    return best_theta


def f1(x):
    return x
def f2(x):
    return x
theta = np.array([1.,2.,3.])
x=np.array([[1.,2.],[3.,4.],[5.,6.]])
x_q = x.copy()
x_q = np.concatenate([x_q, np.ones((3, 1))], axis=1)
x_q[:,0] = f1(x_q[:,0])
x_q[:,1] = f2(x_q[:,1])
y = np.dot(x_q, theta)
solve_for_weights1(f1,f2,x,y)

array([  2.,   4., -28.])

### Problem 1c (15 pts)

Transform the probabilistic model into a system that can be solved with linear algebra.
```ts
def linear_algebra_solve(X, Y):
    X_p = np.concatenate([X, np.ones((Y.size[0], 1))], axis=1)
    return np.linalg.solve(X_p.transpose() @ X_p, X_p.transpose() @ Y)
```
That is, come up with new matrix $X'$ and vector $Y'$ such that it can be input into `linear_algebra_solve`ts to produce the same set of weights.


In [111]:
def transform(f1: Callable[[float], float], f2: Callable[[float], float], X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    X_p = X.copy()
    X_p[:,0] = f1(X_p[:,0])
    X_p[:,1] = f2(X_p[:,1])
    return X_p, Y # TODO: change me

transform(f1, f2, x,y)

(array([[1., 2.],
        [3., 4.],
        [5., 6.]]),
 array([ 8., 14., 20.]))

In [121]:
def linear_algebra_solve(X, Y):
    print(X)
    X_p = np.concatenate([X, np.ones((Y.shape[0], 1))], axis=1)
    return np.linalg.solve(X_p.transpose() @ X_p, X_p.transpose() @ Y)

In [122]:
X , Y = transform(f1, f2, x,y)
linear_algebra_solve(X, Y)

[[1. 2.]
 [3. 4.]
 [5. 6.]]


array([-30.,  33., -28.])

## Problem 2 (40 pts): Mixture Model

Suppose that we are performing regression in a setting with the following density
$$
p(y|x; \theta, \beta) = \frac{1}{2}p_1(y|x; \theta) + \frac{1}{2}p_2(y|x; \beta)
$$
where both $p_1(y|x; \theta)$ and $p_2(y|x; \theta)$ are linear regressions.

### Problem 2a (10 pts)

Write a function that generates synthetic data according to the following probabilistic model.
\begin{align*}
z^i & \sim Bern(0.5) \\
y^i & \sim \begin{cases}
\mathcal{N}(\theta x, 1) & \mbox{when $Z^i = 0$} \\
\mathcal{N}(\beta x, 1) & \mbox{when $Z^i = 1$}
\end{cases}
\end{align*}
In this example, we are assuming that the inputs are a single real number.

In [126]:
def generate_dataset(theta: np.ndarray, beta: np.ndarray, size) -> np.ndarray:
    bern = sp.stats.bernoulli(0.5)
    norm1 = sp.stats.norm(theta, 1)
    norm2 = sp.stats.norm(beta, 1)
    y = []
    
    
    for i in range(size):
        z = bern.rvs()
        if (z == 0):
            y.append(norm1.rvs())
        else:
            y.append(norm2.rvs())
    return y
theta = np.array([0.2])
beta = np.array([0.3])
generate_dataset(theta, beta,100)

[-0.1010237877876598,
 -0.2857986159026535,
 -1.1615849831802425,
 -0.8830828961232033,
 -0.5268558893763522,
 -0.47730744574892997,
 -0.5787691845198843,
 1.296466714388593,
 0.0987417205343534,
 -1.9663954365626044,
 0.7967556925582787,
 -0.15356213359522208,
 0.5245865643691608,
 0.8700192609197923,
 0.7171277146922352,
 1.9232101881524624,
 0.6660864824356139,
 0.7930097935159054,
 -0.8992087390160495,
 1.266571717485329,
 1.5991391847828653,
 0.4079412126610349,
 0.36506375940400543,
 -0.9969978797662336,
 -1.272274217227195,
 -0.20037593386097402,
 0.5674285728903488,
 0.4605138636702868,
 0.15177975749182732,
 0.9050173536388963,
 -1.2151789245560582,
 1.0341208154473736,
 -0.4435216406762849,
 -0.23369134918701612,
 -1.014905496845928,
 -0.9311516258117278,
 2.5476194458109886,
 -0.4838631646235166,
 -0.39682929008060114,
 -0.018267840417143688,
 1.432614685563641,
 -0.09381540656088017,
 0.7890727351235456,
 -0.10752425614426658,
 0.6347644405284665,
 0.21394699814747656,
 1.7

### Problem 2b (15 pts)

Write a function that implements the conditional density
$$
p(y | x; \theta, \beta)
$$
assuming that each $y^i$ is conditionally independent of each other given $z_i$.


In [137]:
def mixture_model_density(theta: float, beta: float, X: np.ndarray, y: np.ndarray) -> float:
    # X is a one dimensional array of x values
    norm1 = sp.stats.norm(loc=np.dot(theta,X), scale=1)
    norm2 = sp.stats.norm(loc=np.dot(beta,X), scale=1)
    res = 1
    for yi in y:
        print(norm1.pdf(yi))
        res *= (0.5)*(norm1.pdf(yi) + norm2.pdf(yi)) 
    return res
theta = 0.2
beta = 0.3
x = np.array([0.2,0.3])
y = generate_dataset(theta,beta,2)
mixture_model_density(theta, beta, x, y)

[0.28952351 0.29413858]
[0.39880698 0.39851958]


array([0.11634232, 0.11847346])

### Problem 2c (15 pts)

Write a function that finds the optimal parameters of $\theta$ and $\beta$, i.e., 
$$
\operatorname{argmin}_{\theta} p(y | x; \theta, \beta) \,.
$$
You can use `generate_dataset` to test if your code is working.

In [7]:
def solve_mixture_model_density(X: np.ndarray, Y: np.ndarray) -> Tuple[float, float]:
    # X contains a length N array of inputs of dimension 1
    # Y contains a length N array of outputs
    theta = 0.0
    beta = 0.0
    
    # TODO: FILL ME IN
    
    return theta, beta

## Problem 3 (20 pts): Linear Regression, Duplicate Data

Suppose we have a dataset $(x_i, y_i)_{1 \leq i \leq N}$.
Suppose we duplicate this dataset so that it now contains a copy of each item.
Explain in words what happens to the weights of the linear regression in terms of the linear algebra solution and in terms of the probabilistic interpretation. Compare and contrast the two solutions. You can additionally write code and give experimental evidence as well.

FILL ME IN