# Machine Learning Pretest
## First Part: Multiple Choices
## https://goo.gl/forms/emElBoR6SeBAAvMN2

## Second Part: Coding Exercise

## Exercise 1
### Simulate N coin flips and count number of head and number of tail

In [75]:
import random

def simulate_coin_flips(N, p):
    """
    return (number of head, number of tail)
    """
    result = {'H':0, 'T':0}
    for _ in range(N):
        _ran = random.choices(['H','T'], weights=(p, 1-p) )
        result[_ran[0]]+=1
    return result['H'],result['T']

In [76]:
simulate_coin_flips(10, .5)

(7, 3)

## Exercise 2
### Write a program to simulate data and answer this question: How many coin flips on average does it take to get n consecutive heads?

In [1390]:
def average_coin_flips_until_n_heads(n, p):
    """
    return the answer
    """
    consecutive_count = 0
    toss_count = 0
        
    while consecutive_count < n:
        _ran = random.choices(['H','T'], weights=(p, 1-p) )
        if _ran[0] == 'H':
            consecutive_count +=1
        toss_count+=1
        
             
    return toss_count

In [1391]:
average_coin_flips_until_n_heads(100, 0.5)

192

## Exercise 3
### Fit linear regression on the simulated data below and show that the fitted parameters are correct

In [720]:
import numpy as np

x = np.random.normal(size=(100,2))
y = 4 + 3 * x[:,0] + 5 * x[:,1]

In [911]:
w = np.array([3,5])
b = 4
np.mean(np.square(y - (x.dot(w)+b)))

2.060899114889893e-31

In [979]:
class LinearRegression():
    
    def __init__(self, learning_rate = 0.1):
        self.weigths = None
        self.bias = 0
        self.learning_rate = learning_rate
    
    def fit(self,X,Y):
        
        # init weigths parameter
        self.weigths = np.zeros(X.shape[1])
        self.bias = 0
        step_count = 0
        
        # learn until cost not improve
        while(True):
            pred_a = self.predict(X)
            cost_a = self.cost_fn(Y,pred_a)
            
            grad_w, grad_b = self.gradient(X,Y,pred_a)
            
            # update weigth parameters
            self.weigths = self.weigths - self.learning_rate*grad_w

            # update bias parameter
            self.bias = self.bias - self.learning_rate*grad_b
            
            pred_b = self.predict(X)
            cost_b = self.cost_fn(Y,pred_b)
            
            step_count+=1
            
            # check cost improvement
            if cost_b >= cost_a:
                print("Training loop : {}".format(step_count))
                print("Error : {}".format(cost_b))
                break
            
    def predict(self, X):
        return np.dot(X,self.weigths)+self.bias
    
    def cost_fn(self,Y,preds):
        return np.mean(np.square(Y-preds))
    
    def gradient(self,X,Y,preds):
        # gradient lse loss W -> x_i*-2(y_j−(w_0*x_0+w_1*x_1+..+w_n*x_n+b))
        gradient_weigth = -2*(np.dot((Y-preds),X))/Y.shape[0]
        
        # gradient lse loss b -> (y_j*−2(w_0*x_0+w_1*x_1+..+w_n*x_n+b))
        gradient_bias = -2*(np.mean(Y-preds))
        
        return gradient_weigth, gradient_bias

In [982]:
model = LinearRegression(learning_rate = 0.1)
model.fit(x,y)

print("W : {}".format(model.weigths))
print("b : {}".format(model.bias))

Training loop : 235
Error : 8.877643412130962e-30
W : [3. 5.]
b : 3.999999999999999


## Exercise 4
### Given credit card fraud data below, fit a classification model on the trainData and evaluate the model on the testData using Area under curve of ROC (AUC) as a metric
### The label column is "Class" and other columns are anonymized features
### The data is sample from this dataset: https://www.kaggle.com/dalpozz/creditcardfraud

In [729]:
import pandas as pd

In [730]:
trainData = pd.read_csv("trainData.csv")

In [731]:
testData = pd.read_csv("testData.csv")

In [1383]:
class LogisticRegression():
    
    def __init__(self, learning_rate = 0.1, max_iter = 100):
        self.weigths = None
        self.max_iter = max_iter
        self.learning_rate = learning_rate
    
    def fit(self,X,Y):
        
        # init weigths parameter
        self.weigths = np.zeros(X.shape[1]+1)
        step_count = 0
        
        
        # learn until cost not improve
        for _ in range(self.max_iter):
            pred_a = self.predict_prob(X)
            cost_a = self.cost_fn(Y,pred_a)
            
            grad_w = self.gradient(X,Y,pred_a)
            
            # update weigth parameters
            self.weigths = self.weigths - self.learning_rate*grad_w

            pred_b = self.predict_prob(X)
            cost_b = self.cost_fn(Y,pred_b)
            
            assert not np.isnan(cost_b)
            step_count+=1
            
            # check cost improvement
            if cost_b >= cost_a or (step_count+1)%100 == 0:
                
                print("Training loop : {}".format(step_count))
                print("Error : {}".format(cost_b))
                
                if cost_b >= cost_a:
                    break
    
    def sigmoid(self, x):
        return 1/(1+np.exp(-x))
    
    def predict_prob(self, X):
        bx = np.ones((X.shape[0], 1))
        X = np.concatenate((X,bx), axis=1)
        hx = np.dot(X, self.weigths)
        return self.sigmoid(hx)
    
    def predict(self, X):
        bx = np.ones((X.shape[0], 1))
        X = np.concatenate((X,bx), axis=1)
        hx = np.dot(X,self.weigths)
        hx = self.sigmoid(hx)
        return hx.round()
    
    def cost_fn(self,Y,preds):
        
        term1 = Y*np.log(preds)
        term2 = (1-Y)*np.log(1-preds)
        cost = term1 + term2

        return np.mean(-cost)
    
    def gradient(self,X,Y,preds):
        bx = np.ones((X.shape[0], 1))
        X = np.concatenate((X,bx), axis=1)
        gradient_weigth = np.dot(X.T,  (preds - Y))/X.shape[0]
        
        return gradient_weigth
    
    def score(self, X, Y):
        preds = self.predict(X)
        return np.mean((Y==preds).all())
        

In [1384]:
X_train = trainData.drop('Class', axis=1).values
Y_train = trainData['Class'].values
X_test = testData.drop('Class', axis=1).values
Y_test = testData['Class'].values

In [1385]:
model = LogisticRegression(learning_rate = 1e-9, max_iter=1500)
model.fit(X_train,Y_train)

print("W : {}".format(model.weigths))

Training loop : 99
Error : 0.2077993156346036
Training loop : 199
Error : 0.207798989296608
Training loop : 299
Error : 0.2077986631523494
Training loop : 399
Error : 0.20779833720165294
Training loop : 499
Error : 0.2077980114443439
Training loop : 599
Error : 0.20779768588024783
Training loop : 699
Error : 0.2077973605091903
Training loop : 799
Error : 0.20779703533099736
Training loop : 899
Error : 0.207796710345495
Training loop : 999
Error : 0.20779638555250957
Training loop : 1099
Error : 0.20779606095186753
Training loop : 1199
Error : 0.20779573654339564
Training loop : 1299
Error : 0.20779541232692073
Training loop : 1399
Error : 0.20779508830226992
Training loop : 1499
Error : 0.20779476446927053
W : [-3.77499696e-05 -1.85447123e-07  1.60612317e-07 -4.05119250e-07
  2.04321387e-07 -1.14297082e-07 -7.87352761e-08 -2.41633830e-07
  2.11177340e-08 -1.36809931e-07 -2.71139451e-07  1.55111634e-07
 -2.77177560e-07 -1.87657475e-08 -3.68844999e-07 -1.65994400e-08
 -1.99494103e-07 -3.

In [1386]:
model.score(X_test, Y_test)

0.0

In [1387]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

l = LogisticRegression(C = 1, penalty = 'l1')
l.fit(X_train,Y_train)
pred = l.predict(X_test)
roc_auc_score(pred, Y_test)

0.9762308818226173