In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import SGDClassifier as SGD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Task 1

## 1.1 Loading Data

In [2]:
# Read the dataset and discard its first column by position, keeping only
# the columns that follow it.
data = pd.read_csv('task_b.csv').iloc[:, 1:]

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [4]:
# Correlation of every column with the target column y.
data.corr().loc[:, 'y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [5]:
# Per-column standard deviation, to compare the scales of the columns.
data.std(axis=0)

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [6]:
# Feature matrix X (columns f1, f2, f3) and target vector Y (column y),
# both taken out of the frame as arrays.
X = data.loc[:, ['f1', 'f2', 'f3']].values
Y = data.loc[:, 'y'].values

# Show the shape of each array, one per line.
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


## 1.2 Applying Logistic Regression

In [8]:
# Logistic regression trained by SGD: log loss, L2 penalty, and a constant
# learning rate of eta0. verbose=2 turns on the per-epoch training log.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version before upgrading.
clf = SGD(
    loss='log',
    penalty='l2',
    alpha=0.0001,
    learning_rate='constant',
    eta0=0.0001,
    tol=1e-3,
    random_state=15,
    verbose=2,
)
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [9]:
# Train the logistic-regression model on the raw (unscaled) features.
clf.fit(X, Y)

-- Epoch 1
Norm: 1.08, NNZs: 3, Bias: -0.001751, T: 200, Avg. loss: 2516.147588
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 0.61, NNZs: 3, Bias: -0.001551, T: 400, Avg. loss: 2621.694380
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 0.35, NNZs: 3, Bias: -0.001850, T: 600, Avg. loss: 3285.222158
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 0.64, NNZs: 3, Bias: -0.003527, T: 800, Avg. loss: 3142.216822
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 0.48, NNZs: 3, Bias: -0.004027, T: 1000, Avg. loss: 3009.886714
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 1.40, NNZs: 3, Bias: -0.003523, T: 1200, Avg. loss: 3032.001946
Total training time: 0.01 seconds.
Convergence after 6 epochs took 0.01 seconds


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [10]:
# Learned feature weights and bias term of the fitted model.
(clf.coef_, clf.intercept_)


(array([[ 0.37170471, -1.34463853,  0.12669033]]), array([-0.00352309]))

Conclusion:
1. The second feature (f2) has the largest magnitude (absolute value) in the coefficient array: -1.34.
2. The negative coefficient for f2 indicates that it pushes the classification towards the negative class.
3. The first feature (f1) has the largest positive coefficient (0.37), which means it pushes the classification most strongly towards the positive class.

## 1.3 Applying SVM

In [11]:
# Linear SVM trained by SGD: hinge loss, L2 penalty, and a constant learning
# rate of eta0. verbose=2 turns on the per-epoch training log.
clf = SGD(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    learning_rate='constant',
    eta0=0.0001,
    tol=1e-3,
    random_state=15,
    verbose=2,
)

In [12]:
# Train the hinge-loss model on the raw (unscaled) features.
clf.fit(X, Y)

-- Epoch 1
Norm: 0.61, NNZs: 3, Bias: -0.001600, T: 200, Avg. loss: 2634.084615
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 0.68, NNZs: 3, Bias: -0.001100, T: 400, Avg. loss: 2593.136418
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 0.76, NNZs: 3, Bias: -0.000900, T: 600, Avg. loss: 3308.216351
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 0.77, NNZs: 3, Bias: -0.002700, T: 800, Avg. loss: 3155.085896
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 0.93, NNZs: 3, Bias: -0.002800, T: 1000, Avg. loss: 3080.501847
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 0.43, NNZs: 3, Bias: -0.002700, T: 1200, Avg. loss: 3011.887174
Total training time: 0.01 seconds.
-- Epoch 7
Norm: 0.69, NNZs: 3, Bias: -0.002200, T: 1400, Avg. loss: 3002.132514
Total training time: 0.02 seconds.
Convergence after 7 epochs took 0.02 seconds


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='hinge', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [13]:
# Learned feature weights and bias term of the fitted hinge-loss model.
(clf.coef_, clf.intercept_)


(array([[ 0.38249139, -0.55764501,  0.15407861]]), array([-0.0022]))

Conclusion:
1. The second feature (f2) has the largest magnitude (absolute value) in the coefficient array: -0.56.
2. The negative coefficient for f2 indicates that it pushes the classification towards the negative class.
3. The first feature (f1) has the largest positive coefficient (0.38), which means it pushes the classification most strongly towards the positive class.
4. Although the SVM also finds f2 to be the most important feature for classifying points into the negative class, its effect (-0.56) is
   weaker than it was in logistic regression (-1.34).

# Task 2

## 2.1 Loading Data

In [14]:
# Reload the dataset for Task 2, discard its first column by position, and
# preview the first five rows.
data = pd.read_csv('task_b.csv').iloc[:, 1:]
data.head(n=5)

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [15]:
# Feature matrix X (columns f1, f2, f3) and target vector Y (column y),
# both taken out of the frame as arrays.
X = data.loc[:, ['f1', 'f2', 'f3']].values
Y = data.loc[:, 'y'].values

# Show the shape of each array, one per line.
print(X.shape, Y.shape, sep='\n')

(200, 3)
(200,)


## 2.2 Standardizing Data

In [23]:
# Fit the scaler on the feature matrix, then transform the same matrix to
# obtain the standardized features Std_X used by the Task 2 models.
std = StandardScaler()
Std_X = std.fit(X, Y).transform(X)


## 2.3 Applying LR on Standardized Data

In [17]:
# Logistic regression trained by SGD: log loss, L2 penalty, and a constant
# learning rate of eta0. verbose=2 turns on the per-epoch training log.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' --
# confirm against the installed version before upgrading.
clf = SGD(
    loss='log',
    penalty='l2',
    alpha=0.0001,
    learning_rate='constant',
    eta0=0.0001,
    tol=1e-3,
    random_state=15,
    verbose=2,
)
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [18]:
# Train the logistic-regression model on the standardized features.
clf.fit(Std_X, Y)

-- Epoch 1
Norm: 0.01, NNZs: 3, Bias: 0.000001, T: 200, Avg. loss: 0.691431
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 0.02, NNZs: 3, Bias: 0.000002, T: 400, Avg. loss: 0.687922
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 0.03, NNZs: 3, Bias: 0.000002, T: 600, Avg. loss: 0.684449
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 0.03, NNZs: 3, Bias: 0.000003, T: 800, Avg. loss: 0.681011
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 0.04, NNZs: 3, Bias: 0.000003, T: 1000, Avg. loss: 0.677608
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 0.05, NNZs: 3, Bias: 0.000003, T: 1200, Avg. loss: 0.674240
Total training time: 0.01 seconds.
-- Epoch 7
Norm: 0.06, NNZs: 3, Bias: 0.000003, T: 1400, Avg. loss: 0.670905
Total training time: 0.01 seconds.
-- Epoch 8
Norm: 0.07, NNZs: 3, Bias: 0.000003, T: 1600, Avg. loss: 0.667605
Total training time: 0.01 seconds.
-- Epoch 9
Norm: 0.07, NNZs: 3, Bias: 0.000002, T: 1800, Avg. loss: 0.664338
Total training time: 0.02 secon

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='log', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [19]:
# Learned feature weights and bias term after training on standardized data.
(clf.coef_, clf.intercept_)

(array([[ 0.03851242, -0.00553122,  0.88963322]]), array([-0.00060297]))

Conclusions:
1. After standardizing the data, we observe a huge drop in the average loss, which comes down from 3032 to 0.413.
2. The values of the coefficients also change.
3. The third feature (f3) becomes the most important feature, as it has the largest coefficient (0.89).
4. The second feature (f2) still has a negative coefficient, so it is still the feature that pushes points towards the negative class, although its magnitude (-0.0055) is now very small.

## 2.4 Applying SVM on Standardized Data

In [24]:
# Linear SVM trained by SGD: hinge loss, L2 penalty, and a constant learning
# rate of eta0. verbose=2 turns on the per-epoch training log.
clf = SGD(
    loss='hinge',
    penalty='l2',
    alpha=0.0001,
    learning_rate='constant',
    eta0=0.0001,
    tol=1e-3,
    random_state=15,
    verbose=2,
)

In [27]:
# Train the hinge-loss model on the standardized features.
clf.fit(Std_X, Y)

-- Epoch 1
Norm: 0.02, NNZs: 3, Bias: 0.000000, T: 200, Avg. loss: 0.993111
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 0.03, NNZs: 3, Bias: -0.000000, T: 400, Avg. loss: 0.978934
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 0.05, NNZs: 3, Bias: 0.000000, T: 600, Avg. loss: 0.964757
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 0.07, NNZs: 3, Bias: 0.000000, T: 800, Avg. loss: 0.950580
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 0.08, NNZs: 3, Bias: -0.000000, T: 1000, Avg. loss: 0.936403
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 0.10, NNZs: 3, Bias: 0.000000, T: 1200, Avg. loss: 0.922226
Total training time: 0.01 seconds.
-- Epoch 7
Norm: 0.12, NNZs: 3, Bias: 0.000000, T: 1400, Avg. loss: 0.908049
Total training time: 0.01 seconds.
-- Epoch 8
Norm: 0.13, NNZs: 3, Bias: 0.000000, T: 1600, Avg. loss: 0.893873
Total training time: 0.01 seconds.
-- Epoch 9
Norm: 0.15, NNZs: 3, Bias: 0.000000, T: 1800, Avg. loss: 0.879696
Total training time: 0.01 sec

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0001,
              fit_intercept=True, l1_ratio=0.15, learning_rate='constant',
              loss='hinge', max_iter=1000, n_iter_no_change=5, n_jobs=None,
              penalty='l2', power_t=0.5, random_state=15, shuffle=True,
              tol=0.001, validation_fraction=0.1, verbose=2, warm_start=False)

In [28]:
# Learned feature weights and bias term after training on standardized data.
(clf.coef_, clf.intercept_)

(array([[0.04248904, 0.02585179, 1.07272982]]), array([0.0143]))

Conclusions:
1. After standardizing the data, we observe a huge drop in the average loss, which comes down from 3002 to 0.299.
2. The values of the coefficients also change.
3. The third feature (f3) becomes the most important feature, as it has the largest coefficient (1.07).
 