In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the task CSV and keep every column except the first one
# (presumably a saved row index - confirm against the file).
data = pd.read_csv('task_b.csv').iloc[:, 1:]

In [3]:
# Preview the first rows of the loaded frame (features f1..f3 and target y).
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [4]:
# Correlation of every column with the target 'y'; shows which raw feature
# tracks the label most closely before any model is fitted.
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [5]:
# Per-feature standard deviation (target column excluded) to compare scales.
data.drop(columns='y').std()

f1      488.195035
f2    10403.417325
f3        2.926662
dtype: float64

In [6]:
# Split the frame into a feature matrix and a label vector as NumPy arrays.
feature_columns = ['f1', 'f2', 'f3']
X = data[feature_columns].values
Y = data['y'].values

# Sanity check: one row per sample in both arrays.
for array in (X, Y):
    print(array.shape)

(200, 3)
(200,)


# What if our features have different variances?

<pre>
* <b>As part of this task you will observe how linear models behave when the data has features with different variances</b>
* <b>from the output of the above cells you can observe that var(F2)>>var(F1)>>Var(F3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

<b>Task1</b>

In [7]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

# Fixed seed: both the split and SGD's per-epoch shuffling are random, so
# without it the learned coefficients change on every run.
RANDOM_STATE = 42

# NOTE(review): this stratified hold-out split is not used by the fits below,
# which train on the full (X, Y) so that Task 1 matches Task 2 and the final
# comparison cell. It is kept (now seeded) so the names stay available.
x_train, x_test, y_train_, y_test = train_test_split(
    X, Y, stratify=Y, test_size=0.3, random_state=RANDOM_STATE
)

# penalty is one of {'l2', 'l1', 'elasticnet'} (default 'l2'). 'l1' and
# 'elasticnet' can drive weights to exactly zero (feature selection), which
# 'l2' cannot - useful here because the weights are read as feature importance.
#
# NOTE(review): scikit-learn 1.1 deprecated loss='log' in favour of the
# equivalent loss='log_loss' (removed in 1.3); update the spelling on upgrade.

# Logistic regression via SGD on the raw, unscaled features.
sgd_log_clf = SGDClassifier(loss='log', penalty='l1', max_iter=1000, tol=0.01,
                            n_jobs=-1, random_state=RANDOM_STATE)
sgd_log_clf.fit(X, Y)
log_loss_import = sgd_log_clf.coef_  # shape (1, n_features): one weight per feature

# Linear SVM via SGD on the raw, unscaled features.
sgd_hinge_clf = SGDClassifier(loss='hinge', penalty='l1', max_iter=1000, tol=0.01,
                              n_jobs=-1, random_state=RANDOM_STATE)
sgd_hinge_clf.fit(X, Y)
hinge_loss_import = sgd_hinge_clf.coef_

<b>Task2</b>

In [8]:
# Column-wise standardization: (column - mean(column)) / std(column).
# Series.std() is the pandas default (sample std), same as the per-column loop form.
feature_frame = data.drop('y', axis=1)
std_data = feature_frame.apply(lambda column: (column - column.mean()) / column.std())

In [9]:
sgd_log_clf = SGDClassifier(loss = 'log', penalty = 'l1', max_iter=1000, tol=0.01, n_jobs = -1)
sgd_log_clf.fit(std_data, Y)
std_log_loss_import = sgd_log_clf.coef_

sgd_hinge_clf = SGDClassifier(loss = 'hinge', penalty = 'l1', max_iter=1000, tol=0.01, n_jobs = -1)
sgd_hinge_clf.fit(std_data, Y)
std_hinge_loss_import = sgd_hinge_clf.coef_

# print(std_log_loss_import)
# print(std_hinge_loss_import)

<h3><font color='blue'> Make sure you write the observations for each task, why a particular feautre got more importance than others</font></h3>

In [10]:
data.describe()

Unnamed: 0,f1,f2,f3,y
count,200.0,200.0,200.0,200.0
mean,10.180031,1299.986739,5.00184,0.5
std,488.195035,10403.417325,2.926662,0.501255
min,-1662.57911,-29605.563847,0.076763,0.0
25%,-303.22098,-5626.637315,2.508042,0.0
50%,4.684317,2611.405803,5.029256,0.5
75%,312.23985,8075.864754,7.436617,1.0
max,1130.609573,24131.36072,9.933769,1.0


In [11]:
std_data.describe()

Unnamed: 0,f1,f2,f3
count,200.0,200.0,200.0
mean,-4.4408920000000007e-17,-8.881784e-18,-2.609024e-16
std,1.0,1.0,1.0
min,-3.426416,-2.970711,-1.682831
25%,-0.6419586,-0.6658028,-0.8520965
50%,-0.01125721,0.1260566,0.009367869
75%,0.6187278,0.6513127,0.8319299
max,2.295045,2.194603,1.685172


In [12]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
'''score(X, y[, sample_weight]) 	 Return the mean accuracy on the given test data and labels'''

print('Mean accuracy score WITHOUT Standardization')
print('='*43)

print(f'With SGD & logloss\t\t: {sgd_log_clf.score(X, Y)}')
for index, coeff in enumerate(log_loss_import[0]):
    print(f'f{index + 1} coefficient {round(log_loss_import[0][index], 4)}')
    
print(f'\nWith SGD & hinge loss\t\t: {sgd_hinge_clf.score(X, Y)}')
for index, coeff in enumerate(hinge_loss_import[0]):
    print(f'f{index + 1} coefficient {round(hinge_loss_import[0][index], 4)}')

print('\n\nMean accuracy score WITH Standardization')
print('='*40)
print(f'With SGD & logloss\t\t: {sgd_log_clf.score(std_data, Y)}')
for index, coeff in enumerate(std_log_loss_import[0]):
    print(f'f{index + 1} coefficient {round(std_log_loss_import[0][index], 4)}')
    
print(f'\nWith SGD & hinge loss\t\t: {sgd_hinge_clf.score(std_data, Y)}')
for index, coeff in enumerate(std_hinge_loss_import[0]):
    print(f'f{index + 1} coefficient {round(std_hinge_loss_import[0][index], 4)}')

Mean accuracy score WITHOUT Standardization
With SGD & logloss		: 0.5
f1 coefficient 5557.0949
f2 coefficient 3910.858
f3 coefficient 28649.7251

With SGD & hinge loss		: 0.475
f1 coefficient 5362.8827
f2 coefficient -5909.3132
f3 coefficient 26721.6854


Mean accuracy score WITH Standardization
With SGD & logloss		: 0.905
f1 coefficient 0.0
f2 coefficient 0.0
f3 coefficient 9.1215

With SGD & hinge loss		: 0.91
f1 coefficient 0.0
f2 coefficient 1.7624
f3 coefficient 12.1782


- The raw features have very different spreads (standard deviation: `f1` ≈ 488, `f2` ≈ 10403, `f3` ≈ 2.9).
- Standardization rescales every feature to zero mean and unit variance.
- Standardization therefore does not preserve the original variances of the features.
- Before standardization, the standard deviation of feature `f2` is far larger than that of feature `f3`.


- After standardization the variance is 1 for all features.
- This helped the algorithm predict the class much more effectively.
- As a result, the mean accuracy score improved by a good margin after standardization.

https://youtu.be/0HOqOcln3Z4