In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('task_b.csv')
data=data.iloc[:,1:]

In [3]:
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [4]:
data.var()

f1    2.383344e+05
f2    1.082311e+08
f3    8.565349e+00
y     2.512563e-01
dtype: float64

In [5]:
X=data[['f1','f2','f3']].values
Y=data['y'].values
print(X.shape)
print(Y.shape)

(200, 3)
(200,)


In [6]:
s=data.groupby('y')
s.count()

#Data is perfectly balanced

Unnamed: 0_level_0,f1,f2,f3
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,100,100,100
1.0,100,100,100


# What if our features are with different variance 

<pre>
* <b>As part of this task you will observe how linear models work in case of data having feautres with different variance</b>
* <b>from the output of the above cells you can observe that var(F2)>>var(F1)>>Var(F3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

In [7]:
#Performing Hyperparamater tuning
cv_model=SGDClassifier(loss='log',penalty='l2',n_jobs=-1,eta0=0.001,random_state=1)
parameter={'alpha':[100,50,2,1,0.1,0.2,0.5,0.6,0.8,0.01,0.001]}
validate=GridSearchCV(cv_model,parameter,cv=5,scoring='roc_auc',return_train_score=True, n_jobs=-1)
validate.fit(X,Y)
validate.best_params_

{'alpha': 0.6}

## Task 1

In [8]:
#APPLYING MODEL WITHOUT STANDARDIZATION
#Logistic Regression
model=SGDClassifier(loss='log',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01,alpha=0.6)
model.fit(X,Y)
f_weights=model.coef_
imp1=(np.argsort(f_weights))

In [9]:
cv_model=SGDClassifier(loss='hinge',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01)
parameter={'alpha':[150,200,100,50,1,0.1,0.2,0.01,0.001]}
validate=GridSearchCV(cv_model,parameter,cv=3,scoring='roc_auc',return_train_score=True, n_jobs=-1)
validate.fit(X,Y)
validate.best_params_

{'alpha': 100}

In [10]:
model_sv=SGDClassifier(loss='hinge',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01,alpha=100)
model_sv.fit(X,Y)
f_weights_sv=model_sv.coef_
imp2=(np.argsort(f_weights_sv))

## Task 2

In [11]:
scaler=StandardScaler()
scaler.fit(X)
x_t=scaler.transform(X)

In [12]:
pd.DataFrame(x_t).std()

0    1.002509
1    1.002509
2    1.002509
dtype: float64

In [13]:
pd.DataFrame(x_t).var()

0    1.005025
1    1.005025
2    1.005025
dtype: float64

In [14]:
#Finding the best Hyperparamater
cv_model_lr=SGDClassifier(loss='log',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01)
parameter={'alpha':[150,200,100,50,1,0.1,0.2,0.01,0.001]}
validate_lr=GridSearchCV(cv_model_lr,parameter,cv=5,scoring='roc_auc',return_train_score=True, n_jobs=-1)
validate_lr.fit(x_t,Y)
validate.best_params_

{'alpha': 100}

In [15]:
model_lr=SGDClassifier(loss='log',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01,alpha=100)
model_lr.fit(x_t,Y)
f_weights_lr=model_lr.coef_
imp3=(np.argsort(f_weights_lr))

In [16]:
cv_model_hinge=SGDClassifier(loss='hinge',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01)
parameter={'alpha':[100,50,1,0.1,0.2,0.5,0.6,0.01,0.001]}
validate_hinge=GridSearchCV(cv_model_hinge,parameter,cv=5,scoring='roc_auc',return_train_score=True, n_jobs=-1)
validate_hinge.fit(x_t,Y)
validate_hinge.best_params_

{'alpha': 0.2}

In [17]:
model_hinge=SGDClassifier(loss='hinge',penalty='l2',n_jobs=-1,random_state=0,eta0=0.01,alpha=0.2)
model_hinge.fit(x_t,Y)
f_weights_hinge=model_hinge.coef_
imp4=(np.argsort(f_weights_hinge))

In [18]:
fin_res={'Without Standardization Log Loss':imp1,'With Standardization Log Loss':imp3,'Without Standardization Hinge Loss':imp2,'With Standardization Hinge Loss':imp4}

In [19]:
for i,j in enumerate((fin_res.items())):
    print(i,j)

0 ('Without Standardization Log Loss', array([[1, 2, 0]], dtype=int64))
1 ('With Standardization Log Loss', array([[1, 0, 2]], dtype=int64))
2 ('Without Standardization Hinge Loss', array([[0, 2, 1]], dtype=int64))
3 ('With Standardization Hinge Loss', array([[0, 1, 2]], dtype=int64))


- Trying to understand the relationship between the Variance and Std Deviation

In [20]:
#Data before Standarization
print('Std Deviation of Data after Standardization')
print(pd.DataFrame(x_t).std())
print('-'*60)
print('Variance of Data "After" Standardization')
print(pd.DataFrame(x_t).var())

Std Deviation of Data after Standardization
0    1.002509
1    1.002509
2    1.002509
dtype: float64
------------------------------------------------------------
Variance of Data "After" Standardization
0    1.005025
1    1.005025
2    1.005025
dtype: float64


In [21]:
#Data after Standarization
print('Std Deviation of Data before Standardization')
print(pd.DataFrame(X).std())
print('-'*60)
print('Variance of Data before Standardization')
print(pd.DataFrame(X).var())

Std Deviation of Data before Standardization
0      488.195035
1    10403.417325
2        2.926662
dtype: float64
------------------------------------------------------------
Variance of Data before Standardization
0    2.383344e+05
1    1.082311e+08
2    8.565349e+00
dtype: float64


In [22]:
print('Correlation between features in Data Before Standardization')
print('-'*60)
print(pd.DataFrame(X).corr())
print('*'*60)
print('*'*60)
print('\nCorrelation between features in Data After Standardization')
print('-'*60)
print(pd.DataFrame(x_t).corr())

Correlation between features in Data Before Standardization
------------------------------------------------------------
          0         1         2
0  1.000000  0.065468  0.123589
1  0.065468  1.000000 -0.055561
2  0.123589 -0.055561  1.000000
************************************************************
************************************************************

Correlation between features in Data After Standardization
------------------------------------------------------------
          0         1         2
0  1.000000  0.065468  0.123589
1  0.065468  1.000000 -0.055561
2  0.123589 -0.055561  1.000000


<h3><font color='blue'> Make sure you write the observations for each task, why a particular feautre got more importance than others</font></h3>

- ### Explain how feature importance is affected by the correlation and variance(std-dev) of each of the features in each of the tasks. Why has one feature got more weight than the other?

1) The correlation between the features stay the same even after standardization, so correlation is not affected when we perform standardization.

- ### How did standardization impact the feature importance of different features and compare the results of this case with the non-standardized ones and explain the reason for differences between the weight vectors in both the cases.

1) There is no specific pattern defining the difference in weights because of standardization, however because the difference in weights is observed, we can conclude that this will definitely affect the performance of the model on a whole.

- ### Compare the results of both the models as well in each task separately and justify the difference if any.

1) There is no specific detail or justification as to why a feature is important over another, as even though weights vary, which weights are optimum/best depends on the models performance in terms of "Accuracy, Probabilities etc".