## Logistic Regression Challenge

In [1]:
#import the required packages here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Task 1: Download the data and load them into Python.
You can find the data [**here**](https://drive.google.com/file/d/0Bz9_0VdXvv9bX0MzUEhVdmpCc3c/view?usp=sharing).

**Note**
- Features and response variables are in different files.
- Be careful about number of spaces between the values in the file.

# X_train file exists. Import and clean (change column names also)

In [2]:
X = pd.read_csv("SSD_X_train.txt", sep=' ', header=None, skipinitialspace=True)

In [3]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [4]:
# get features text document and change column names
X_header = pd.read_csv("features.txt", sep=" ", header=None)
X_header.head()

Unnamed: 0,0,1
0,1,tBodyAcc-mean()-X
1,2,tBodyAcc-mean()-Y
2,3,tBodyAcc-mean()-Z
3,4,tBodyAcc-std()-X
4,5,tBodyAcc-std()-Y


In [5]:
X_columns = X_header[1].values.tolist()
X.columns = X_columns
X.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [6]:
# y_train file arimasu, bring it in
y = pd.read_csv("SSD_y_train.txt", header=None)
y.head()

Unnamed: 0,0
0,5
1,5
2,5
3,5
4,5


In [7]:
y.describe()

Unnamed: 0,0
count,7352.0
mean,3.643362
std,1.744802
min,1.0
25%,2.0
50%,4.0
75%,5.0
max,6.0


### Task 2: Create a binary target variable: categories 1,2,3 --> 1, categories 4,5,6 --> 0 
This will represent a binary variable indicating if person is walking or not.

In [8]:
y = y.replace({
    2: 1,
    3: 1,
    4: 0,
    5: 0,
    6: 0
})
y.head() # can't see if it worked. Will assume it did.

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [9]:
y.describe() # nope, do not trust

Unnamed: 0,0
count,7352.0
mean,0.446817
std,0.497197
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [10]:
X.shape

(7352, 561)

In [11]:
y.shape # ready to go

(7352, 1)

### Task 3

+ Create a Univariate Binary Logistic Regression with feature number 54, which represents `tGravityAcc-min()-Y`: gravity acceleration signals in direction of Y.
+ Compare the results of the Logistic regressions from different Python packages (sklearn, statsmodel).
+ Plot the **fit** of predicted probabilities to the original values.

In [12]:
X3 = X["tGravityAcc-min()-Y"]

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X3,y,test_size=0.25,random_state=0)

In [14]:
print(X_train.shape)
print(y_train.shape)

(5514,)
(5514, 1)


In [15]:
X_train = X_train.values.reshape(-1,1)

In [16]:
X_train.shape

(5514, 1)

In [17]:
X_test = X_test.values.reshape(-1,1) # ... should fix value errors.

In [18]:
X_test.shape

(1838, 1)

In [19]:
print(X_train.shape)
print(X_test.shape) # ... should fix value errors.

(5514, 1)
(1838, 1)


In [20]:
y_train.shape

(5514, 1)

In [21]:
X_test

array([[-0.41637958],
       [-0.11063756],
       [-0.35597628],
       ...,
       [ 0.56234036],
       [-0.02822497],
       [-0.20478042]])

In [22]:
y_test

Unnamed: 0,0
3157,0
5687,0
4140,1
4401,0
6518,0
...,...
1172,0
2433,0
5523,0
1042,0


In [23]:
from sklearn import linear_model # for Log Regression
from sklearn import metrics
model = linear_model.LogisticRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

  return f(*args, **kwargs)


0.7671381936887922

In [24]:
import statsmodels.api as sm # statsmodel from last week
statmod = sm.Logit(y_train,X_train)
statmod.fit()
smy_pred=statmod.predict(X_test.T) # previous valueerror fix
sm.Logit(y_train,X_train).fit().summary()

Optimization terminated successfully.
         Current function value: 0.492928
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.492928
         Iterations 7


0,1,2,3
Dep. Variable:,0,No. Observations:,5514.0
Model:,Logit,Df Residuals:,5513.0
Method:,MLE,Df Model:,0.0
Date:,"Tue, 12 Oct 2021",Pseudo R-squ.:,0.284
Time:,15:45:10,Log-Likelihood:,-2718.0
converged:,True,LL-Null:,-3795.9
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-5.1179,0.149,-34.345,0.000,-5.410,-4.826


### Task 4
- Try to fit a Binary Logistic Regression with all the features? How many are significant?

In [25]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [26]:
# COPY PASTE but X_train BASED ON X not X3
model = linear_model.LogisticRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

  return f(*args, **kwargs)


0.999455930359086

In [29]:
# check coef_
model.coef_[0][0] # display too huge...

0.12778675111918233

In [30]:
model.classes_

array([0, 1])

In [31]:
# Confusion matrix check from downloaded Lecture Notes
TN, FP, FN, TP = metrics.confusion_matrix(y_test, y_pred).ravel()
accuracy = (TP+TN)/(TN+FP+FN+TP)
print(accuracy)
precision = TP/(TP+FP) 
print(precision)
recall = TP/(TP+FN)
print(recall)
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.999455930359086
0.998745294855709
1.0
0.9993722536095417


### Task 5
Now, let's fit Multinomial Logistic regression to predict all categories. Firstly, we can start with **Univariate** model for these features number separately:
+ 4
+ 54
- 19

Check the contingency matrix to see the effect of particular features!! (each feature can be good in predicting different categories)

In [33]:
# find those features
X5 = X[['tBodyAcc-std()-X','tGravityAcc-min()-Y','tBodyAcc-energy()-Z']]

In [34]:
X_train,X_test,y_train,y_test=train_test_split(X5,y,test_size=0.25,random_state=0)

In [35]:
model = linear_model.LogisticRegression(multi_class='multinomial')
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

  return f(*args, **kwargs)


In [37]:
model.classes_

array([0, 1])

In [38]:
confusionm = metrics.confusion_matrix(y_test, y_pred)
confusionm

array([[1040,    2],
       [   1,  795]])

In [40]:
# lecture copypasta
TN, FP, FN, TP = metrics.confusion_matrix(y_test, y_pred).ravel()
accuracy = (TP+TN)/(TN+FP+FN+TP)
print(accuracy)
precision = TP/(TP+FP)
print(precision)
recall = TP/(TP+FN)
print(recall)
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.9983677910772579
0.9974905897114178
0.9987437185929648
0.9981167608286252


In [41]:
metrics.cluster.contingency_matrix(y_test,y_pred)

array([[1040,    2],
       [   1,  795]])

### Task 6

Fit the Multinomial Logistic Regression model again. Now, try to choose **all** the important features we have in the dataset. Compare with your peers on who will get the best predictions with the smallest number of features?

In [42]:
from sklearn.feature_selection import RFE

In [44]:
X_train,X_test,y_train,y_test=train_test_split(X5,y,test_size=0.25,random_state=0)

In [45]:
y_train = y_train.values.reshape(5514,) # value error reshape fix AGAIN
y_train.shape

(5514,)

In [47]:
est = linear_model.LogisticRegression(multi_class='multinomial')
sel = RFE(est,n_features_to_select=10, step=1)
sel = sel.fit(X_train, y_train)

In [48]:
sel.estimator_

LogisticRegression(multi_class='multinomial')

In [49]:
sel.support_

array([ True,  True,  True])

In [50]:
sel.ranking_

array([1, 1, 1])

In [51]:
sel.n_features_

3

### Task 7 (Stretch)
Create your own function for Stepwise selection. Use either sklearn or statsmodel.