In [1]:
# importing library
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import datetime
import pandas as pd
import yfinance as yf

**Logistic Regression :**
    
Logistic Regression solves classification problems as we use it for binary classification. Logistic regression estimates the probability of an event occurring based on a given dataset of independent variables for ex.stock will go up or down, vote or don't vote.

In logistic regression, a logit transformation is applied on the odds- probability of sucess is divided by probability of failure.

 In Logistic regression we have to deal with **sigmoid-function** which can be represented as :

**$$f(x) =\frac{1}{1+ e^{-x}}$$**

f(x) has value between [0,1]

It can interpreted as probability.




In [2]:

def download_data(stock,start,end):
    data={}
    ticker = yf.download(stock,start,end)
    data["Close"] = ticker["Adj Close"]
    return pd.DataFrame(data)

def construct_features(data,lags=2):
    #calculate the lagged adjusted closing prices(name=close)
    for i in range(0,lags):
        data["Lag%s" % str(i+1)] = data["Close"].shift(i+1)
    #calculate the percent of actual changes
    data["Today Change"] = data["Close"]
    data["Today Change"] = data["Today Change"].pct_change() * 100
    
    # calculate the lags in percentage(normalization)
    for i in range(0,lags):
        data["Lag%s" % str(i+1)] = data["Lag%s" % str(i+1)].pct_change()*100
    
    # direction - the target variable
    data["Direction"] = np.where(data["Today Change"] > 0 , 1 , -1)

In [3]:
start_date= datetime.datetime(2019,1,1)
end_date = datetime.datetime(2020,1,1)

stock_data = download_data("HDFCBANK.NS",start_date,end_date)
construct_features(stock_data)
stock_data = stock_data.dropna()

#features and the labels(target variables)

X = stock_data[["Lag1", "Lag2"]]
y = stock_data["Direction"]


[*********************100%***********************]  1 of 1 completed


In [4]:
# split the data into training and test set(80%-20%)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [5]:
#fitting the model
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

In [6]:
predictions = model.predict(X_test)

**Confusion Matrix**

Confusion matrix is one of the easiest and most intuitive metrics used for finding the accuracy of a classification model,where the output can be of two or more categories.

**Diagonal elements: the correct classification**

**of-diagonals: incorrect predictions**

In [7]:
print('Accuracy of the model: %.2f' % accuracy_score(y_test, predictions))
print(confusion_matrix(predictions, y_test))


Accuracy of the model: 0.54
[[ 6  5]
 [17 20]]


**From the above output we can say that Our model has shown only 54% accuracy which means our predicted set and test set match 54% of total values of test set. This model is not suitable for HDFCBANK because this model only predicts 54% of correct values.**