# The binary classification task

In [523]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [524]:
# Load the raw NSE daily price export (one row per stock per trading day).
NSE_CSV_PATH = '../NSE_data_all_stocks_2022_jan_to_may (1).csv'
df = pd.read_csv(NSE_CSV_PATH)
df

Unnamed: 0,Date,Code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Change,Change%,Volume,Adjusted Price
0,3-Jan-22,EGAD,Eaagads Ltd,10,15,13.5,13.8,13.5,13.5,-,-,4000,-
1,3-Jan-22,KUKZ,Kakuzi Plc,355,427,385,385,385,385,-,-,-,-
2,3-Jan-22,KAPC,Kapchorua Tea Kenya Plc,80,101,99.5,99.5,99.5,95.5,4,4.19%,100,-
3,3-Jan-22,LIMT,Limuru Tea Plc,260,360,320,320,320,320,-,-,-,-
4,3-Jan-22,SASN,Sasini Plc,16.75,22.6,18.7,18.7,18.7,18.7,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6727,31-May-22,MSC,Mumias Sugar Company Ltd,0.27,0.27,0.27,0.27,0.27,0.27,-,-,-,-
6728,31-May-22,UNGA,Unga Group Ltd,26.1,36.4,29,29,29,30,-1,3.33%,2100,-
6729,31-May-22,SCOM,Safaricom Plc,25.5,45.25,25.95,26.45,26,26.25,-0.25,0.95%,20079900,-
6730,31-May-22,FAHR,Stanlib Fahari Income-REIT,5,7.48,5.5,5.6,5.56,5.58,-0.02,0.36%,11700,-


In [525]:
#Custom Data cleaner 
import random
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler   
# Module-level StandardScaler instance, available to the cells below.
nsc_scaler = StandardScaler()

class NSEDataPreProcessor(BaseEstimator, TransformerMixin):
    """Select one company's rows, cast price columns to numbers and standardise some of them.

    Parameters
    ----------
    company_code : str
        Value of the ``Code`` column identifying the company whose rows are kept.
    columns_to_type_cast : list of str
        Columns converted with ``pd.to_numeric(..., errors='coerce')``; entries
        that cannot be parsed (e.g. the ``'-'`` placeholders) become NaN.
    columns_to_scale : list of str
        Columns standardised with a ``StandardScaler``. They must be numeric
        after the type cast, so they should also appear in
        ``columns_to_type_cast``.
    columns_to_return : list of str
        Columns (and column order) of the frame returned by ``transform``.

    Attributes
    ----------
    scaler_ : StandardScaler
        Scaler fitted by ``fit`` on the selected company's rows and reused by
        ``transform``, so new data is scaled with the statistics learnt
        during fitting instead of being refitted on every call.
    """

    def __init__(self, company_code, columns_to_type_cast, columns_to_scale, columns_to_return):
        # Constructor arguments are stored under their own names: that is what
        # BaseEstimator.get_params / set_params / clone / repr look up.
        self.company_code = company_code
        self.columns_to_type_cast = columns_to_type_cast
        self.columns_to_scale = columns_to_scale
        self.columns_to_return = columns_to_return

    # Read-only aliases for the attribute names used by the earlier version.
    @property
    def to_type_cast(self):
        return self.columns_to_type_cast

    @property
    def to_scale(self):
        return self.columns_to_scale

    @property
    def to_return(self):
        return self.columns_to_return

    def _select_and_cast(self, X):
        """Return a new frame holding this company's rows with numeric price columns."""
        # reset_index returns a new object, so the caller's frame is not modified.
        company_df = X[X['Code'] == self.company_code].reset_index(drop=True)
        if company_df.empty:
            raise ValueError(
                f"No rows found for company code {self.company_code!r}."
            )
        for column_name in self.columns_to_type_cast:
            company_df[column_name] = pd.to_numeric(company_df[column_name], errors='coerce')
        return company_df

    def fit(self, X, y=None):
        """Learn the scaling statistics from the selected company's rows in ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Raises ``ValueError`` if ``X`` has no rows for ``company_code``.
        Returns ``self``.
        """
        company_df = self._select_and_cast(X)
        self.scaler_ = StandardScaler().fit(company_df[list(self.columns_to_scale)])
        return self

    def transform(self, X):
        """Return the company's rows with cast and scaled columns, limited to ``columns_to_return``.

        Raises ``NotFittedError`` if ``fit`` has not been called, and
        ``ValueError`` if ``X`` has no rows for ``company_code``.
        """
        # Local import keeps this block self-contained within the notebook cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'scaler_')
        company_df = self._select_and_cast(X)
        scale_columns = list(self.columns_to_scale)
        company_df[scale_columns] = self.scaler_.transform(company_df[scale_columns])
        return company_df[list(self.columns_to_return)]
    
# Pick the company to model. The stdlib `random` module is not covered by
# np.random.seed, so a dedicated seeded generator is used to make the choice
# repeatable on a fresh kernel; change COMPANY_SEED to explore other companies.
COMPANY_SEED = 42
random_company_code = random.Random(COMPANY_SEED).choice(df.Code.tolist())

#TPSE, UMME  giving the best results .

__nse_dat_preprocessor = NSEDataPreProcessor(
    company_code=random_company_code,
    columns_to_type_cast=['Previous', 'Day Price', 'Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_scale=['Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_return=['Code', 'Day Price', 'Previous', 'Day High', 'Day Low', '12m High', '12m Low'],
)
xyz_preprocessed_df = __nse_dat_preprocessor.fit_transform(df)
display(xyz_preprocessed_df.head(2))  # Show the company being worked on.

# The company code is constant within the frame, so it is dropped before modelling.
preprocessed_df = xyz_preprocessed_df.drop(columns=['Code'])
preprocessed_df
    

Unnamed: 0,Code,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,EABL,165.5,165.0,1.623216,0.577051,0.0,0.735372
1,EABL,165.0,165.5,1.265554,1.736839,0.0,0.735372


Unnamed: 0,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,165.50,165.00,1.623216,0.577051,0.0,0.735372
1,165.00,165.50,1.265554,1.736839,0.0,0.735372
2,163.75,165.00,1.265554,1.092512,0.0,0.735372
3,160.75,163.75,1.265554,1.092512,0.0,0.735372
4,164.75,160.75,1.265554,1.640190,0.0,0.735372
...,...,...,...,...,...,...
97,141.00,141.00,-1.595749,-1.420361,0.0,-1.394191
98,141.00,141.00,-1.476528,-1.484793,0.0,-1.394191
99,141.00,141.00,-1.476528,-1.355928,0.0,-1.394191
100,141.50,141.00,-1.476528,-1.355928,0.0,-1.394191


In [526]:
# Column-name bookkeeping for the modelling cells.
x_features = ['Previous', 'Day High', 'Day Low', '12m High', '12m Low']
x_single_feature = y_classes = 'Day Price'

# Single-column view of the day price, plus the two price columns that the
# following cells build on.
X = preprocessed_df.loc[:, x_single_feature]
day_price_df = preprocessed_df.loc[:, ['Day Price', 'Previous']]
day_price_df


Unnamed: 0,Day Price,Previous
0,165.50,165.00
1,165.00,165.50
2,163.75,165.00
3,160.75,163.75
4,164.75,160.75
...,...,...
97,141.00,141.00
98,141.00,141.00
99,141.00,141.00
100,141.50,141.00


## Calculating Returns : The percentage change in stock price over the specified time period
### returns = ((currentDayPrice - previousDayPrice) * 100) / previousDayPrice

In [527]:
import pandas_ta as ta

# Rebuild from the untouched price columns on every run so the cell is
# idempotent: re-running it no longer recomputes the EMAs on a frame whose
# leading rows were already dropped by a previous execution.
day_price_df = preprocessed_df[['Day Price', 'Previous']].copy()

# Exponential moving averages of the previous close over two window lengths.
previous_close = day_price_df['Previous']
day_price_df['EMA_LONG'] = ta.ema(previous_close, length=5)
day_price_df['EMA_SHORT'] = ta.ema(previous_close, length=3)

# Crossover signal: +1 when the short EMA is above the long EMA, -1 when it
# is below, and 0 otherwise (equal values, or an EMA that is still NaN).
day_price_df['Signal'] = 0
day_price_df.loc[day_price_df['EMA_SHORT'] > day_price_df['EMA_LONG'], 'Signal'] = 1
day_price_df.loc[day_price_df['EMA_SHORT'] < day_price_df['EMA_LONG'], 'Signal'] = -1

# Drop the leading rows for which the EMAs are not yet defined.
day_price_df = day_price_df.dropna()

day_price_df

Unnamed: 0,Day Price,Previous,EMA_LONG,EMA_SHORT,Signal
4,164.75,160.75,164.000000,162.604167,-1
5,163.25,164.75,164.250000,163.677083,-1
6,161.00,163.25,163.916667,163.463542,-1
7,160.00,161.00,162.944444,162.231771,-1
8,151.50,160.00,161.962963,161.115885,-1
...,...,...,...,...,...
97,141.00,141.00,141.198872,141.159404,-1
98,141.00,141.00,141.132581,141.079702,-1
99,141.00,141.00,141.088388,141.039851,-1
100,141.50,141.00,141.058925,141.019925,-1


In [528]:
# Feature matrix: the two moving averages. Target: the crossover signal.
# NOTE(review): 'Signal' is computed directly from comparing these two
# columns, so the classifier is asked to recover a rule its inputs already
# encode - confirm this is the intended learning task.
feature_columns = ['EMA_LONG', 'EMA_SHORT']
X = day_price_df.loc[:, feature_columns]
y = day_price_df.loc[:, 'Signal']
X

Unnamed: 0,EMA_LONG,EMA_SHORT
4,164.000000,162.604167
5,164.250000,163.677083
6,163.916667,163.463542
7,162.944444,162.231771
8,161.962963,161.115885
...,...,...
97,141.198872,141.159404
98,141.132581,141.079702
99,141.088388,141.039851
100,141.058925,141.019925


In [529]:
from sklearn.model_selection import train_test_split


from sklearn.model_selection import TimeSeriesSplit

# np.asarray accepts both pandas objects and arrays, so this cell can be
# re-run safely (`X.values` fails once X is already an ndarray).
X = np.asarray(X)
y = np.asarray(y)

# Single hold-out split: 80 % train / 20 % test with a fixed seed.
# The earlier hand-rolled TimeSeriesSplit sampling was immediately overwritten
# by this call and never reached the model, so it has been removed.
# NOTE(review): train_test_split shuffles by default, so test rows are
# interleaved in time with training rows. Pass shuffle=False for a
# chronological hold-out if look-ahead is a concern.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape

(78, 2)

In [530]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with a fixed seed (fit returns the estimator itself),
# then predict the signal for the held-out rows.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
predictions = model.predict(X_test)

# Side-by-side table of true and predicted signals for inspection.
results = pd.DataFrame({"Actual": y_test, "Predicted": predictions})
results



Unnamed: 0,Actual,Predicted
0,1,1
1,-1,-1
2,-1,-1
3,1,1
4,-1,1
5,-1,-1
6,1,-1
7,1,-1
8,-1,-1
9,-1,-1


In [531]:
from sklearn.metrics import  accuracy_score,recall_score, precision_score, confusion_matrix


# Each label now prints the metric it names (recall and precision were swapped).
# NOTE: with average='micro' on a single-label problem, precision and recall
# both equal accuracy; use average='macro' or average=None for per-class insight.
print("Accuracy:", accuracy_score(y_test, predictions))
print("Recall:", recall_score(y_test, predictions, average='micro'))
print("Precision:", precision_score(y_test, predictions, average='micro'))

# Rows are true classes, columns are predicted classes (sorted label order).
conf_matrix = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
conf_matrix


Accuracy: 0.75
Recall: 0.75
Precision: 0.75
Confusion Matrix:


array([[12,  2],
       [ 3,  3]])