In [64]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input, InputLayer, Dropout

import pandas as pd
import numpy as np

np.random.seed(42)

import os
import tensorflow as tf

#DATA NORMALIZATION(STANDARDISATION)
from sklearn.preprocessing import MinMaxScaler

#DATA PLOTTING
import matplotlib.pyplot as plt

#FOR PERFORMANCE METRICS ANALYSIS.
from keras.optimizers import Adam
from keras.losses import  MeanSquaredError
from keras.metrics import  RootMeanSquaredError

#SAVING AND LOADING MODEL
from keras.callbacks import ModelCheckpoint
from keras.models import load_model


In [65]:
nsc_data_frame = pd.read_csv("NSE_data_all_stocks_2022_jan_to_may (1).csv")
nsc_data_frame

Unnamed: 0,Date,Code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Change,Change%,Volume,Adjusted Price
0,3-Jan-22,EGAD,Eaagads Ltd,10,15,13.5,13.8,13.5,13.5,-,-,4000,-
1,3-Jan-22,KUKZ,Kakuzi Plc,355,427,385,385,385,385,-,-,-,-
2,3-Jan-22,KAPC,Kapchorua Tea Kenya Plc,80,101,99.5,99.5,99.5,95.5,4,4.19%,100,-
3,3-Jan-22,LIMT,Limuru Tea Plc,260,360,320,320,320,320,-,-,-,-
4,3-Jan-22,SASN,Sasini Plc,16.75,22.6,18.7,18.7,18.7,18.7,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6727,31-May-22,MSC,Mumias Sugar Company Ltd,0.27,0.27,0.27,0.27,0.27,0.27,-,-,-,-
6728,31-May-22,UNGA,Unga Group Ltd,26.1,36.4,29,29,29,30,-1,3.33%,2100,-
6729,31-May-22,SCOM,Safaricom Plc,25.5,45.25,25.95,26.45,26,26.25,-0.25,0.95%,20079900,-
6730,31-May-22,FAHR,Stanlib Fahari Income-REIT,5,7.48,5.5,5.6,5.56,5.58,-0.02,0.36%,11700,-


In [66]:
#DATA  CLEANING  # nsc_data_frame.set_index(nsc_data_frame.Date, inplace=True) to update the index  of the dataframe.

date_format = '%d-%b-%y'
nsc_data_frame.Date = pd.to_datetime(nsc_data_frame['Date'], format=date_format)
nsc_data_frame['Day_Of_The_Week'] = nsc_data_frame['Date'].dt.dayofweek

nsc_data_frame['Day_Name'] = nsc_data_frame['Date'].dt.day_name()

nsc_data_frame = nsc_data_frame.drop(columns=['Change', 'Change%', 'Volume', 'Adjusted Price'])


nsc_data_frame['Day_Sin'] = np.sin(2 * np.pi * nsc_data_frame['Day_Of_The_Week']/4.0)
nsc_data_frame['Day_Cos'] = np.cos(2 * np.pi * nsc_data_frame['Day_Of_The_Week']/4.0)

nsc_data_frame.head(10)



Unnamed: 0,Date,Code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Day_Of_The_Week,Day_Name,Day_Sin,Day_Cos
0,2022-01-03,EGAD,Eaagads Ltd,10.0,15.0,13.5,13.8,13.5,13.5,0,Monday,0.0,1.0
1,2022-01-03,KUKZ,Kakuzi Plc,355.0,427.0,385.0,385.0,385.0,385.0,0,Monday,0.0,1.0
2,2022-01-03,KAPC,Kapchorua Tea Kenya Plc,80.0,101.0,99.5,99.5,99.5,95.5,0,Monday,0.0,1.0
3,2022-01-03,LIMT,Limuru Tea Plc,260.0,360.0,320.0,320.0,320.0,320.0,0,Monday,0.0,1.0
4,2022-01-03,SASN,Sasini Plc,16.75,22.6,18.7,18.7,18.7,18.7,0,Monday,0.0,1.0
5,2022-01-03,WTK,Williamson Tea Kenya Plc,125.0,154.75,132.0,132.0,132.0,130.0,0,Monday,0.0,1.0
6,2022-01-03,CGEN,Car and General (K) Ltd,10.0,70.0,33.95,33.95,33.95,33.95,0,Monday,0.0,1.0
7,2022-01-03,ABSA,ABSA Bank Kenya Plc,8.6,12.95,11.6,11.9,11.75,11.85,0,Monday,0.0,1.0
8,2022-01-03,BKG,BK Group Plc,24.3,40.0,29.0,29.0,29.0,29.0,0,Monday,0.0,1.0
9,2022-01-03,COOP,Co-operative Bank of Kenya Ltd,11.5,14.0,12.9,13.5,13.0,12.95,0,Monday,0.0,1.0


In [67]:
# min_value = egad_data_frame['Day_Of_The_Week'].min()  ------ >> display(egad_data_frame.info())


#SELECT THE 1ST COLUMN AND PLOT THE DAY_SINE AGAINST TIME

# plt.figure(figsize=(12, 6))  
# plt.subplot(1, 2, 1)  # Selects the first subplot
# one_company_data_frame['Day_Sin'].plot()
# plt.title('Day_Sin Over Time')
# 
# # SELECT THE 2ND COLUMN AND PLOT THE DAY PRICE AGAINST TIME
# 
# plt.subplot(1, 2, 2)  
# one_company_data_frame['Day Price'].plot()
# plt.title('Day Price Over Time')
# 
# plt.tight_layout()  # To improve subplot spacing
# plt.show()




In [68]:
#Custom Data cleaner 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd

class NSCDataScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self,company_code ,  columns_to_return,columns_to_type_cast_and_scale):
        self.company_code = company_code
        self.minMaxScaler = MinMaxScaler()
        self.to_type_cast_and_scale= columns_to_type_cast_and_scale
    
        self.to_return= columns_to_return  #same as  columns_to_type_cast but has the company code.
        
        
    def __type_cast_and_scale_xyz_df(self, xyz_data_frame):
        for column_name in self.to_type_cast_and_scale:
            xyz_data_frame[column_name] = pd.to_numeric(xyz_data_frame[column_name], errors='coerce')
        xyz_data_frame[self.to_type_cast_and_scale] = self.minMaxScaler.fit_transform(xyz_data_frame[self.to_type_cast_and_scale])
        
        return xyz_data_frame[self.to_return]
    
    def fit(self, X, y=None):
        return self
       

    def transform(self, X):   
        """
        X - NSC dataFrame
        Selects the data for one company , then names it xyz_company_data_frame 
        NB: Scaler always return columns that are assignable to the dataframe.        
        """        
        
        xyz_company_data_frame = nsc_data_frame[nsc_data_frame['Code'] == self.company_code].reset_index(drop=True)
        type_converted_xyz_company_df = self.__type_cast_and_scale_xyz_df(
            xyz_company_data_frame       
        )
        
        return type_converted_xyz_company_df
    
__data_scaler = NSCDataScaler(
    columns_to_return=['Code','Day Price','Day High', 'Day Low'],
    columns_to_type_cast_and_scale=['Day Price','Day High', 'Day Low'],
    company_code="EGAD"
    )

preprocessed_df = __data_scaler.fit_transform(
    nsc_data_frame,    
)
    
preprocessed_df
    

Unnamed: 0,Code,Day Price,Day High,Day Low
0,EGAD,0.84,0.96,0.84
1,EGAD,0.58,0.58,0.58
2,EGAD,0.58,0.58,0.58
3,EGAD,0.96,0.96,0.96
4,EGAD,0.96,0.96,0.96
...,...,...,...,...
97,EGAD,0.82,0.82,0.82
98,EGAD,0.82,0.82,0.82
99,EGAD,0.82,0.82,0.82
100,EGAD,0.82,0.82,0.82


In [70]:
#Splitting the Dataframe into X_features and the Y_Classes
"""
WINDOW_SIZE = 5
[
  [[ds1, dp1], [ds2, dp2], [ds3, dp3], [ds4, dp4], [ds5, dp5]],
  [[ds2, dp2], [ds3, dp3], [ds4, dp4], [ds5, dp5], [ds6, dp6]]  
]
"""

def __generate_x_y_from_nsc_df(__data_frame , time_step=5):
    data_array = __data_frame.to_numpy()
    X = []
    y = []
    for i in range(len(data_array)-time_step):
        
        rows = data_array[i:i + time_step]   # print(rowing[1]) gives [12.85  1.] , that is the second row 
        
        zipped_row = [row for row in rows]  # creates a list of nd_arrays .
        
        label = data_array[i + time_step][0]
        X.append(zipped_row)
        y.append(label)
        
    return np.array(X), np.array(y)

X_data, Y_classes = __generate_x_y_from_nsc_df(df_interest , 5)

print("X_Data shape : ",X_data.shape)

print(X_data.shape , Y_classes.shape)

X_data[0]



X_Data shape :  (97, 5, 4)
(97, 5, 4) (97,)


array([['EGAD', 13.5, '13.8', '13.5'],
       ['EGAD', 12.85, '12.85', '12.85'],
       ['EGAD', 12.85, '12.85', '12.85'],
       ['EGAD', 13.8, '13.8', '13.8'],
       ['EGAD', 13.8, '13.8', '13.8']], dtype=object)

In [71]:
#NORMALIZING DATA  Scaling  using the mean and std deviation : the second option is  to make sure that we normalize the data before preprocessing.

temp_training_mean = np.mean(X_data[:, :, 0])
temp_training_std = np.std(X_data[:, :, 0])


def normalize_prices(x_data_frame):
    prices_array  = x_data_frame[:, :, 0]    
    x_data_frame[:, :, 0] = (prices_array - temp_training_mean) / temp_training_std 
    return x_data_frame

normalized_data = normalize_prices(X_data)

normalized_data


    

TypeError: ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
#Model Creation and Model Compilation.
dropout_rate = 0.2
input_shape=(normalized_data.shape[1], normalized_data.shape[2])

nsc_lstm_model = Sequential([Input(input_shape),
                    LSTM(units=100, return_sequences=True),
                    Dropout(rate=dropout_rate),
                    LSTM(units=100, return_sequences=False),
                    Dropout(rate= dropout_rate),
                    Dense(units=32, activation='relu'),
                    Dense(units = 1)])

nsc_lstm_model.compile(loss=MeanSquaredError(), 
              optimizer=Adam(learning_rate=0.0001),
              metrics=[RootMeanSquaredError()])

nsc_lstm_model.summary()


In [None]:

#++++++++++++++++++++++++++++++++++++++++++   SPLITTING DATA +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

def dynamic_data_split(X, y, train_ratio, val_ratio, test_ratio):
    """
    Splits data and labels into training, validation, and test sets.

    Args:
        X (a 3D numpy.ndarray): Data.
        y (numpy.ndarray): Labels.
        train_ratio (float): Ratio for training data.
        val_ratio (float): Ratio for validation data.
        test_ratio (float): Ratio for test data.

    Returns:
        tuple: (X_train, y_train, X_val, y_val, X_test, y_test).
    """
    total_samples = X.shape[0]

    train_samples = int(train_ratio * total_samples)
    val_samples = int(val_ratio * total_samples)

    X_train, y_train = X[:train_samples], y[:train_samples]
    X_val, y_val = X[train_samples:train_samples + val_samples], y[train_samples:train_samples + val_samples]
    X_test, y_test = X[train_samples + val_samples:], y[train_samples + val_samples:]

    return X_train, y_train, X_val, y_val, X_test, y_test


train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

X_train, y_train, X_val, y_val, X_test, y_test = dynamic_data_split(normalized_data,Y_classes , train_ratio, val_ratio= val_ratio,test_ratio= test_ratio)

X_train.shape,y_train.shape, X_val.shape,y_val.shape, X_test.shape, y_test.shape


In [None]:
# __nsc_model_checkpoint = ModelCheckpoint('nsc_model/', save_best_only=True)
# 
# nsc_lstm_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, callbacks=[__nsc_model_checkpoint])
