# The binary classification task

In [275]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [276]:
df = pd.read_csv('../NSE_data_all_stocks_2022_jan_to_may (1).csv')
df

Unnamed: 0,Date,Code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Change,Change%,Volume,Adjusted Price
0,3-Jan-22,EGAD,Eaagads Ltd,10,15,13.5,13.8,13.5,13.5,-,-,4000,-
1,3-Jan-22,KUKZ,Kakuzi Plc,355,427,385,385,385,385,-,-,-,-
2,3-Jan-22,KAPC,Kapchorua Tea Kenya Plc,80,101,99.5,99.5,99.5,95.5,4,4.19%,100,-
3,3-Jan-22,LIMT,Limuru Tea Plc,260,360,320,320,320,320,-,-,-,-
4,3-Jan-22,SASN,Sasini Plc,16.75,22.6,18.7,18.7,18.7,18.7,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6727,31-May-22,MSC,Mumias Sugar Company Ltd,0.27,0.27,0.27,0.27,0.27,0.27,-,-,-,-
6728,31-May-22,UNGA,Unga Group Ltd,26.1,36.4,29,29,29,30,-1,3.33%,2100,-
6729,31-May-22,SCOM,Safaricom Plc,25.5,45.25,25.95,26.45,26,26.25,-0.25,0.95%,20079900,-
6730,31-May-22,FAHR,Stanlib Fahari Income-REIT,5,7.48,5.5,5.6,5.56,5.58,-0.02,0.36%,11700,-


In [277]:
#Custom Data cleaner 
import random
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler   
nsc_scaler = StandardScaler()

class NSEDataPreProcessor(BaseEstimator, TransformerMixin):
    
    def __init__(self,company_code ,columns_to_type_cast, columns_to_scale, columns_to_return):
        self.company_code = company_code        
        self.to_type_cast= columns_to_type_cast 
        self.to_return= columns_to_return
        self.to_scale=columns_to_scale  #same as  columns_to_type_cast but has the company code.
        
        
    def __type_cast_and_scale_xyz_df(self, xyz_data_frame):
        for column_name in self.to_type_cast:
            xyz_data_frame[column_name] = pd.to_numeric(xyz_data_frame[column_name], errors='coerce')
        xyz_data_frame[self.to_scale] = nsc_scaler.fit_transform(xyz_data_frame[self.to_scale])
        
        
        return xyz_data_frame[self.to_return]
    
    def fit(self, X, y=None):
        return self      

    def transform(self, X):  
        
        """
        X - NSC dataFrame
        Selects the data for one company , then names it xyz_company_data_frame 
        NB: Scaler always return columns that are assignable to the dataframe.        
        """        
        
        xyz_company_data_frame = X[X['Code'] == self.company_code].reset_index(drop=True)
        type_converted_xyz_company_df = self.__type_cast_and_scale_xyz_df(
            xyz_company_data_frame       
        )
        
        return type_converted_xyz_company_df
    
# 'Code','Day Price','Day High', 'Day High', '12m High', '12m Low' , 'Day Price','Day High', 'Day High', '12m High', '12m Low'

random_company_code = random.choice(df.Code)

#TPSE, UMME  giving the best results .
    
__nse_dat_preprocessor = NSEDataPreProcessor(
    company_code= random_company_code,
    columns_to_type_cast=['Day Price','Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_scale=[ 'Previous','Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_return=['Code','Day Price','Previous','Day High', 'Day Low', '12m High', '12m Low'], 
)
xyz_preprocessed_df = __nse_dat_preprocessor.fit_transform(
    df,    
)
display(xyz_preprocessed_df.head(2)) #Show the company being worked on .

preprocessed_df = xyz_preprocessed_df.drop(columns=['Code'])
preprocessed_df
    

Unnamed: 0,Code,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,CABL,1.23,0.955454,2.055462,0.99823,-8.881784e-16,-5.551115e-16
1,CABL,1.22,1.317297,1.817486,1.222749,-8.881784e-16,-5.551115e-16


Unnamed: 0,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,1.23,0.955454,2.055462,0.998230,-8.881784e-16,-5.551115e-16
1,1.22,1.317297,1.817486,1.222749,-8.881784e-16,-5.551115e-16
2,1.22,1.196682,1.579510,1.110490,-8.881784e-16,-5.551115e-16
3,1.27,1.196682,1.579510,1.784047,-8.881784e-16,-5.551115e-16
4,1.21,1.799754,1.222545,1.222749,-8.881784e-16,-5.551115e-16
...,...,...,...,...,...,...
97,1.00,-1.577445,-0.800254,-1.022443,-8.881784e-16,-5.551115e-16
98,1.00,-1.456831,-1.752159,-1.583741,-8.881784e-16,-5.551115e-16
99,1.00,-1.456831,-1.157218,-1.471481,-8.881784e-16,-5.551115e-16
100,1.00,-1.456831,-1.752159,-1.246962,-8.881784e-16,-5.551115e-16


In [278]:
x_features = ['Previous','Day High', 'Day Low', '12m High', '12m Low']
x_single_feature =  'Day Price'
y_classes = 'Day Price'

X = preprocessed_df[x_single_feature]
day_price_series = preprocessed_df[x_single_feature]
day_price_series


0      1.23
1      1.22
2      1.22
3      1.27
4      1.21
       ... 
97     1.00
98     1.00
99     1.00
100    1.00
101    1.00
Name: Day Price, Length: 102, dtype: float64

## Calculating Returns : The percentage change in stock price over the specified time period
### returns = ((currentDayPrice * previousDayPrice)*100)/previousDayPrice

In [279]:
import pandas as pd

def calculate_returns_and_label(data, prediction_horizon=1, positive_threshold=0, negative_threshold=0):
    """
    Calculate returns based on the given prediction horizon and label as positive, negative, or no change.

    Parameters:
    - data: Pandas Series with stock prices.
    - prediction_horizon: Time horizon for calculating returns (default: 1).
    - positive_threshold: Threshold for positive return (default: 0).
    - negative_threshold: Threshold for negative return (default: 0).

    Returns:
    - Pandas DataFrame with columns 'Price', 'Return', and 'Label'.
    """
    
    returns = data.pct_change(prediction_horizon) * 100
    result_df = pd.DataFrame({'Price': data, 'Return': returns})
    
    result_df['Label'] = 0  # Initializes all labels as 0 : if there is no change in the prices , then 0 is set as the  label.
    result_df.loc[result_df['Return'] > positive_threshold, 'Label'] = 1  
    result_df.loc[result_df['Return'] < negative_threshold, 'Label'] = -1    

    return result_df

# day_price_df.describe() gives the stats description of the dataframe : examples ; mean ,  median ,  std deviation ... dataframe.info() on the the r hand gives the data types of the data.

result_df = calculate_returns_and_label(day_price_series)
result_df


Unnamed: 0,Price,Return,Label
0,1.23,,0
1,1.22,-0.813008,-1
2,1.22,0.000000,0
3,1.27,4.098361,1
4,1.21,-4.724409,-1
...,...,...,...
97,1.00,1.010101,1
98,1.00,0.000000,0
99,1.00,0.000000,0
100,1.00,0.000000,0


In [280]:
import utils.x_y_generator as generator

features , targets = generator.__generate_x_y_from_nsc_df(time_step=4, __data_frame= result_df['Label'])

print("X_Data shape : ",features.shape)
print("Y_Data shape : ",targets.shape)

column_names = ['Day1', 'Day2', 'Day3','Day4']
df = pd.DataFrame(features, columns=column_names)

df['day5_dayPriceLabel'] = pd.Series(data= targets)
df


X_Data shape :  (98, 4)
Y_Data shape :  (98,)


Unnamed: 0,Day1,Day2,Day3,Day4,day5_dayPriceLabel
0,0,-1,0,1,-1
1,-1,0,1,-1,-1
2,0,1,-1,-1,1
3,1,-1,-1,1,0
4,-1,-1,1,0,-1
...,...,...,...,...,...
93,-1,-1,1,-1,1
94,-1,1,-1,1,0
95,1,-1,1,0,0
96,-1,1,0,0,0


In [281]:
import utils.data_splitter as splitter

features_train, targets_train, features_test, targets_test = splitter.__dynamic_data_split(features,targets, ratio_train=0.8)

features_train.shape,targets_train.shape, features_test.shape, targets_test.shape



((78, 4), (78,), (20, 4), (20,))

In [282]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Initialize the RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
classifier.fit(features_train, targets_train)

# Make predictions on the test set
rf_predictions = classifier.predict(features_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(targets_test, rf_predictions)
print(f"Accuracy: {accuracy}")




Accuracy: 0.4


In [283]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score

gb_classifier = GradientBoostingClassifier(n_estimators=100)

gb_classifier.fit(features_train, targets_train)

gb_predictions = gb_classifier.predict(features_test)

gb_accuracy = accuracy_score(targets_test, gb_predictions)
print(f"Gradient Boosting Classifier Accuracy: {gb_accuracy}")

precision_rf = precision_score(targets_test, rf_predictions, average='micro')  # assuming binary classification
print(f"Precision (RandomForestClassifier): {precision_rf}")

precision_gb = precision_score(targets_test, gb_predictions, average='micro')  # assuming binary classification
print(f"Precision (GradientBoostingClassifier): {precision_gb}")


Gradient Boosting Classifier Accuracy: 0.4
Precision (RandomForestClassifier): 0.4
Precision (GradientBoostingClassifier): 0.4
