# The binary classification task

In [556]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [557]:
df = pd.read_csv('../NSE_data_all_stocks_2022_jan_to_may (1).csv')
df

Unnamed: 0,Date,Code,Name,12m Low,12m High,Day Low,Day High,Day Price,Previous,Change,Change%,Volume,Adjusted Price
0,3-Jan-22,EGAD,Eaagads Ltd,10,15,13.5,13.8,13.5,13.5,-,-,4000,-
1,3-Jan-22,KUKZ,Kakuzi Plc,355,427,385,385,385,385,-,-,-,-
2,3-Jan-22,KAPC,Kapchorua Tea Kenya Plc,80,101,99.5,99.5,99.5,95.5,4,4.19%,100,-
3,3-Jan-22,LIMT,Limuru Tea Plc,260,360,320,320,320,320,-,-,-,-
4,3-Jan-22,SASN,Sasini Plc,16.75,22.6,18.7,18.7,18.7,18.7,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6727,31-May-22,MSC,Mumias Sugar Company Ltd,0.27,0.27,0.27,0.27,0.27,0.27,-,-,-,-
6728,31-May-22,UNGA,Unga Group Ltd,26.1,36.4,29,29,29,30,-1,3.33%,2100,-
6729,31-May-22,SCOM,Safaricom Plc,25.5,45.25,25.95,26.45,26,26.25,-0.25,0.95%,20079900,-
6730,31-May-22,FAHR,Stanlib Fahari Income-REIT,5,7.48,5.5,5.6,5.56,5.58,-0.02,0.36%,11700,-


In [558]:
#Custom Data cleaner 
import random
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler   
nsc_scaler = StandardScaler()

class NSEDataPreProcessor(BaseEstimator, TransformerMixin):
    
    def __init__(self,company_code ,columns_to_type_cast, columns_to_scale, columns_to_return):
        self.company_code = company_code        
        self.to_type_cast= columns_to_type_cast 
        self.to_return= columns_to_return
        self.to_scale=columns_to_scale  #same as  columns_to_type_cast but has the company code.
        
        
    def __type_cast_and_scale_xyz_df(self, xyz_data_frame):
        for column_name in self.to_type_cast:
            xyz_data_frame[column_name] = pd.to_numeric(xyz_data_frame[column_name], errors='coerce')
        xyz_data_frame[self.to_scale] = nsc_scaler.fit_transform(xyz_data_frame[self.to_scale])
        
        
        return xyz_data_frame[self.to_return]
    
    def fit(self, X, y=None):
        return self      

    def transform(self, X):  
        
        """
        X - NSC dataFrame
        Selects the data for one company , then names it xyz_company_data_frame 
        NB: Scaler always return columns that are assignable to the dataframe.        
        """        
        
        xyz_company_data_frame = X[X['Code'] == self.company_code].reset_index(drop=True)
        type_converted_xyz_company_df = self.__type_cast_and_scale_xyz_df(
            xyz_company_data_frame       
        )
        
        return type_converted_xyz_company_df
    
# 'Code','Day Price','Day High', 'Day High', '12m High', '12m Low' , 'Day Price','Day High', 'Day High', '12m High', '12m Low'

random_company_code = random.choice(df.Code)

#TPSE, UMME  giving the best results .
    
__nse_dat_preprocessor = NSEDataPreProcessor(
    company_code= random_company_code,
    columns_to_type_cast=['Day Price','Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_scale=[ 'Previous','Day High', 'Day Low', '12m High', '12m Low'],
    columns_to_return=['Code','Day Price','Previous','Day High', 'Day Low', '12m High', '12m Low'], 
)
xyz_preprocessed_df = __nse_dat_preprocessor.fit_transform(
    df,    
)
display(xyz_preprocessed_df.head(2)) #Show the company being worked on .

preprocessed_df = xyz_preprocessed_df.drop(columns=['Code'])
preprocessed_df
    

Unnamed: 0,Code,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,EQTY,52.75,0.981321,1.460874,0.573653,0.0,-0.729344
1,EQTY,53.25,0.981321,1.080603,1.276508,0.0,-0.729344


Unnamed: 0,Day Price,Previous,Day High,Day Low,12m High,12m Low
0,52.75,0.981321,1.460874,0.573653,0.0,-0.729344
1,53.25,0.981321,1.080603,1.276508,0.0,-0.729344
2,53.00,1.161088,0.985535,1.276508,0.0,-0.729344
3,53.00,1.071204,0.985535,1.276508,0.0,-0.729344
4,53.00,1.071204,0.985535,1.276508,0.0,-0.729344
...,...,...,...,...,...,...
97,42.00,-2.883687,-2.550983,-2.677049,0.0,1.524992
98,42.90,-2.883687,-2.531970,-2.589192,0.0,1.524992
99,44.00,-2.560105,-2.151699,-2.062051,0.0,1.524992
100,45.20,-2.164615,-1.771428,-1.570053,0.0,1.524992


In [559]:
x_features = ['Previous','Day High', 'Day Low', '12m High', '12m Low']
x_single_feature =  'Day Price'
y_classes = 'Day Price'

X = preprocessed_df[x_single_feature]
day_price_series = preprocessed_df[x_single_feature]
day_price_series


0      52.75
1      53.25
2      53.00
3      53.00
4      53.00
       ...  
97     42.00
98     42.90
99     44.00
100    45.20
101    45.50
Name: Day Price, Length: 102, dtype: float64

## Calculating Returns : The percentage change in stock price over the specified time period
### returns = ((currentDayPrice * previousDayPrice)*100)/previousDayPrice

In [560]:
import pandas as pd

def calculate_returns_and_label(data, prediction_horizon=1, positive_threshold=0, negative_threshold=0):
    """
    Calculate returns based on the given prediction horizon and label as positive, negative, or no change.

    Parameters:
    - data: Pandas Series with stock prices.
    - prediction_horizon: Time horizon for calculating returns (default: 1).
    - positive_threshold: Threshold for positive return (default: 0).
    - negative_threshold: Threshold for negative return (default: 0).

    Returns:
    - Pandas DataFrame with columns 'Price', 'Return', and 'Label'.
    """
    
    returns = data.pct_change(prediction_horizon) * 100
    result_df = pd.DataFrame({'Price': data, 'Return': returns})
    
    result_df['Label'] = 0  # Initializes all labels as 0 : if there is no change in the prices , then 0 is set as the  label.
    result_df.loc[result_df['Return'] > positive_threshold, 'Label'] = 1  
    result_df.loc[result_df['Return'] < negative_threshold, 'Label'] = -1    

    return result_df

# day_price_df.describe() gives the stats description of the dataframe : examples ; mean ,  median ,  std deviation ... dataframe.info() on the the r hand gives the data types of the data.

result_df = calculate_returns_and_label(day_price_series)
result_df


Unnamed: 0,Price,Return,Label
0,52.75,,0
1,53.25,0.947867,1
2,53.00,-0.469484,-1
3,53.00,0.000000,0
4,53.00,0.000000,0
...,...,...,...
97,42.00,0.000000,0
98,42.90,2.142857,1
99,44.00,2.564103,1
100,45.20,2.727273,1


In [561]:
import utils.x_y_generator as generator

features , targets = generator.__generate_x_y_from_nsc_df(time_step=4, __data_frame= result_df['Label'])

print("X_Data shape : ",features.shape)
print("Y_Data shape : ",targets.shape)

column_names = ['Day1', 'Day2', 'Day3','Day4']
df = pd.DataFrame(features, columns=column_names)

df['day5_dayPriceLabel'] = pd.Series(data= targets)
df


X_Data shape :  (98, 4)
Y_Data shape :  (98,)


Unnamed: 0,Day1,Day2,Day3,Day4,day5_dayPriceLabel
0,0,1,-1,0,0
1,1,-1,0,0,-1
2,-1,0,0,-1,-1
3,0,0,-1,-1,-1
4,0,-1,-1,-1,-1
...,...,...,...,...,...
93,-1,-1,-1,-1,0
94,-1,-1,-1,0,1
95,-1,-1,0,1,1
96,-1,0,1,1,1


In [562]:
import utils.data_splitter as splitter

features_train, targets_train, features_test, targets_test = splitter.__dynamic_data_split(features,targets, ratio_train=0.8)

features_train.shape,targets_train.shape, features_test.shape, targets_test.shape



((78, 4), (78,), (20, 4), (20,))

In [563]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score


# Initialize the RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
classifier.fit(features_train, targets_train)

# Make predictions on the test set
rf_predictions = classifier.predict(features_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(targets_test, rf_predictions)
print(f"Accuracy: {accuracy}")

precision_rf = precision_score(targets_test, rf_predictions, average='micro')  # assuming binary classification
print(f"Precision (RandomForestClassifier): {precision_rf}")




Accuracy: 0.45
Precision (RandomForestClassifier): 0.45


In [564]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, classification_report


svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

svm_classifier.fit(features_train, targets_train)


svm_predictions = svm_classifier.predict(features_test)

accuracy = accuracy_score(targets_test, svm_predictions)
print(f"Accuracy: {accuracy}")


precision_per_class = precision_score(targets_test, svm_predictions, average=None)

print("Precision for each class:")
for i, precision in enumerate(precision_per_class):
    print(f"Class {i}: {precision}")


classification_report_str = classification_report(targets_test, svm_predictions)



Accuracy: 0.5
Precision for each class:
Class 0: 0.5384615384615384
Class 1: 0.0
Class 2: 0.42857142857142855


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [565]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score
svm_classifier = SVC(random_state=42)

# Define a parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [1,2, 3, 4,10],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy', verbose=2)

# Perform grid search
grid_search.fit(features_train, targets_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Use the best model for predictions
best_svm_classifier = grid_search.best_estimator_
svm_predictions = best_svm_classifier.predict(features_test)

# Evaluate the performance of the best model
recall = recall_score(targets_test, svm_predictions, average='micro')
print(f"Recall of the Best Model for the {random_company_code } is: {recall}")


Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] END .....................C=0.1, degree=1, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, degree=1, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, degree=1, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, degree=1, kernel=linear; total time=   0.0s
[CV] END .....................C=0.1, degree=1, kernel=linear; total time=   0.0s
[CV] END ........................C=0.1, degree=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, degree=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, degree=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, degree=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, degree=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, degree=1, kernel=poly; total time=   0.0s
[CV] END .......................C=0.1, degree=1