In [None]:
# %pip install matplotlib seaborn scikit-learn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [8]:
# Load the processed price history into `df` and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists — consider a configurable data dir.
all_data_csv = "/home/kibria/Desktop/IIT_Folders/6th_semester/AI/StockVision/data/processed/all_data.csv"
df = pd.read_csv(all_data_csv)
df.head()

Unnamed: 0,date,trading_code,last_traded_price,high,low,opening_price,closing_price,yesterdays_closing_price,trade,value_mn,volume
0,2008-03-06,GLAXOSMITH,260.0,288.3,232.0,241.0,256.6,245.4,170,3.9074,14650
1,2008-03-06,RASPITDATA,0.0,0.0,0.0,3.5,3.5,3.5,0,0.0,0
2,2008-03-06,PLFSL,302.5,306.0,297.0,297.0,303.5,302.25,108,2.8351,9400
3,2008-03-06,T10Y1016,0.0,0.0,0.0,100000.0,100000.0,100000.0,0,0.0,0
4,2008-03-06,T20Y1227,0.0,0.0,0.0,100000.0,100000.0,100000.0,0,0.0,0


In [7]:
# Parse the date column in place, then keep only the rows dated in 2020.
df['date'] = pd.to_datetime(df['date'])
row_year = df['date'].dt.year
data_2020 = df[row_year == 2020]
data_2020.head()

Unnamed: 0,date,trading_code,last_traded_price,high,low,opening_price,closing_price,yesterdays_closing_price,trade,value_mn,volume
1458391,2020-01-01,NORTHERN,718.0,733.5,690.0,690.0,719.9,711.0,1572,29.957,41679
1458392,2020-01-01,T5Y0115,0.0,0.0,0.0,0.0,0.0,100000.0,0,0.0,0
1458393,2020-01-01,NLI1STMF,10.6,11.0,10.5,11.0,10.6,10.6,24,0.308,29122
1458394,2020-01-01,T10Y0218,0.0,0.0,0.0,0.0,0.0,100000.0,0,0.0,0
1458395,2020-01-01,T10Y0117,0.0,0.0,0.0,0.0,0.0,100000.0,0,0.0,0


### Data exploration for ML model preparation


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Summarise the loaded frame: size, date span, ticker count, dtypes and nulls.
earliest_date, latest_date = df['date'].min(), df['date'].max()
ticker_count = df['trading_code'].nunique()
null_counts = df.isnull().sum()

print("Dataset shape:", df.shape)
print("\nDate range:", earliest_date, "to", latest_date)
print("\nNumber of unique trading codes:", ticker_count)
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(null_counts)

Dataset shape: (1791069, 11)

Date range: 2008-03-06 to 2022-12-29

Number of unique trading codes: 1007

Data types:
date                         object
trading_code                 object
last_traded_price           float64
high                        float64
low                         float64
opening_price               float64
closing_price               float64
yesterdays_closing_price    float64
trade                         int64
value_mn                    float64
volume                        int64
dtype: object

Missing values:
date                        0
trading_code                0
last_traded_price           0
high                        0
low                         0
opening_price               0
closing_price               0
yesterdays_closing_price    0
trade                       0
value_mn                    0
volume                      0
dtype: int64
date                        0
trading_code                0
last_traded_price           0
high                  

In [12]:
# Chronological hold-out split for the ML model:
# every row up to and including 2021 is training data, 2022 is the test set.

# Idempotent: re-parsing an already-datetime column is harmless.
df['date'] = pd.to_datetime(df['date'])

# Calendar year of each row, stored on `df` so later cells can reuse it.
df['year'] = df['date'].dt.year

# Boolean row selectors for the two windows.
is_train_row = df['year'] <= 2021
is_test_row = df['year'] == 2022

# Independent copies so later feature engineering does not write into `df`.
train_data = df[is_train_row].copy()
test_data = df[is_test_row].copy()

print(f"Training data shape: {train_data.shape}")
print(f"Training years: {train_data['year'].min()} - {train_data['year'].max()}")
print(f"Test data shape: {test_data.shape}")
print(f"Test years: {test_data['year'].min()} - {test_data['year'].max()}")

# Report record counts, or warn when the hold-out year is missing entirely.
if not test_data.empty:
    print(f"\nTraining data: {len(train_data)} records")
    print(f"Test data: {len(test_data)} records")
else:
    print("\nWarning: No 2022 data found in the dataset!")
    print("Available years:", sorted(df['year'].unique()))

Training data shape: (1678355, 12)
Training years: 2008 - 2021
Test data shape: (112714, 12)
Test years: 2022 - 2022

Training data: 1678355 records
Test data: 112714 records


In [13]:
# Feature Engineering for Stock Prediction
def create_features(data):
    """Create technical indicators and features for stock prediction.

    The input is copied and sorted by ``trading_code`` then ``date``; every
    history-based feature is computed within a single trading code, so no
    value ever leaks from one ticker into the next.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``trading_code``, ``date`` (datetime dtype),
        ``opening_price``, ``closing_price``, ``high``, ``low`` and ``volume``.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``data`` (original index labels kept) with these
        extra columns: ``price_change``, ``price_change_abs``, ``ma_5``,
        ``ma_10``, ``ma_20``, ``volatility_5``, ``volatility_10``,
        ``price_position``, ``prev_close``, ``prev_volume``, ``prev_high``,
        ``prev_low``, ``high_low_pct``, ``open_close_pct``, ``month``,
        ``day_of_week``, ``quarter``.

    Notes
    -----
    * Rows without enough history (first row of a ticker, incomplete rolling
      window) hold NaN in the corresponding feature.
    * Ratios whose denominator is zero (e.g. rows with high == low == 0) are
      NaN rather than +/-inf.
    * All columns are always created, and ``price_position`` is filled for
      every row, including tickers that have only a single row.
    """
    df_features = data.copy()

    # Sort by trading_code and date for proper time series features
    df_features = df_features.sort_values(['trading_code', 'date'])

    def _nonzero(series):
        """Turn zeros into NaN so that dividing by the result gives NaN, not inf."""
        return series.where(series != 0)

    # One grouping replaces the per-ticker boolean-mask loop: each operation
    # below runs once over the frame instead of rescanning it for every code.
    by_code = df_features.groupby('trading_code', sort=False)
    close_by_code = by_code['closing_price']

    def _rolling_close(window, stat):
        """Per-ticker rolling statistic ('mean' or 'std') of the closing price."""
        return close_by_code.transform(
            lambda prices: getattr(prices.rolling(window), stat)()
        )

    close = df_features['closing_price']
    high = df_features['high']
    low = df_features['low']
    opening = df_features['opening_price']
    prev_close = close_by_code.shift(1)

    # Price change features (relative change is undefined after a zero close)
    df_features['price_change'] = close_by_code.pct_change().where(prev_close != 0)
    df_features['price_change_abs'] = close_by_code.diff()

    # Moving averages
    df_features['ma_5'] = _rolling_close(5, 'mean')
    df_features['ma_10'] = _rolling_close(10, 'mean')
    df_features['ma_20'] = _rolling_close(20, 'mean')

    # Volatility (rolling standard deviation)
    df_features['volatility_5'] = _rolling_close(5, 'std')
    df_features['volatility_10'] = _rolling_close(10, 'std')

    # Price position relative to high/low (NaN when the day has no range)
    df_features['price_position'] = (close - low) / _nonzero(high - low)

    # Lagged features (previous day values)
    df_features['prev_close'] = prev_close
    df_features['prev_volume'] = by_code['volume'].shift(1)
    df_features['prev_high'] = by_code['high'].shift(1)
    df_features['prev_low'] = by_code['low'].shift(1)

    # Additional features (NaN when the reference price is zero)
    df_features['high_low_pct'] = (high - low) / _nonzero(low) * 100
    df_features['open_close_pct'] = (close - opening) / _nonzero(opening) * 100

    # Time-based features
    df_features['month'] = df_features['date'].dt.month
    df_features['day_of_week'] = df_features['date'].dt.dayofweek
    df_features['quarter'] = df_features['date'].dt.quarter

    return df_features

# Build the feature tables: each split is processed on its own, so rolling
# and lagged values for the test year never see training rows (and vice versa).
print("Creating features for training data...")
train_features = create_features(train_data)
print("Creating features for test data...")
test_features = create_features(test_data)

print(f"Training data with features shape: {train_features.shape}")
print(f"Test data with features shape: {test_features.shape}")

# List the columns that feature engineering added on top of `df`
print("\nNew features created:")
existing_columns = set(df.columns)
new_cols = [name for name in train_features.columns if name not in existing_columns]
print(new_cols)

Creating features for training data...
Creating features for test data...
Training data with features shape: (1678355, 29)
Test data with features shape: (112714, 29)

New features created:
['price_change', 'price_change_abs', 'ma_5', 'ma_10', 'ma_20', 'volatility_5', 'volatility_10', 'price_position', 'prev_close', 'prev_volume', 'prev_high', 'prev_low', 'high_low_pct', 'open_close_pct', 'month', 'day_of_week', 'quarter']
