In [24]:
%reload_ext lab_black

- typing
- documentation and comments
- test cases
- requirements and versioning

'data/bse/20211229.csv'

In [1]:
import requests
import pandas
import pendulum
from pendulum import today, Date
from pathlib import Path
from typing import Union
import zipfile
import tempfile
import os
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import logging
from requests.exceptions import HTTPError, ReadTimeout, Timeout
import glob
from copy import copy

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
logging.getLogger().setLevel(logging.INFO)

# Downloader

In [8]:
class StockDownloader(object):
    def __init__(self, timeout: int = 2):
        self.timeout = timeout
        if len(glob.glob(f"{self.download_path}/*.csv")) == 0:
            self.download_past_two_years()

    def download_data_for_date(self, date: Date, replace=False):
        download_url = self.make_url_func(date)
        file_name = date.format("YYYYMMDD") + ".csv"
        if file_name not in os.listdir(self.download_path) or replace:
            try:
                #         if True:
                fd, name = tempfile.mkstemp(suffix=".zip")
                r = requests.get(
                    download_url,
                    allow_redirects=True,
                    timeout=self.timeout,
                    headers={"User-Agent": "firefox"},
                )
                r.raise_for_status()

                with open(name, "wb") as f:
                    f.write(r.content)

                zipdata = zipfile.ZipFile(name)
                zipinfos = zipdata.infolist()
                for zipinfo in zipinfos:
                    zipinfo.filename = file_name
                    zipdata.extract(zipinfo, self.download_path)

                col_order = [
                    "symbol",
                    "isin",
                    "exchange",
                    "date",
                    "open",
                    "high",
                    "low",
                    "close",
                    "volume",
                    "year",
                    "month",
                    "day",
                    "ym",
                ]
                df = (
                    pd.read_csv(self.download_path / file_name)
                    .pipe(self.reformat)
                    .assign(
                        date=date.date(),
                        year=date.year,
                        month=date.month,
                        day=date.day,
                        ym=f"{date.year}{date.month:02}",
                    )
                    .rename(columns=str.lower)
                    .query('series == "EQ"')
                    .loc[:, col_order]
                )
                df.to_csv(self.download_path / file_name, index=False)
                logging.info(
                    f"Downloaded {self.exchange} data for {date.format('DD MMM, YYYY.')}"
                )

            except HTTPError as err:
                if err.response.status_code == 404:
                    logging.info(
                        f"No {self.exchange} data available on {date.format('DD MMM, YYYY.')}"
                    )
            except (ReadTimeout, Timeout) as err:
                logging.info(
                    f"No {self.exchange} data available on {date.format('DD MMM, YYYY.')}"
                )
            except Exception as err:
                logging.warning(
                    f"{self.exchange} data not available on {date.format('DD MMM, YYYY.')}"
                )
                logging.warning(err.message)
            finally:
                os.close(fd)
        else:
            logging.info(
                f"{self.exchange} data for {date.format('DD MMM, YYYY.')} already present"
            )

    def prune_data(self, prune_weeks):
        thresh = int(today().subtract(weeks=prune_weeks).format("YYYYMMDD"))
        files_to_prune = [
            self.download_path / f"{d}.csv" for d in self.days_present if d < thresh
        ]
        for file in files_to_prune:
            os.remove(file)

    @property
    def days_present(self):
        return [
            int(d.replace(".csv", "")) for d in glob.glob1(self.download_path, "*.csv")
        ]

    def update_data(self, prune_weeks=0):
        start_date = pendulum.from_format(str(max(self.days_present)), "YYYYMMDD").add(
            days=1
        )
        if start_date < today():
            self.download_date_range(start_date, today())
        if prune_weeks:
            self.prune_data(prune_weeks)

    def download_date_range(self, start_date: Date, end_date: Date):
        assert start_date < end_date, "Start must be before end"
        dates = pd.date_range(start_date.date(), end_date.date(), freq="B").tolist()
        dates = [
            pendulum.DateTime(d.date().year, d.date().month, d.date().day)
            for d in dates if not d.strftime("%Y%m%d") in self.exclude_days
        ]
        with ProcessPoolExecutor() as executor:
            executor.map(self.download_data_for_date, dates)

    def download_past_two_years(self):
        self.download_date_range(today().subtract(years=2), today())

    def download_last_n_weeks(self, n_weeks):
        self.download_date_range(today().subtract(weeks=n_weeks), today())

In [9]:
class NseDownloader(StockDownloader):
    download_path = Path("data/nse")
    exchange = "NSE"
    exclude_days = []

    def make_url_func(self, date: Date):
        year = date.year
        month_name = date.format("MMM").upper()
        date_str = date.format("DDMMMYYYY").upper()
        return f"https://archives.nseindia.com/content/historical/EQUITIES/{year}/{month_name}/cm{date_str}bhav.csv.zip"

    def reformat(self, df):
        return df.rename(columns={"TOTTRDQTY": "volume"}).assign(exchange=self.exchange)


class BseDownloader(StockDownloader):
    download_path = Path("data/bse")
    exchange = "BSE"
    exclude_days = ["20211229"]

    def make_url_func(self, date: Date):
        date_str = date.format("DDMMYY").upper()
        return f"https://www.bseindia.com/download/BhavCopy/Equity/EQ{date_str}_CSV.zip"

    def reformat(self, df):
        return df.rename(
            columns={
                "NO_OF_SHRS": "volume",
                "SC_NAME": "symbol",
                "SC_CODE": "isin",
                "SC_TYPE": "series",
            }
        ).assign(
            exchange=self.exchange,
            series=lambda df: np.where(df.series == "Q", "EQ", df.series),
        )

In [10]:
nse_downloader = NseDownloader()
bse_downloader = BseDownloader()

In [11]:
nse_downloader.update_data(prune_weeks=80)

INFO:root:Downloaded NSE data for 02 Feb, 2022.
INFO:root:Downloaded NSE data for 28 Jan, 2022.
INFO:root:Downloaded NSE data for 03 Feb, 2022.
INFO:root:Downloaded NSE data for 31 Jan, 2022.
INFO:root:Downloaded NSE data for 01 Feb, 2022.


In [12]:
bse_downloader.update_data(prune_weeks=80)

INFO:root:Downloaded BSE data for 01 Feb, 2022.
INFO:root:No BSE data available on 04 Feb, 2022.
INFO:root:Downloaded BSE data for 02 Feb, 2022.
INFO:root:Downloaded BSE data for 31 Jan, 2022.
INFO:root:Downloaded BSE data for 03 Feb, 2022.
INFO:root:Downloaded BSE data for 28 Jan, 2022.


In [84]:
# df_nse = pd.concat(map(pd.read_csv, glob.glob("data/nse/*.csv")))

# df_bse = pd.concat(map(pd.read_csv, glob.glob("data/bse/*.csv")))

In [97]:
df_all = pd.concat(map(pd.read_csv, glob.glob("data/*/*.csv"))).assign(
    date=lambda df: pd.to_datetime(df.date), quarter=lambda df: df.date.dt.quarter
)

In [98]:
df_all

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
0,20MICRONS,INE144J01027,NSE,2021-08-06,64.20,66.60,64.00,66.25,102939,2021,8,6,202108,3
1,21STCENMGM,INE253B01015,NSE,2021-08-06,24.80,24.80,24.80,24.80,564,2021,8,6,202108,3
2,3MINDIA,INE470A01017,NSE,2021-08-06,24455.00,24620.00,24354.10,24446.80,926,2021,8,6,202108,3
3,5PAISA,INE618L01018,NSE,2021-08-06,498.60,518.05,496.05,512.95,45962,2021,8,6,202108,3
4,63MOONS,INE111B01023,NSE,2021-08-06,107.65,107.65,103.20,103.95,111748,2021,8,6,202108,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3450,URJAPP,890152,BSE,2021-12-14,5.85,6.03,5.80,6.03,44695,2021,12,14,202112,4
3451,AIRTELPP,890157,BSE,2021-12-14,368.90,377.00,342.00,361.00,38325,2021,12,14,202112,4
3452,RPPINFRPP,890158,BSE,2021-12-14,35.85,36.05,35.30,35.90,4305,2021,12,14,202112,4
3453,PATINTPP,890159,BSE,2021-12-14,4.96,4.96,4.96,4.96,79988,2021,12,14,202112,4


# Filters

In [99]:
backtrack_date = pendulum.from_format("20220128", "YYYYMMDD")
backtrack_date = pd.to_datetime(backtrack_date.naive())
max_date = df_all.date.max()
filter_date = max_date if max_date < backtrack_date else backtrack_date
df_all = df_all.query("date <= @backtrack_date")

In [100]:
current_month = filter_date.month
current_year = filter_date.year
current_quarter = filter_date.year

## 300% value over previous month

In [101]:
prev_month_first = (filter_date - pd.DateOffset(months=1)).replace(day=1)
grouping_vars = ["symbol", "isin", "exchange", "year", "month"]
df_300p_val_month = (
    df_all.query("date >= @prev_month_first")
    .assign(value=lambda df: df.close * df.volume)
    .sort_values(grouping_vars + ["day"])
    .groupby(grouping_vars)
    .agg({"value": sum, "volume": sum, "close": lambda x: x.iloc[-1]})
    .reset_index()
    .sort_values(grouping_vars)
    .assign(
        value_lag=lambda df: df.groupby(["symbol", "isin"])["value"].shift(1),
        volume_lag=lambda df: df.groupby(["symbol", "isin"])["volume"].shift(1),
        close_lag=lambda df: df.groupby(["symbol", "isin"])["close"].shift(1),
        value_ratio=lambda df: df.value / df.value_lag,
        volume_ratio=lambda df: df.volume / df.volume_lag,
        close_ratio=lambda df: df.close / df.close_lag,
    )
    .query("value_lag.notna()", engine="python")
    #     .query("value_ratio>3 & close_ratio>1 & value>2_000_000")
)

In [102]:
expected_300_m = pd.read_excel("data/Book1.xlsx", sheet_name=1)["Ticker Symbol"]
expected_300_m = set(expected_300_m[expected_300_m.notna()].tolist())
actuals_300_m = set(df_300p_val_month.query("value_ratio>3 & close_ratio>1 & value>2_000_000")["isin"].tolist())

expected_300_m.intersection(actuals_300_m), expected_300_m - actuals_300_m

({500252,
  504908,
  523371,
  524542,
  526650,
  531241,
  538708,
  539251,
  542918,
  'INE086A01029',
  'INE126A01031',
  'INE171Z01018',
  'INE285J01028',
  'INE450G01024'},
 {502901,
  502958,
  503127,
  504084,
  530215,
  961718,
  'ACCORD',
  'INE008Z01012',
  'VINNY'})

In [103]:
df_300p_val_month.loc[df_300p_val_month["isin"].isin(expected_300_m - actuals_300_m)]

Unnamed: 0,symbol,isin,exchange,year,month,value,volume,close,value_lag,volume_lag,close_lag,value_ratio,volume_ratio,close_ratio
4929,JAMSHRI,502901,BSE,2022,1,189562.0,44,4290.0,134304.2,32.0,4016.15,1.411438,1.375,1.068187
5340,KAYCEE IND,504084,BSE,2022,1,1597563.0,403,4097.0,1371013.75,368.0,3725.0,1.165242,1.095109,1.099866
5481,KINGSINFRA,530215,BSE,2022,1,197804300.0,4260541,61.4,69039519.65,1970841.0,34.6,2.865088,2.161788,1.774566
5735,LAKSHMI MILL,502958,BSE,2022,1,56113880.0,11848,5199.5,20137813.3,5390.0,3701.05,2.786493,2.198145,1.404872
8198,RAJA BAHADUR,503127,BSE,2022,1,4154237.0,944,4696.8,8123174.7,1776.0,4205.35,0.511406,0.531532,1.116863
9765,SRPL,INE008Z01012,NSE,2022,1,44798.25,1113,40.25,12620830.1,286004.0,40.25,0.00355,0.003892,1.0


## 200% value over previous quarter

In [104]:
def previous_quarter(ref):
    if ref.month < 4:
        return pd.to_datetime(pendulum.DateTime(ref.year - 1, 10, 1))
    elif ref.month < 7:
        return pd.to_datetime(pendulum.DateTime(ref.year, 1, 1))
    elif ref.month < 10:
        return pd.to_datetime(pendulum.DateTime(ref.year, 4, 1))
    return pd.to_datetime(pendulum.DateTime(ref.year, 7, 1))


prev_quarter_first = previous_quarter(filter_date)
grouping_vars = ["symbol", "isin", "exchange", "year", "quarter"]
df_200p_val_quarter = (
    df_all.query("date >= @prev_quarter_first")
    .assign(value=lambda df: df.close * df.volume)
    .sort_values(grouping_vars + ["month", "day"])
    .groupby(grouping_vars)
    .agg({"value": sum, "volume": sum, "close": lambda x: x.iloc[-1]})
    .reset_index()
    .sort_values(grouping_vars)
    .assign(
        value_lag=lambda df: df.groupby(["symbol", "isin"])["value"].shift(1),
        volume_lag=lambda df: df.groupby(["symbol", "isin"])["volume"].shift(1),
        close_lag=lambda df: df.groupby(["symbol", "isin"])["close"].shift(1),
        value_ratio=lambda df: df.value / df.value_lag,
        volume_ratio=lambda df: df.volume / df.volume_lag,
        close_ratio=lambda df: df.close / df.close_lag,
    )
    .query("value_lag.notna()", engine="python")
    #     .query("value_ratio>2 & close_ratio>1 & value>6_000_000")
)

In [105]:
expected_200_q = pd.read_excel("data/Book1.xlsx")["Ticker Symbol"]
expected_200_q = set(expected_200_q[expected_200_q.notna()].tolist())
actuals_200_q = set(
    df_200p_val_quarter.query("value_ratio>2 & close_ratio>1 & value>6_000_000")[
        "isin"
    ].tolist()
)

expected_200_q.intersection(actuals_200_q), expected_200_q - actuals_200_q

({524687, 531888, 540404, 'INE592B01016', 'INE720A01015'},
 {502901, 504084, 509470, 526865, 'ACCORD', 'MITCON', 'SURANI', 'VINNY'})

In [106]:
df_200p_val_quarter.loc[
    df_200p_val_quarter["isin"].isin(expected_200_q - actuals_200_q)
]

Unnamed: 0,symbol,isin,exchange,year,quarter,value,volume,close,value_lag,volume_lag,close_lag,value_ratio,volume_ratio,close_ratio
1679,BOMOXY-B1,509470,BSE,2022,1,62463858.35,3466,16198.9,39018489.25,2836.0,13546.5,1.600878,1.222144,1.1958
5019,JAINCO PROJ.,526865,BSE,2022,1,5100036.39,530382,13.11,1544030.94,321413.0,8.67,3.303066,1.650157,1.512111
5037,JAMSHRI,502901,BSE,2022,1,189562.0,44,4290.0,1448204.15,413.0,4016.15,0.130895,0.106538,1.068187
5454,KAYCEE IND,504084,BSE,2022,1,1597562.85,403,4097.0,3432868.15,888.0,3725.0,0.465373,0.453829,1.099866


## Analysis

In [86]:
df_all.query("isin==504084")

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
353,KAYCEE IND,504084,BSE,2021-08-06,4000.0,4170.00,3936.00,3938.40,34,2021,8,6,202108,3
351,KAYCEE IND,504084,BSE,2021-07-23,4393.6,4393.60,4151.00,4151.00,7,2021,7,23,202107,3
350,KAYCEE IND,504084,BSE,2021-08-31,3935.0,3935.00,3745.05,3745.05,11,2021,8,31,202108,3
356,KAYCEE IND,504084,BSE,2021-10-05,3703.0,4040.00,3701.00,4040.00,29,2021,10,5,202110,4
344,KAYCEE IND,504084,BSE,2020-12-17,2960.0,2960.00,2750.00,2750.00,11,2020,12,17,202012,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,KAYCEE IND,504084,BSE,2021-08-05,4075.0,4076.95,4050.00,4050.00,6,2021,8,5,202108,3
352,KAYCEE IND,504084,BSE,2021-09-29,3851.0,4044.00,3851.00,4044.00,4,2021,9,29,202109,3
358,KAYCEE IND,504084,BSE,2021-06-25,3849.9,3854.55,3849.90,3854.55,23,2021,6,25,202106,2
355,KAYCEE IND,504084,BSE,2021-05-17,3000.0,3044.00,2900.00,3030.00,29,2021,5,17,202105,2


In [87]:
x = ["ACCORD", "BDL", "EIDPARRY", "ELECTCAST", "SIS", "SRPL", "VINNY", "VIPCLOTHNG"]
df_all.query("symbol.isin(@x)", engine="python")[["symbol", "isin"]].drop_duplicates()

Unnamed: 0,symbol,isin
176,BDL,INE171Z01018
364,EIDPARRY,INE126A01031
369,ELECTCAST,INE086A01029
1246,SIS,INE285J01028
1474,VIPCLOTHNG,INE450G01024
1270,SRPL,INE008Z01012


In [92]:
#  .assign(value=lambda df: df.close * df.volume)
#     .sort_values(grouping_vars + ["month", "day"])
#     .groupby(grouping_vars)
#     .agg({"value": sum, "volume": sum, "close": lambda x: x.iloc[-1]})
#     .reset_index()
#     .sort_values(grouping_vars)
#     .assign(
#         value_lag=lambda df: df.groupby(["symbol", "isin"])["value"].shift(1),
#         volume_lag=lambda df: df.groupby(["symbol", "isin"])["volume"].shift(1),
#         close_lag=lambda df: df.groupby(["symbol", "isin"])["close"].shift(1),
#         value_ratio=lambda df: df.value / df.value_lag,
#         volume_ratio=lambda df: df.volume / df.volume_lag,
#         close_ratio=lambda df: df.close / df.close_lag,
#     )

In [107]:
df_tmp = (
    df_all.query("isin==502901")
    .query("date>='2021-10-01'")
    .sort_values(["year", "quarter", "month", "day"])
    .assign(value=lambda df: df.close * df.volume)
    .groupby(["year", "quarter"])
    .agg({"value": sum, "volume": sum, "close": lambda x: x.iloc[-1]})
    .reset_index()
    .assign(value_last_close=lambda df: df.close * df.volume)
)

df_tmp

print("ratios")
df_tmp.iloc[1] / df_tmp.iloc[0]

Unnamed: 0,year,quarter,value,volume,close,value_last_close
0,2021,4,1448204.15,413,4016.15,1658669.95
1,2022,1,189562.0,44,4290.0,188760.0


ratios


year                1.000495
quarter             0.250000
value               0.130895
volume              0.106538
close               1.068187
value_last_close    0.113802
dtype: float64

In [108]:
87825600 / 71640000

1.225929648241206

In [109]:
df = df_all.query("isin==502901").query("year==2022 & quarter==1").sort_values("date")
df
"volume", f"{df.volume.sum():,}"
"last close", f"{df.close.iloc[-1]:,}"
"value", f"{df.close.iloc[-1]*df.volume.sum():,}"

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
320,JAMSHRI,502901,BSE,2022-01-03,3815.5,4200.0,3815.5,4190.0,14,2022,1,3,202201,1
322,JAMSHRI,502901,BSE,2022-01-10,4390.0,4390.0,4200.0,4370.0,5,2022,1,10,202201,1
317,JAMSHRI,502901,BSE,2022-01-12,4151.5,4580.0,4151.5,4580.0,12,2022,1,12,202201,1
316,JAMSHRI,502901,BSE,2022-01-14,4580.0,4580.0,4351.0,4351.0,2,2022,1,14,202201,1
322,JAMSHRI,502901,BSE,2022-01-17,4140.0,4140.0,4140.0,4140.0,2,2022,1,17,202201,1
315,JAMSHRI,502901,BSE,2022-01-18,3933.0,3950.0,3933.0,3950.0,3,2022,1,18,202201,1
314,JAMSHRI,502901,BSE,2022-01-19,4147.0,4147.0,4147.0,4147.0,2,2022,1,19,202201,1
314,JAMSHRI,502901,BSE,2022-01-21,4300.0,4300.0,4300.0,4300.0,1,2022,1,21,202201,1
320,JAMSHRI,502901,BSE,2022-01-24,4086.0,4086.0,4086.0,4086.0,1,2022,1,24,202201,1
314,JAMSHRI,502901,BSE,2022-01-27,4290.0,4290.0,4290.0,4290.0,2,2022,1,27,202201,1


('volume', '44')

('last close', '4,290.0')

('value', '188,760.0')