- typing
- documentation and comments
- test cases
- requirements and versioning

'data/bse/20211229.csv'

Note: Value = Closing price x volume.
1. 3x value current month over previous month.
2. 2x value current month over previous month, "TWICE in the last six months".
3. 2x value current quarter over previous quarter.
4. 52 week highs in NSE
5. 52 week highs in BSE
.
.
.
Data for these filtered scrips (in the same order in vertical columns):
Symbol
ISIN
Filter (the criterion that matched, e.g. 52 week high / 3xM / 2xM / 2xM twice)
Market Cap
Return on Net worth for the last 5 years (consolidated)
Return on Net worth for the last 5 years (standalone)
Net sales/Income from operations for the last 5 years (consolidated)
Net P/L After M.I & Associates for the last 5 years (consolidated)
Net sales/Income from operations for the last 5 years (standalone)
Net Profit/(Loss) for the period for the last 5 years (standalone)
Total Debt/Equity (X) (consolidated)
Total Debt/Equity (X) (standalone)
Date of filtration

Diff market cap for bse and nse

sudo apt-get install chromium-chromedriver

pip3 install -U selenium
pip3 install webdriver_manager


In [1]:
import requests
import pandas
import pendulum
from pendulum import today, Date
from pathlib import Path
from typing import Union
import zipfile
import tempfile
import os
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import logging
from requests.exceptions import HTTPError, ReadTimeout, Timeout
import glob
from copy import copy

# import scrapy
# from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from tqdm import tqdm

tqdm.pandas()

# from pandarallel import pandarallel

# pandarallel.initialize()


pd.set_option("display.max_columns", None)

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
%reload_ext lab_black

In [2]:
logging.getLogger().setLevel(logging.INFO)

# Downloader

In [3]:
class StockDownloader(object):
    """Download, normalise and maintain daily bhavcopy CSVs for one exchange.

    This base class is not usable on its own. Subclasses must provide:

    * ``download_path`` - folder holding one ``YYYYMMDD.csv`` file per day
    * ``exchange`` - label written to the ``exchange`` column and to log lines
    * ``exclude_days`` - ``YYYYMMDD`` strings that are never downloaded
    * ``make_url_func(date)`` - URL of the zipped bhavcopy for ``date``
    * ``reformat(df)`` - maps the raw columns onto the common column names

    Args:
        timeout: Seconds passed to ``requests.get`` for every download.
    """

    def __init__(self, timeout: int = 2):
        self.timeout = timeout
        # Seed an empty (or missing) data folder with two years of history.
        if not glob.glob(f"{self.download_path}/*.csv"):
            self.download_past_two_years()

    def download_data_for_date(self, date: "Date", replace=False):
        """Fetch, normalise and store the bhavcopy for a single day.

        The zip is parsed straight from a self-deleting temporary file and only
        the cleaned result is written to ``download_path``, so a failed
        download or reformat never leaves a raw or partial CSV behind.

        Failures are logged rather than raised so one bad day does not abort a
        whole date range.

        Args:
            date: Day to download (must offer ``format``, ``date``, ``year``,
                ``month`` and ``day``).
            replace: Re-download even when the day's CSV already exists.
        """
        file_name = date.format("YYYYMMDD") + ".csv"
        target_dir = Path(self.download_path)
        # The folder may not exist yet on a first run.
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / file_name

        if target.exists() and not replace:
            logging.info(
                f"{self.exchange} data for {date.format('DD MMM, YYYY.')} already present"
            )
            return

        col_order = [
            "symbol",
            "isin",
            "exchange",
            "date",
            "open",
            "high",
            "low",
            "close",
            "volume",
            "year",
            "month",
            "day",
            "ym",
        ]
        try:
            response = requests.get(
                self.make_url_func(date),
                allow_redirects=True,
                timeout=self.timeout,
                headers={"User-Agent": "firefox"},
            )
            response.raise_for_status()

            # TemporaryFile is removed automatically when the block exits.
            with tempfile.TemporaryFile(suffix=".zip") as archive:
                archive.write(response.content)
                archive.seek(0)
                with zipfile.ZipFile(archive) as zipdata:
                    members = zipdata.infolist()
                    if not members:
                        raise ValueError("downloaded archive is empty")
                    # Previously every member was extracted under the same
                    # file name, so the last member is the one that was kept.
                    with zipdata.open(members[-1]) as member:
                        raw = pd.read_csv(member)

            df = (
                raw.pipe(self.reformat)
                .assign(
                    date=date.date(),
                    year=date.year,
                    month=date.month,
                    day=date.day,
                    ym=f"{date.year}{date.month:02}",
                )
                .rename(columns=str.lower)
                .query('series == "EQ"')
                .loc[:, col_order]
            )
            df.to_csv(target, index=False)
            logging.info(
                f"Downloaded {self.exchange} data for {date.format('DD MMM, YYYY.')}"
            )

        except HTTPError as err:
            status = err.response.status_code if err.response is not None else None
            if status == 404:
                logging.info(
                    f"No {self.exchange} data available on {date.format('DD MMM, YYYY.')}"
                )
            else:
                # Other HTTP failures used to be dropped without any trace.
                logging.warning(
                    f"{self.exchange} download for {date.format('DD MMM, YYYY.')} "
                    f"failed with HTTP status {status}"
                )
        except (ReadTimeout, Timeout):
            logging.info(
                f"No {self.exchange} data available on {date.format('DD MMM, YYYY.')}"
            )
        except Exception as err:
            logging.warning(
                f"{self.exchange} data not available on {date.format('DD MMM, YYYY.')}"
            )
            # Python 3 exceptions have no ``.message`` attribute.
            logging.warning(f"{type(err).__name__}: {err}")

    def prune_data(self, prune_weeks):
        """Delete stored CSVs dated more than ``prune_weeks`` weeks before today."""
        thresh = int(today().subtract(weeks=prune_weeks).format("YYYYMMDD"))
        for day in self.days_present:
            if day < thresh:
                os.remove(Path(self.download_path) / f"{day}.csv")

    @property
    def days_present(self):
        """Days already on disk, as ``YYYYMMDD`` integers (unordered)."""
        return [int(csv.stem) for csv in Path(self.download_path).glob("*.csv")]

    def update_data(self, prune_weeks=0):
        """Download every day after the newest stored one, then optionally prune.

        Args:
            prune_weeks: When non-zero, files older than this many weeks are
                removed after the update.
        """
        present = self.days_present
        if not present:
            # Nothing on disk (e.g. the initial backfill failed): start over.
            self.download_past_two_years()
        else:
            start_date = pendulum.from_format(str(max(present)), "YYYYMMDD").add(
                days=1
            )
            if start_date < today():
                self.download_date_range(start_date, today())
        if prune_weeks:
            self.prune_data(prune_weeks)

    def download_date_range(self, start_date: "Date", end_date: "Date"):
        """Download all business days from ``start_date`` to ``end_date`` inclusive.

        Days listed in ``exclude_days`` are skipped.
        """
        assert start_date < end_date, "Start must be before end"
        business_days = pd.date_range(start_date.date(), end_date.date(), freq="B")
        dates = [
            pendulum.DateTime(d.year, d.month, d.day)
            for d in business_days
            if d.strftime("%Y%m%d") not in self.exclude_days
        ]
        # The work is network-bound, so threads are enough and avoid having to
        # pickle the bound method; list() drains the lazy map so every download
        # has finished (and any error has surfaced) before returning.
        with ThreadPoolExecutor() as executor:
            list(executor.map(self.download_data_for_date, dates))

    def download_past_two_years(self):
        """Download the two years of business days ending today."""
        self.download_date_range(today().subtract(years=2), today())

    def download_last_n_weeks(self, n_weeks):
        """Download the last ``n_weeks`` weeks of business days ending today."""
        self.download_date_range(today().subtract(weeks=n_weeks), today())

In [4]:
class NseDownloader(StockDownloader):
    """Downloader configured for the NSE equities bhavcopy archive."""

    download_path = Path("data/nse")
    exchange = "NSE"
    exclude_days = []

    def make_url_func(self, date: Date):
        """Build the archive URL of the zipped NSE bhavcopy for ``date``."""
        month_dir = date.format("MMM").upper()
        day_stamp = date.format("DDMMMYYYY").upper()
        return f"https://archives.nseindia.com/content/historical/EQUITIES/{date.year}/{month_dir}/cm{day_stamp}bhav.csv.zip"

    def reformat(self, df):
        """Rename ``TOTTRDQTY`` to ``volume`` and tag every row with the exchange."""
        renamed = df.rename(columns={"TOTTRDQTY": "volume"})
        return renamed.assign(exchange=self.exchange)


class BseDownloader(StockDownloader):
    """Downloader configured for the BSE equity bhavcopy files."""

    download_path = Path("data/bse")
    exchange = "BSE"
    exclude_days = ["20211229"]

    def make_url_func(self, date: Date):
        """Build the URL of the zipped BSE bhavcopy for ``date``."""
        day_stamp = date.format("DDMMYY").upper()
        return f"https://www.bseindia.com/download/BhavCopy/Equity/EQ{day_stamp}_CSV.zip"

    def reformat(self, df):
        """Map the BSE column names onto the common ones and tag the exchange.

        ``SC_CODE`` is stored in the ``isin`` column and ``SC_TYPE`` in
        ``series``; a series value of ``"Q"`` is relabelled ``"EQ"``.
        """
        column_map = {
            "NO_OF_SHRS": "volume",
            "SC_NAME": "symbol",
            "SC_CODE": "isin",
            "SC_TYPE": "series",
        }
        renamed = df.rename(columns=column_map)
        return renamed.assign(
            exchange=self.exchange,
            series=lambda frame: np.where(frame.series == "Q", "EQ", frame.series),
        )

In [5]:
# Instantiating a downloader starts a two-year backfill when its data folder
# holds no CSV files yet (see StockDownloader.__init__).
nse_downloader = NseDownloader()
bse_downloader = BseDownloader()

In [6]:
# Fetch NSE days newer than the latest CSV on disk, then drop files older than 80 weeks.
nse_downloader.update_data(prune_weeks=80)

In [7]:
# Fetch BSE days newer than the latest CSV on disk, then drop files older than 80 weeks.
bse_downloader.update_data(prune_weeks=80)

In [8]:
# df_nse = pd.concat(map(pd.read_csv, glob.glob("data/nse/*.csv")))

# df_bse = pd.concat(map(pd.read_csv, glob.glob("data/bse/*.csv")))

In [9]:
# Stack every per-day CSV from all exchange folders into one frame, parse the
# date column and derive the calendar quarter from it.
df_all = pd.concat(
    [pd.read_csv(csv_path) for csv_path in glob.glob("data/*/*.csv")]
).assign(
    date=lambda raw: pd.to_datetime(raw["date"]),
    quarter=lambda raw: raw["date"].dt.quarter,
)

# Filters

In [32]:
# Evaluation ("backtrack") date: every filter below is computed as of this day.
backtrack_date = pendulum.from_format("20220225", "YYYYMMDD")
# backtrack_date = pendulum.today()
backtrack_date = pd.to_datetime(backtrack_date.naive())
# Latest day present in the loaded data, before any trimming.
max_date = df_all.date.max()
# NOTE: this overwrites df_all, so moving backtrack_date later again requires
# re-running the cell that loads df_all.
df_all = df_all.query("date <= @backtrack_date")
# Use the last day that actually has data on or before backtrack_date. Taking
# backtrack_date itself would select nothing in the "date == @filter_date"
# steps whenever it falls on a weekend or holiday earlier than max_date.
filter_date = df_all.date.max() if len(df_all) else backtrack_date

In [33]:
# Calendar position of the evaluation day.
current_month = filter_date.month
current_year = filter_date.year
# Was assigned filter_date.year by mistake; the quarter (1-4) is what is meant.
current_quarter = filter_date.quarter
max_date, filter_date, backtrack_date

(Timestamp('2022-02-24 00:00:00'),
 Timestamp('2022-02-24 00:00:00'),
 Timestamp('2022-02-25 00:00:00'))

## 300% value over previous month

In [12]:
# Window start: first day of the month before filter_date, so the data covers
# the previous month and the current one.
prev_month_first = (filter_date - pd.DateOffset(months=1)).replace(day=1)
grouping_vars = ["symbol", "isin", "exchange", "year", "month"]
df_300p_val_month = (
    df_all.loc[lambda d: d["date"] >= prev_month_first]
    # Daily traded value = closing price x volume.
    .assign(value=lambda d: d["close"] * d["volume"])
    .sort_values([*grouping_vars, "day"])
    .groupby(grouping_vars)
    # Monthly totals; rows are day-sorted, so the last close is the month's final close.
    .agg({"value": "sum", "volume": "sum", "close": lambda closes: closes.iloc[-1]})
    .reset_index()
    .sort_values(grouping_vars)
    # Previous row's figures per scrip, and the ratios against them.
    .assign(
        value_lag=lambda m: m.groupby(["symbol", "isin"])["value"].shift(1),
        volume_lag=lambda m: m.groupby(["symbol", "isin"])["volume"].shift(1),
        close_lag=lambda m: m.groupby(["symbol", "isin"])["close"].shift(1),
        value_ratio=lambda m: m["value"] / m["value_lag"],
        volume_ratio=lambda m: m["volume"] / m["volume_lag"],
        close_ratio=lambda m: m["close"] / m["close_lag"],
    )
    .dropna(subset=["value_lag"])
    # Keep: value more than tripled, close rose, and value above 2,000,000.
    .loc[
        lambda m: (m["value_ratio"] > 3)
        & (m["close_ratio"] > 1)
        & (m["value"] > 2_000_000)
    ]
)

In [13]:
df_300p_val_month.shape

(134, 14)

In [14]:
# expected_300_m = pd.read_excel("data/Book1.xlsx", sheet_name=1)["Ticker Symbol"]
# expected_300_m = set(expected_300_m[expected_300_m.notna()].tolist())
# actuals_300_m = set(
#     df_300p_val_month.query("value_ratio>3 & close_ratio>1 & value>2_000_000")[
#         "isin"
#     ].tolist()
# )

# expected_300_m.intersection(actuals_300_m), expected_300_m - actuals_300_m

({523371},
 {500252,
  502901,
  502958,
  503127,
  504084,
  504908,
  524542,
  526650,
  530215,
  531241,
  538708,
  539251,
  542918,
  961718,
  'ACCORD',
  'INE008Z01012',
  'INE086A01029',
  'INE126A01031',
  'INE171Z01018',
  'INE285J01028',
  'INE450G01024',
  'VINNY'})

In [15]:
# NOTE(review): expected_300_m and actuals_300_m are only assigned in the
# commented-out cell above, so on a fresh kernel this line raises NameError.
# Lists the filtered rows whose isin is in the expected-but-not-matched set.
df_300p_val_month.loc[df_300p_val_month["isin"].isin(expected_300_m - actuals_300_m)]

Unnamed: 0,symbol,isin,exchange,year,month,value,volume,close,value_lag,volume_lag,close_lag,value_ratio,volume_ratio,close_ratio


## 200% value over previous quarter

In [16]:
def previous_quarter(ref):
    """Return the first day of the calendar quarter before the one containing ``ref``.

    Args:
        ref: Any object exposing integer ``year`` and ``month`` attributes
            (``datetime.date``, ``pd.Timestamp``, ...).

    Returns:
        A timezone-naive ``pd.Timestamp`` at midnight on 1 Jan, 1 Apr, 1 Jul or
        1 Oct; for a date in the first quarter it is 1 Oct of the previous year.
    """
    # First month (1, 4, 7 or 10) of the quarter that contains ``ref``.
    quarter_start_month = 3 * ((ref.month - 1) // 3) + 1
    if quarter_start_month == 1:
        # The quarter before Q1 is Q4 of the preceding year.
        return pd.Timestamp(year=ref.year - 1, month=10, day=1)
    return pd.Timestamp(year=ref.year, month=quarter_start_month - 3, day=1)


# Window start: first day of the previous quarter, so the data covers that
# quarter and the current one.
prev_quarter_first = previous_quarter(filter_date)
grouping_vars = ["symbol", "isin", "exchange", "year", "quarter"]
df_200p_val_quarter = (
    df_all.loc[lambda d: d["date"] >= prev_quarter_first]
    # Daily traded value = closing price x volume.
    .assign(value=lambda d: d["close"] * d["volume"])
    .sort_values([*grouping_vars, "month", "day"])
    .groupby(grouping_vars)
    # Quarterly totals; rows are date-sorted, so the last close is the quarter's final close.
    .agg({"value": "sum", "volume": "sum", "close": lambda closes: closes.iloc[-1]})
    .reset_index()
    .sort_values(grouping_vars)
    # Previous row's figures per scrip, and the ratios against them.
    .assign(
        value_lag=lambda q: q.groupby(["symbol", "isin"])["value"].shift(1),
        volume_lag=lambda q: q.groupby(["symbol", "isin"])["volume"].shift(1),
        close_lag=lambda q: q.groupby(["symbol", "isin"])["close"].shift(1),
        value_ratio=lambda q: q["value"] / q["value_lag"],
        volume_ratio=lambda q: q["volume"] / q["volume_lag"],
        close_ratio=lambda q: q["close"] / q["close_lag"],
    )
    .dropna(subset=["value_lag"])
    # Keep: value more than doubled, close rose, and value above 6,000,000.
    .loc[
        lambda q: (q["value_ratio"] > 2)
        & (q["close_ratio"] > 1)
        & (q["value"] > 6_000_000)
    ]
)

In [17]:
# Reference list of tickers (first sheet of Book1.xlsx), blanks removed.
expected_200_q = set(
    pd.read_excel("data/Book1.xlsx")["Ticker Symbol"].dropna().tolist()
)
# Identifiers that pass the quarterly filter in this notebook.
actuals_200_q = set(
    df_200p_val_quarter.loc[
        lambda q: (q["value_ratio"] > 2)
        & (q["close_ratio"] > 1)
        & (q["value"] > 6_000_000)
    ]["isin"].tolist()
)

# (found in both, expected but not produced here)
expected_200_q & actuals_200_q, expected_200_q - actuals_200_q

({524687, 526865, 540404, 'INE592B01016', 'INE720A01015'},
 {502901, 504084, 509470, 531888, 'ACCORD', 'MITCON', 'SURANI', 'VINNY'})

In [18]:
df_200p_val_quarter.loc[
    df_200p_val_quarter["isin"].isin(expected_200_q - actuals_200_q)
]

Unnamed: 0,symbol,isin,exchange,year,quarter,value,volume,close,value_lag,volume_lag,close_lag,value_ratio,volume_ratio,close_ratio


## 52 Week High

In [19]:
# Start of the 52-week look-back window ending on filter_date.
date_52_weeks_prior = filter_date - pd.DateOffset(weeks=52)
grouping_vars = ["symbol", "isin", "exchange"]

# Highest "high" per scrip and exchange inside the window.
df_52_week_high_vals = (
    df_all.loc[lambda d: d["date"] >= date_52_weeks_prior]
    .sort_values([*grouping_vars, "year", "month", "day"])
    .groupby(grouping_vars)
    .agg({"high": "max"})
)

# Scrips whose high on filter_date equals their window maximum.
df_52_week_highs = (
    df_all.loc[lambda d: d["date"] == filter_date]
    .merge(df_52_week_high_vals, how="left", on=grouping_vars, suffixes=("", "_max"))
    .loc[lambda d: d["high"] == d["high_max"]]
    .drop(columns="high_max")
)

In [20]:
df_52_week_highs

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
242,BSLGOLDETF,INF209KB18D3,NSE,2022-02-24,45.50,47.90,45.50,46.87,129211,2022,2,24,202202,1
476,GENESYS,INE727B01026,NSE,2022-02-24,463.15,496.00,463.15,472.75,254742,2022,2,24,202202,1
644,ICICISILVE,INF109KC1Y56,NSE,2022-02-24,66.34,69.95,66.34,69.30,1155073,2022,2,24,202202,1
821,KOTAKGOLD,INF174KA1HJ8,NSE,2022-02-24,44.28,44.69,43.61,44.56,1741023,2022,2,24,202202,1
945,MEGASTAR,INE00EM01016,NSE,2022-02-24,159.20,159.20,152.25,159.20,90596,2022,2,24,202202,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4889,EASUN,542906,BSE,2022-02-24,32.85,32.85,32.85,32.85,1,2022,2,24,202202,1
4925,BHARTIA,543208,BSE,2022-02-24,19.86,19.86,19.86,19.86,100,2022,2,24,202202,1
5090,ICICISILVE,543452,BSE,2022-02-24,70.00,70.00,66.80,68.95,115178,2022,2,24,202202,1
5132,GOLDBEES,590095,BSE,2022-02-24,43.40,44.81,43.40,44.63,2511536,2022,2,24,202202,1


## 200% Value Twice in 6 Months 

In [21]:
# First day of the month six months before filter_date: start of the look-back window.
date_6mos_prior = (filter_date - pd.DateOffset(months=6)).replace(day=1)

In [22]:
grouping_vars = ["symbol", "isin", "exchange", "year", "month"]
df_200p_val_twice_filter = (
    df_all.loc[lambda d: d["date"] >= date_6mos_prior]
    # Daily traded value = closing price x volume.
    .assign(value=lambda d: d["close"] * d["volume"])
    .sort_values([*grouping_vars, "day"])
    .groupby(grouping_vars)
    # Monthly totals; rows are day-sorted, so the last close is the month's final close.
    .agg({"value": "sum", "volume": "sum", "close": lambda closes: closes.iloc[-1]})
    .reset_index()
    .sort_values(grouping_vars)
    # Previous row's figures per scrip, and the ratios against them.
    .assign(
        value_lag=lambda m: m.groupby(["symbol", "isin"])["value"].shift(1),
        volume_lag=lambda m: m.groupby(["symbol", "isin"])["volume"].shift(1),
        close_lag=lambda m: m.groupby(["symbol", "isin"])["close"].shift(1),
        value_ratio=lambda m: m["value"] / m["value_lag"],
        volume_ratio=lambda m: m["volume"] / m["volume_lag"],
        close_ratio=lambda m: m["close"] / m["close_lag"],
    )
    .dropna(subset=["value_lag"])
    # Months where value more than doubled, close rose, and value is above 2,000,000.
    .loc[
        lambda m: (m["value_ratio"] > 2)
        & (m["close_ratio"] > 1)
        & (m["value"] > 2_000_000)
    ]
    # Count qualifying months per scrip and keep those with at least two.
    .groupby(["symbol", "isin", "exchange"])
    .agg({"close_ratio": "count"})
    .reset_index()
    .rename(columns={"close_ratio": "n_double"})
    .loc[lambda counts: counts["n_double"] >= 2]
    .drop(columns="n_double")
)

# filter_date rows for the scrips that qualified (joined on the shared columns).
df_200p_val_twice = df_all.loc[lambda d: d["date"] == filter_date].merge(
    df_200p_val_twice_filter, how="inner"
)

In [23]:
# One row per scrip and exchange, with every filter it matched joined into a
# single comma-separated "filter" label.
# NOTE(review): the last entry uses df_200p_val_twice_filter; df_200p_val_twice
# (limited to scrips present on filter_date) is built earlier but not used
# here - confirm which one is intended.
df_all_filtered = (
    pd.concat(
        [
            hits.assign(filter=label)
            for label, hits in (
                ("300% value over prior month", df_300p_val_month),
                ("200% value over prior quarter", df_200p_val_quarter),
                ("52 week high", df_52_week_highs),
                ("200% twice in 6 months", df_200p_val_twice_filter),
            )
        ]
    )
    .groupby(["symbol", "isin", "exchange"])
    .agg({"filter": lambda labels: labels.str.cat(sep=", ")})
    .reset_index()
)

In [30]:
# YYYYMMDD stamp of the backtrack date, used in the output file name.
date_str = backtrack_date.strftime("%Y%m%d")

In [31]:
# to_excel does not create missing folders, so make sure the target exists.
Path("data/filtered").mkdir(parents=True, exist_ok=True)
df_all_filtered.to_excel(f"data/filtered/{date_str}.xlsx", index=False)

In [26]:
# df_all_filtered.to_excel(f"data/df_filtered_latest.xlsx", index=False)

# Scraping

In [27]:
# Small reproducible sample (3 rows, fixed seed) for trying out the scraper.
# NOTE(review): sample(3) raises ValueError when df_all_filtered has fewer than 3 rows.
df_filtered = df_all_filtered.sample(3, random_state=3)

In [28]:
class PageFinder(object):
    """Find a scrip on moneycontrol.com and scrape its headline financials.

    Construction does all the work: the search page is queried first with the
    ISIN and then with progressively shorter prefixes of the symbol. For the
    first term whose result page contains ``check_element``, the market cap,
    ratios and yearly results are scraped into ``props``.

    Class-level setup (runs once, when the class body is executed):

    * ``keys_dict`` maps each ``field`` in ``data/keys.csv`` to its
      ``identifier``, which is used as a CSS selector.
    * ``browser`` is a single headless Chrome session shared by all instances;
      call ``PageFinder.quit_browser()`` when scraping is finished.

    Attributes:
        url: Search URL that matched, or ``None`` if nothing was found.
        props: Scraped values keyed by field name; empty if nothing was found.
    """

    base_search_url = "http://www.moneycontrol.com/stocks/cptmarket/compsearchnew.php?topsearch_type=1&search_str="

    # Seconds allowed for the plain-HTTP validation request; without a limit a
    # stalled connection would block the whole scraping run indefinitely.
    request_timeout = 10

    keys_dict = pd.read_csv("data/keys.csv").set_index("field").to_dict(orient="index")
    keys_dict = {k: v["identifier"] for k, v in keys_dict.items()}
    # Marker searched for in the raw HTML to decide whether a term matched.
    check_element = keys_dict["market_cap"].replace(".", "")

    ##### Selenium Setup #####
    options = webdriver.ChromeOptions()
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--incognito")
    options.add_argument("--headless")
    browser = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )

    def __init__(self, isin, symbol):
        self.isin, self.symbol = str(isin), str(symbol)
        self.url = self.home_content = self.ratios_url = None

        self.props = dict()
        self.try_finding_info()

    @classmethod
    def quit_browser(cls):
        """Shut down the shared headless browser once scraping is finished."""
        cls.browser.quit()

    def make_url(self, term):
        """Return the search URL for ``term``."""
        return self.base_search_url + term

    def get_parsed_content(self, url):
        """Load ``url`` in the shared browser and return the parsed page."""
        self.browser.get(url)
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
        # installed; pin one explicitly if results must be reproducible.
        return BeautifulSoup(self.browser.page_source)

    def validate_and_gather_info(self, term):
        """Try one search term; scrape everything if it lands on a company page.

        Returns:
            ``True`` when the page matched and ``props`` was filled, ``False``
            when the term did not match or the validation request failed.
        """
        url = self.make_url(term)
        try:
            content = requests.get(url, timeout=self.request_timeout).content
        except requests.exceptions.RequestException as err:
            # Treat a failed or timed-out request like a miss so the next
            # search term still gets its chance.
            logging.warning(f"Search request for {self.symbol} failed: {err}")
            return False
        if self.check_element not in str(content):
            return False

        logging.info(f"Gathering Data for {self.symbol}")
        self.url = url
        soup_home = self.get_parsed_content(url)
        market_cap = soup_home.select_one(
            self.keys_dict["market_cap"].replace(" ", ".")
        ).text.replace(",", "")
        self.props["market_cap"] = float(market_cap)
        self.props["BLANK"] = ""

        # get_financials derives its URLs from attributes set by get_ratios,
        # so the order of these two calls matters.
        self.get_ratios(soup_home)
        self.get_financials()
        return True

    def parse_series(self, content, selector):
        """Return the numbers matched by ``keys_dict[selector]`` in ``content``.

        Thousands separators are removed; cells that are not numeric become
        ``None`` so positions stay aligned.
        """
        values = []
        for cell in content.select(self.keys_dict[selector]):
            try:
                values.append(float(cell.text.replace(",", "")))
            except ValueError:
                values.append(None)
        return values

    def get_ratios(self, soup):
        """Scrape the ``*_rnw`` series from the consolidated and standalone ratio pages."""
        self.standalone_ratios_url = soup.select_one(self.keys_dict["ratios_url"])[
            "href"
        ]
        self.consolidated_ratios_url = self.standalone_ratios_url.replace(
            "ratiosVI", "consolidated-ratiosVI"
        )
        for name, url in zip(
            ["consolidated", "standalone"],
            [self.consolidated_ratios_url, self.standalone_ratios_url],
        ):
            content = self.get_parsed_content(url)
            self.props[name + "_rnw"] = self.parse_series(content, name + "_rnw")

    def get_financials(self):
        """Scrape the ``sr``, ``np`` and ``de`` series from the yearly results pages.

        Must run after ``get_ratios``, whose URLs are rewritten here.
        """
        self.standalone_financials_url = self.standalone_ratios_url.replace(
            "ratiosVI", "results/yearly"
        )
        # Original (misspelt) attribute name kept so existing readers still work.
        self.standlone_financials_url = self.standalone_financials_url
        self.consolidated_financials_url = self.consolidated_ratios_url.replace(
            "consolidated-ratiosVI", "results/consolidated-yearly"
        )

        for name, url in zip(
            ["consolidated", "standalone"],
            [self.consolidated_financials_url, self.standalone_financials_url],
        ):
            content = self.get_parsed_content(url)
            for metric in ["sr", "np", "de"]:
                self.props[f"{name}_{metric}"] = self.parse_series(
                    content, f"{name}_{metric}"
                )

    def try_finding_info(self):
        """Search by ISIN, then by ever shorter symbol prefixes (down to 4 characters)."""
        search_terms = [self.isin] + [
            self.symbol[:i] for i in range(len(self.symbol), 3, -1)
        ]
        for term in search_terms:
            if self.validate_and_gather_info(term):
                logging.info(f"Found data for {self.symbol}")
                return
        logging.warning(f"Could not find data for {self.symbol}")

    def __repr__(self):
        return f"PageFinder({self.isin}, {self.symbol}, {self.url})"



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_linux64.zip
Driver has been saved in cache [/home/ilangurudev/.wdm/drivers/chromedriver/linux64/98.0.4758.102]


In [39]:
# df_filtered

In [37]:
# Scrape each sampled scrip (with a progress bar) and keep the resulting
# PageFinder object in a new "pf" column.
df_filtered = df_filtered.assign(
    pf=df_filtered.progress_apply(
        lambda scrip: PageFinder(scrip["isin"], scrip["symbol"]), axis=1
    )
)

  0%|                                                    | 0/3 [00:00<?, ?it/s]INFO:root:Gathering Data for LOVABLE
INFO:root:Found data for LOVABLE
 67%|█████████████████████████████▎              | 2/3 [00:26<00:13, 13.21s/it]INFO:root:Gathering Data for LANDMRK PRO 
INFO:root:Found data for LANDMRK PRO 
100%|████████████████████████████████████████████| 3/3 [00:41<00:00, 14.09s/it]INFO:root:Gathering Data for KINGSINFRA  
INFO:root:Found data for KINGSINFRA  
100%|████████████████████████████████████████████| 3/3 [00:56<00:00, 18.96s/it]


In [42]:
df_filtered.pf.apply(lambda pf: pf.props)

631    {'market_cap': 205.0, 'BLANK': '', 'consolidat...
615    {'market_cap': 99.0, 'BLANK': '', 'consolidate...
581    {'market_cap': 0.0, 'BLANK': '', 'consolidated...
Name: pf, dtype: object

In [423]:
# `df` was never defined before this point, so a top-to-bottom run failed here.
# Build it from the scraped props: one column per property, same index as
# df_filtered so the final concat lines up.
# NOTE(review): reconstructed from the column layout of the output below
# (market_cap, BLANK, then the list-valued series) - confirm.
df = pd.DataFrame(
    df_filtered["pf"].apply(lambda pf: pf.props).tolist(), index=df_filtered.index
)
# Everything after the first two columns (market_cap, BLANK) holds a list per row.
cols = df.columns.tolist()[2:]

In [425]:
# Spread each list-valued column into five numbered columns (suffix 5 down to
# 1), followed by an empty spacer column, then drop the original list columns.
for spacer_no, series_col in enumerate(cols):
    label = series_col.replace("_", " ")
    expanded = pd.DataFrame(df[series_col].tolist(), index=df.index)
    df[[f"{label}{k}" for k in range(5, 0, -1)]] = expanded
    df[f"BLANK {spacer_no}"] = ""

df = df.drop(columns=cols)

In [426]:
# Side-by-side view: filter results (without the PageFinder objects) next to the expanded scraped columns.
pd.concat([df_filtered.drop(columns="pf"), df], axis=1)

Unnamed: 0,symbol,isin,exchange,filter,market_cap,BLANK,consolidated rnw5,consolidated rnw4,consolidated rnw3,consolidated rnw2,consolidated rnw1,BLANK 0,standalone rnw5,standalone rnw4,standalone rnw3,standalone rnw2,standalone rnw1,BLANK 1,consolidated sr5,consolidated sr4,consolidated sr3,consolidated sr2,consolidated sr1,BLANK 2,consolidated np5,consolidated np4,consolidated np3,consolidated np2,consolidated np1,BLANK 3,consolidated de5,consolidated de4,consolidated de3,consolidated de2,consolidated de1,BLANK 4,standalone sr5,standalone sr4,standalone sr3,standalone sr2,standalone sr1,BLANK 5,standalone np5,standalone np4,standalone np3,standalone np2,standalone np1,BLANK 6,standalone de5,standalone de4,standalone de3,standalone de2,standalone de1,BLANK 7
431,GUJ AMB EXP,524226,BSE,200% value over prior quarter,4267.0,,20.4,10.99,11.31,10.16,14.73,,26.46,13.73,22.84,16.63,24.23,,4705.3,3816.59,2728.43,2524.49,3080.42,,338.07,145.84,103.66,84.09,111.41,,,,,,,,4705.3,3816.59,4021.44,3364.43,3330.81,,338.14,145.84,198.15,179.88,158.55,,452.47,181.35,269.55,232.16,199.37,
1070,SIMRAN FARMS,519566,BSE,200% twice in 6 months,0.0,,73.63,-495.28,15.62,18.31,5.43,,136.93,-446.51,28.38,36.56,37.87,,322.08,281.12,319.74,340.97,410.03,,12.6,-21.51,4.07,4.04,1.05,,0.02,,-0.11,,,,322.08,281.12,319.92,340.97,410.03,,12.49,-21.54,4.04,3.94,0.95,,21.53,-29.27,6.11,5.33,1.45,
1278,VISHAL,538598,BSE,200% value over prior quarter,670.0,,,,,,,,12.45,15.64,9.8,11.1,8.42,,,,,,,,,,,,,,,,,,,,967.54,1296.84,998.49,890.94,373.64,,18.1,30.1,17.86,15.63,10.54,,29.75,43.36,20.51,19.59,18.51,


# Testing & Validation

In [32]:
# df_filtered_invalid = df_filtered.query("url.isna()", engine="python")
# df_filtered_valid = df_filtered.query("url.notna()", engine="python")
# df_filtered_invalid.to_excel(f"data/filtered/df_invalid_{date_str}.xlsx")
# df_filtered_valid.to_excel(f"data/filtered/df_valid_{date_str}.xlsx")
# df_filtered_invalid.to_excel("data/df_filtered_invalid.xlsx")
# df_filtered_valid.to_excel("data/df_filtered_valid.xlsx")

In [86]:
df_all.query("isin==504084")

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
353,KAYCEE IND,504084,BSE,2021-08-06,4000.0,4170.00,3936.00,3938.40,34,2021,8,6,202108,3
351,KAYCEE IND,504084,BSE,2021-07-23,4393.6,4393.60,4151.00,4151.00,7,2021,7,23,202107,3
350,KAYCEE IND,504084,BSE,2021-08-31,3935.0,3935.00,3745.05,3745.05,11,2021,8,31,202108,3
356,KAYCEE IND,504084,BSE,2021-10-05,3703.0,4040.00,3701.00,4040.00,29,2021,10,5,202110,4
344,KAYCEE IND,504084,BSE,2020-12-17,2960.0,2960.00,2750.00,2750.00,11,2020,12,17,202012,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,KAYCEE IND,504084,BSE,2021-08-05,4075.0,4076.95,4050.00,4050.00,6,2021,8,5,202108,3
352,KAYCEE IND,504084,BSE,2021-09-29,3851.0,4044.00,3851.00,4044.00,4,2021,9,29,202109,3
358,KAYCEE IND,504084,BSE,2021-06-25,3849.9,3854.55,3849.90,3854.55,23,2021,6,25,202106,2
355,KAYCEE IND,504084,BSE,2021-05-17,3000.0,3044.00,2900.00,3030.00,29,2021,5,17,202105,2


In [87]:
# Spot check: which of these symbols occur in the data, and with which isin.
x = ["ACCORD", "BDL", "EIDPARRY", "ELECTCAST", "SIS", "SRPL", "VINNY", "VIPCLOTHNG"]
df_all.query("symbol.isin(@x)", engine="python")[["symbol", "isin"]].drop_duplicates()

Unnamed: 0,symbol,isin
176,BDL,INE171Z01018
364,EIDPARRY,INE126A01031
369,ELECTCAST,INE086A01029
1246,SIS,INE285J01028
1474,VIPCLOTHNG,INE450G01024
1270,SRPL,INE008Z01012


In [92]:
#  .assign(value=lambda df: df.close * df.volume)
#     .sort_values(grouping_vars + ["month", "day"])
#     .groupby(grouping_vars)
#     .agg({"value": sum, "volume": sum, "close": lambda x: x.iloc[-1]})
#     .reset_index()
#     .sort_values(grouping_vars)
#     .assign(
#         value_lag=lambda df: df.groupby(["symbol", "isin"])["value"].shift(1),
#         volume_lag=lambda df: df.groupby(["symbol", "isin"])["volume"].shift(1),
#         close_lag=lambda df: df.groupby(["symbol", "isin"])["close"].shift(1),
#         value_ratio=lambda df: df.value / df.value_lag,
#         volume_ratio=lambda df: df.volume / df.volume_lag,
#         close_ratio=lambda df: df.close / df.close_lag,
#     )

In [107]:
# Manual check of the quarterly aggregation for a single scrip (isin 502901)
# from 2021-10-01 onwards.
df_tmp = (
    df_all.loc[lambda d: (d["isin"] == 502901) & (d["date"] >= "2021-10-01")]
    .sort_values(["year", "quarter", "month", "day"])
    .assign(value=lambda d: d["close"] * d["volume"])
    .groupby(["year", "quarter"])
    .agg({"value": "sum", "volume": "sum", "close": lambda closes: closes.iloc[-1]})
    .reset_index()
    # Alternative definition of value: last close x total volume.
    .assign(value_last_close=lambda q: q["close"] * q["volume"])
)

df_tmp

# Second row divided by the first row, column by column.
print("ratios")
df_tmp.iloc[1] / df_tmp.iloc[0]

Unnamed: 0,year,quarter,value,volume,close,value_last_close
0,2021,4,1448204.15,413,4016.15,1658669.95
1,2022,1,189562.0,44,4290.0,188760.0


ratios


year                1.000495
quarter             0.250000
value               0.130895
volume              0.106538
close               1.068187
value_last_close    0.113802
dtype: float64

In [108]:
87825600 / 71640000

1.225929648241206

In [109]:
# Manual check for isin 502901 in Q1 2022: total volume, last close and their product.
# NOTE: this rebinds the global name `df`, which the column-expansion cells in
# the Scraping section also use.
df = df_all.query("isin==502901").query("year==2022 & quarter==1").sort_values("date")
df
"volume", f"{df.volume.sum():,}"
"last close", f"{df.close.iloc[-1]:,}"
"value", f"{df.close.iloc[-1]*df.volume.sum():,}"

Unnamed: 0,symbol,isin,exchange,date,open,high,low,close,volume,year,month,day,ym,quarter
320,JAMSHRI,502901,BSE,2022-01-03,3815.5,4200.0,3815.5,4190.0,14,2022,1,3,202201,1
322,JAMSHRI,502901,BSE,2022-01-10,4390.0,4390.0,4200.0,4370.0,5,2022,1,10,202201,1
317,JAMSHRI,502901,BSE,2022-01-12,4151.5,4580.0,4151.5,4580.0,12,2022,1,12,202201,1
316,JAMSHRI,502901,BSE,2022-01-14,4580.0,4580.0,4351.0,4351.0,2,2022,1,14,202201,1
322,JAMSHRI,502901,BSE,2022-01-17,4140.0,4140.0,4140.0,4140.0,2,2022,1,17,202201,1
315,JAMSHRI,502901,BSE,2022-01-18,3933.0,3950.0,3933.0,3950.0,3,2022,1,18,202201,1
314,JAMSHRI,502901,BSE,2022-01-19,4147.0,4147.0,4147.0,4147.0,2,2022,1,19,202201,1
314,JAMSHRI,502901,BSE,2022-01-21,4300.0,4300.0,4300.0,4300.0,1,2022,1,21,202201,1
320,JAMSHRI,502901,BSE,2022-01-24,4086.0,4086.0,4086.0,4086.0,1,2022,1,24,202201,1
314,JAMSHRI,502901,BSE,2022-01-27,4290.0,4290.0,4290.0,4290.0,2,2022,1,27,202201,1


('volume', '44')

('last close', '4,290.0')

('value', '188,760.0')