In [6]:
# Imports and notebook-wide configuration.
import pandas as pd
from bs4 import BeautifulSoup
# Python 3's urllib.request, bound to the legacy `urllib2` name used below.
import urllib.request as urllib2
from io import StringIO
import datetime as dt
import hgutils
import time
# Notebook-wide timer; later cells call stopwatch.start()/stop()/print() on it.
stopwatch = hgutils.timer("AMFI Data Scraper")
import threading
import concurrent.futures
# NOTE(review): fastparquet / pyarrow are not referenced by name in the visible
# cells; presumably imported so pandas' parquet engines are available — confirm.
import fastparquet
import pyarrow
import numpy as np
import plotly.express as px
import plotly.io as pio
# Bare expression: not the last line of the cell, so it displays nothing here.
pio.renderers
# Default renderer for plotly figures in this notebook.
pio.renderers.default = "notebook_connected"

In [7]:
def date_range(start_date, end_date, frequency=1):
    """Yield dates from ``start_date`` up to and including ``end_date``.

    Args:
        start_date: First date to yield (any object supporting ``<=`` and
            addition of a ``datetime.timedelta``).
        end_date: Upper bound (inclusive) of the yielded dates.
        frequency: Step between consecutive dates, in days. Must be positive.

    Yields:
        ``start_date``, ``start_date + frequency`` days, ... for as long as
        the value does not exceed ``end_date``. Nothing is yielded when
        ``start_date`` is already past ``end_date``.

    Raises:
        ValueError: If ``frequency`` is not positive (a zero or negative step
            would never reach ``end_date``). Raised on first iteration.
    """
    if frequency <= 0:
        raise ValueError("frequency must be a positive number of days")
    step = dt.timedelta(days=frequency)
    current_date = start_date
    while current_date <= end_date:
        yield current_date
        current_date = current_date + step

def generate_amfi_url(date, base_url='https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?frmdt='):
    """Return the report URL for ``date``.

    The URL is ``base_url`` followed by the date rendered with the
    ``%d-%b-%Y`` strftime pattern (e.g. ``01-Jan-2009``).
    """
    return base_url + date.strftime("%d-%b-%Y")

def get_day_df(url, break_minutes=5, scheme_keyword="Axis"):
    """Download one report from ``url`` and return it as a filtered DataFrame.

    Args:
        url: Address of a ';'-separated report (see ``generate_amfi_url``).
        break_minutes: Unused here; kept so existing callers that pass it
            keep working.
        scheme_keyword: If truthy, only rows whose 'Scheme Name' matches this
            pattern are kept (``str.contains`` semantics, i.e. a regex).
            Pass ``None`` or ``""`` to keep every scheme.

    Returns:
        DataFrame restricted to the scheme code/name, NAV, ISIN and date
        columns, with rows lacking a 'Net Asset Value' removed.
    """
    # Use the response as a context manager so the connection is always closed.
    with urllib2.urlopen(url) as page:
        raw = page.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(raw, "html.parser")
    df = pd.read_csv(StringIO(soup.get_text()), sep=';')
    # List form of `subset` works on every pandas version.
    df = df.dropna(subset=['Net Asset Value'])
    cols_to_keep = ['Scheme Code', 'Scheme Name',
        'Net Asset Value', 'ISIN Div Payout/ISIN Growth',
        'ISIN Div Reinvestment', 'Date']
    df = df[cols_to_keep]
    if scheme_keyword:
        # na=False: a missing scheme name counts as "no match" instead of
        # producing an NA in the boolean mask.
        df = df[df['Scheme Name'].str.contains(scheme_keyword, na=False)]
    return df

def get_day_df_recursive(url, break_minutes=5):
    """Download one report from ``url``, retrying until it succeeds.

    On any failure (network error, parse error, missing column) the function
    sleeps for ``break_minutes`` minutes and tries again. Retries are
    unbounded, so a permanently failing URL blocks forever.

    Args:
        url: Address of a ';'-separated report (see ``generate_amfi_url``).
        break_minutes: Pause between attempts, in minutes.

    Returns:
        DataFrame of the report with rows lacking a 'Net Asset Value' removed.
    """
    # A loop replaces the original self-recursion: same retry behaviour
    # without growing the call stack on long outages.
    while True:
        try:
            # Context manager guarantees the HTTP response is closed.
            with urllib2.urlopen(url) as page:
                raw = page.read()
            # Explicit parser keeps the result independent of installed parsers.
            soup = BeautifulSoup(raw, "html.parser")
            df = pd.read_csv(StringIO(soup.get_text()), sep=';')
            df.dropna(subset=['Net Asset Value'], inplace=True)
            return df
        except Exception:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # stop the notebook. The original referenced an undefined `hgt`
            # here; the notebook's timer is `stopwatch`.
            # NOTE(review): this runs in worker threads — confirm that
            # hgutils.timer tolerates concurrent start()/stop() calls.
            stopwatch.start("Break Time")
            try:
                time.sleep(break_minutes*60)
            finally:
                stopwatch.stop()

def download_daily_amfi_data(date):
    """Fetch the report for ``date`` and store it as a parquet file.

    Every column is cast to ``str`` before writing. The file is named after
    the date (``%Y%b%d``) inside ``amfi_data/``, and the same date stamp is
    printed once the file has been written.
    """
    day_df = get_day_df_recursive(generate_amfi_url(date), break_minutes=5)
    stamp = date.strftime("%Y%b%d")
    day_df.astype(str).to_parquet("amfi_data/" + stamp + '.parquet')
    print(stamp)

In [3]:
# Set to True to (re-)download the whole history; left off so that
# "Run All" does not trigger a multi-hour scrape.
scraping = False

if scraping:
    start_date = dt.date(2009,1,1)
    end_date = dt.date(2022,8,24)
    stopwatch.start("read and parquet")
    failed_dates = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # submit() + as_completed() instead of an unconsumed executor.map():
        # map() only raises a worker's exception when its result is read,
        # so failed days used to vanish without a trace.
        future_to_date = {
            executor.submit(download_daily_amfi_data, day): day
            for day in date_range(start_date, end_date)
        }
        for future in concurrent.futures.as_completed(future_to_date):
            error = future.exception()
            if error is not None:
                failed_dates.append((future_to_date[future], error))
    stopwatch.stop(print=True,verbose=0)
    # Best-effort run: report the days that failed rather than aborting.
    for day, error in sorted(failed_dates, key=lambda item: item[0]):
        print(f"FAILED {day:%Y%b%d}: {error!r}")

2009Jan03
2009Jan04
2009Jan08
2009Jan10
2009Jan11
2009Jan17
2009Jan18
2009Jan07
2009Jan06
2009Jan05
2009Jan02
2009Jan12
2009Jan13
2009Jan16
2009Jan09
2009Jan24
2009Jan152009Jan01

2009Jan14
2009Jan25
2009Jan26
2009Jan19
2009Jan31
2009Jan20
2009Feb01
2009Jan22
2009Jan23
2009Jan21
2009Jan27
2009Feb07
2009Jan29
2009Jan28
2009Jan30
2009Feb03
2009Feb08
2009Feb02
2009Feb04
2009Feb14
2009Feb06
2009Feb05
2009Feb15
2009Feb09
2009Feb21
2009Feb22
2009Feb10
2009Feb23
2009Feb12
2009Feb11
2009Feb20
2009Feb16
2009Feb13
2009Feb28
2009Mar01
2009Feb192009Feb18

2009Feb17
2009Feb24
2009Feb25
2009Feb26
2009Mar07
2009Feb27
2009Mar08
2009Mar10
2009Mar02
2009Mar11
2009Mar14
2009Mar03
2009Mar15
2009Mar05
2009Mar06
2009Mar21
2009Mar09
2009Mar04
2009Mar22
2009Mar12
2009Mar13
2009Mar19
2009Mar16
2009Mar17
2009Mar28
2009Mar29
2009Mar20
2009Mar18
2009Apr03
2009Apr04
2009Mar24
2009Mar27
2009Mar23
2009Apr05
2009Apr01
2009Mar25
2009Mar26
2009Apr11
2009Apr10
2009Apr07
2009Mar30
2009Mar31
2009Apr12
2009Apr14
2009Apr02


In [21]:
# Load the scraped parquet data from the amfi_data/ directory into one frame.
# NOTE(review): `use_nullable_dtypes` is deprecated in newer pandas in favour
# of `dtype_backend` — confirm against the installed pandas version.
stopwatch.start("Reading parquet files")
df = pd.read_parquet('amfi_data/',use_nullable_dtypes=True, engine='pyarrow')
stopwatch.stop(print=True, verbose=0)

# Keep only rows whose scheme name contains both "Direct" and "Growth".
stopwatch.start("Filtering Direct and Growth options")
df = df[(df['Scheme Name'].str.contains("Direct"))&(df['Scheme Name'].str.contains("Growth"))]
stopwatch.stop(print=True)

# download_daily_amfi_data writes every column as str, so 'Date' has to be
# parsed back into datetimes here.
# NOTE(review): no explicit `format=` is given; confirm the stored date
# strings are parsed unambiguously.
stopwatch.start("Converting to Data types")
df['Date'] = pd.to_datetime(df['Date'])
# Numeric conversion of NAV is currently disabled, so 'Net Asset Value' is
# not converted to float in this cell:
# df['Net Asset Value'] = df['Net Asset Value'].astype(float)
stopwatch.stop()
# Print the accumulated timer report, then preview the first rows.
stopwatch.print(verbose=2)
df.head(3)

		 Reading parquet files - 2.77 minutes
		 Filtering Direct and Growth options - 2.46 minutes
 AMFI Data Scraper - 227.73 minutes*
	 Reading parquet files - 3.49 minutes
	 Converting to Data types - 224.1 minutes*
		 Reading parquet files - 4.23 minutes
		 Converting to Data types - 13.67 seconds
		 Saving to CSV - 10.44 seconds
		 Saving to CSV - 9.95 seconds
		 Saving to CSV - 11.28 seconds
		 Reading parquet files - 2.77 minutes
		 Filtering Direct and Growth options - 2.46 minutes
		 Converting to Data types - 1.82 seconds


Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date
915,103490,Quantum Long Term Equity Value Fund - Direct P...,INF082J01036,,9.4,9.02,9.4,2009-04-01
1156,111549,Quantum Tax Saving Fund - Direct Plan Growth O...,INF082J01069,,10.06,10.06,10.06,2009-04-01
1358,103734,Quantum Liquid Fund - Direct Plan Growth Option,INF082J01127,,12.5482,12.5482,12.5482,2009-04-01


In [22]:
stopwatch.start("Saving to CSV")
# First and last 'Date' seen for every scheme name, written to schemes.csv.
scheme_date_ranges = (
    df[['Scheme Name', 'Date']]
    .groupby('Scheme Name', as_index=False)
    .agg(['min', 'max'])
    .reset_index()
)
scheme_date_ranges.to_csv("schemes.csv", index=False)
stopwatch.stop(print=True)

		 Saving to CSV - 11.43 seconds


In [24]:
# Show the last rows of df as a quick sanity check of the loaded data.
df.tail()

Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date
7928,118598,Nippon India Monthly Interval Fund - Series I ...,INF204K01ZA5,-,28.1239,,,2022-05-31
7932,118595,Nippon India Monthly Interval Fund - Series II...,INF204K01ZD9,-,28.1624,,,2022-05-31
7936,118691,Nippon India Quarterly Interval Fund - Series ...,INF204K01F61,-,29.7334,,,2022-05-31
7938,118623,Nippon India Interval Fund - Quarterly Plan - ...,INF204K01XT0,-,28.5751,,,2022-05-31
7945,118583,Nippon India Quarterly Interval Fund - Series ...,INF204K01XW4,-,17.4242,,,2022-05-31
