In [1]:
# Imports plus notebook-wide configuration: timer, plotly renderer and
# warning filter.
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as urllib2
from io import StringIO
import datetime as dt
import hgutils
import time
# Notebook-wide timer; later cells call start()/stop()/print() on it.
stopwatch = hgutils.timer("AMFI Data Scraper")
import threading
import concurrent.futures
import fastparquet
import pyarrow
import numpy as np
import plotly.express as px
import plotly.io as pio
# Bare expression that is not the last statement of the cell, so nothing is
# displayed for it.
pio.renderers
# Make "notebook_connected" the default plotly renderer.
pio.renderers.default = "notebook_connected"

import warnings
# Silences every warning for the rest of the session, which also hides
# deprecation and SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

In [2]:
def date_range(start_date,end_date,frequency=1):
    """Yield dates from ``start_date`` up to and including ``end_date``.

    Parameters
    ----------
    start_date : datetime.date or datetime.datetime
        First value produced. It is always yielded, even when it is later
        than ``end_date``.
    end_date : same type as ``start_date``
        Inclusive upper bound for every value after the first.
    frequency : int or float, default 1
        Step between consecutive values, in days. Must be positive.

    Yields
    ------
    ``start_date``, then ``start_date`` advanced by ``frequency`` days at a
    time for as long as the value does not exceed ``end_date``.

    Raises
    ------
    ValueError
        If ``frequency`` is zero or negative. Because this is a generator,
        the error surfaces when iteration starts, not at call time.
    """
    # A non-positive step never moves past end_date, so the loop below would
    # never terminate; fail loudly instead.
    if frequency <= 0:
        raise ValueError("frequency must be a positive number of days")

    step = dt.timedelta(days=frequency)
    current_date = start_date
    yield current_date
    current_date = current_date + step
    while current_date<=end_date:
        yield current_date
        current_date = current_date + step

def generate_amfi_url (date,base_url = 'https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?frmdt='):
    """Build the NAV-history download URL for a single day.

    ``date`` is rendered as day-month abbreviation-year (for example
    ``24-Aug-2022``) and appended to ``base_url``.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Day to request.
    base_url : str
        URL prefix that ends right before the date value.

    Returns
    -------
    str
        ``base_url`` followed by the formatted date.
    """
    return base_url + date.strftime("%d-%b-%Y")

def get_day_df_recursive (url, break_minutes = 5, selected_schemes = None):
    """Download and tidy one day's NAV report, retrying until it succeeds.

    The response body at ``url`` is parsed as ``;``-separated text. Two
    helper columns are derived from 'Scheme Code' and forward-filled onto
    the rows below them:

    * 'Scheme' - taken from rows whose 'Scheme Code' contains "Schemes";
      the part before "(" is kept as 'Scheme' and the part inside the
      parentheses becomes 'Scheme Type'.
    * 'AMC' - taken from rows that have no 'Net Asset Value'.

    Rows without a 'Scheme Name' are then dropped.

    Parameters
    ----------
    url : str
        Address of the report to download.
    break_minutes : int or float, default 5
        Minutes to sleep before retrying after a failure.
    selected_schemes : list-like or None, default None
        When given, only rows whose 'Scheme Name' is in this collection
        are returned.

    Returns
    -------
    pandas.DataFrame
        The tidied (and optionally filtered) report.

    Notes
    -----
    Any ``Exception`` raised while downloading or parsing leads to a sleep
    followed by a retry with the same arguments; there is no retry limit.
    ``KeyboardInterrupt`` and ``SystemExit`` are not caught, so the retry
    loop can be interrupted.
    """
    try:
        # Close the HTTP response as soon as its body has been read.
        with urllib2.urlopen(url) as page:
            # No parser is named, so BeautifulSoup chooses one itself.
            soup = BeautifulSoup(page.read())
        df = pd.read_csv(StringIO(soup.get_text()),sep=';')

        df['Scheme'] = df['Scheme Code']
        df['AMC'] = df['Scheme Code']

        # na=False keeps the mask boolean even if 'Scheme Code' has missing
        # cells; without it `~` raises on NaN.
        df.loc[~df['Scheme Code'].str.contains("Schemes",na=False),"Scheme"] = None
        df.loc[~df['Net Asset Value'].isnull(),"AMC"] = None

        # Carry each header value down onto the rows that follow it.
        df[['Scheme','AMC']] = df[['Scheme','AMC']].ffill()

        # Split once and reuse both halves.
        scheme_parts = df['Scheme'].str.split(pat="(",expand=True)
        # regex=False: ")" is meant literally, independent of the pandas
        # version's default for `regex`.
        df['Scheme Type'] = scheme_parts[1].str.replace(")","",regex=False).str.strip()
        df['Scheme'] = scheme_parts[0].str.strip()

        df.dropna(subset=['Scheme Name'],inplace=True)

        if selected_schemes is not None:
            df = df[df['Scheme Name'].isin(selected_schemes)]

    except Exception:
        stopwatch.start("Break Time")
        time.sleep(break_minutes*60)
        # Pass selected_schemes along so a retried download is filtered
        # exactly like a first-time success.
        df = get_day_df_recursive(url,break_minutes,selected_schemes)
        stopwatch.stop()
    return df

def download_daily_amfi_data(date,selected_schemes=None):
    """Download one day's NAV report and store it as a parquet file.

    The file is written to ``amfi_data_hdfc/<YYYYMonDD>.parquet`` with every
    column converted to ``str``. On the first day of a month the date stamp
    is printed as a progress marker.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Day to download.
    selected_schemes : list-like or None, default None
        Scheme names to keep. When omitted, the notebook-level
        ``required_schemes`` list is used, which is what callers that pass
        only ``date`` got before.
    """
    import os  # local import: only needed to create the output directory

    if selected_schemes is None:
        # Fall back to the notebook-level list so existing single-argument
        # calls keep their behaviour.
        selected_schemes = required_schemes

    df = get_day_df_recursive(generate_amfi_url(date),break_minutes=5,selected_schemes=selected_schemes)

    # to_parquet does not create missing directories.
    os.makedirs("amfi_data_hdfc",exist_ok=True)
    df.astype(str).to_parquet("amfi_data_hdfc/"+date.strftime("%Y%b%d")+'.parquet')
    if date.day == 1:
        print(date.strftime("%Y%b%d"))

In [3]:
# my_date = dt.date(2022,8,24)

# scheme = pd.read_excel("schemes.xlsx",sheet_name='Analysis')
# required_schemes = list(scheme['Scheme Name'].unique())

# df = get_day_df_recursive(generate_amfi_url(my_date),break_minutes=5, selected_schemes=required_schemes)
# df.head(2)

In [4]:
# Set to True to download the daily reports again; when False this cell only
# loads the list of required schemes.
scraping = False

scheme = pd.read_excel("schemes.xlsx",sheet_name='Analysis')
required_schemes = list(scheme['Scheme Name'].unique())

if scraping:
    start_date = dt.date(2010,8,1)
    end_date = dt.date(2022,10,8)
    stopwatch.start("read and parquet")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Executor.map only re-raises a worker's exception when its result is
        # retrieved, so drain the iterator to surface failed downloads
        # instead of silently losing them.
        list(executor.map(download_daily_amfi_data,date_range(start_date,end_date)))
    stopwatch.stop(print=True,verbose=0)

In [5]:
# Load every stored daily file into one frame.
# NOTE(review): this reads 'amfi_data/' while download_daily_amfi_data writes
# to 'amfi_data_hdfc/' - confirm which directory is intended.
stopwatch.start("Reading parquet files")
df = pd.read_parquet('amfi_data/',use_nullable_dtypes=True, engine='pyarrow')
stopwatch.stop(print=True, verbose=0)

# Keep only the schemes listed on the 'Analysis' sheet.
stopwatch.start("Only required schemes")
scheme = pd.read_excel("schemes.xlsx",sheet_name='Analysis')
required_schemes = list(scheme['Scheme Name'].unique())
# .copy() makes the filtered frame independent, so the column assignment
# below is not flagged as writing to a slice of the unfiltered frame.
df = df[df['Scheme Name'].isin(required_schemes)].copy()
stopwatch.stop()

stopwatch.start("Converting to Data types")
df['Date'] = pd.to_datetime(df['Date'])
# df['Net Asset Value'] = df['Net Asset Value'].astype(float)
stopwatch.stop()

stopwatch.print(verbose=2)
df.head(3)

	 Reading parquet files - 1.07 minutes
 AMFI Data Scraper - 2.5 minutes*
	 Reading parquet files - 1.07 minutes
	 Only required schemes - 56.87 seconds
	 Converting to Data types - 0.89 seconds


Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date,Scheme,AMC,Scheme Type
3330,113049,HDFC Gold Exchange Traded Fund. - Growth Option,INF179KC1981,,1868.9673,1868.9673,1868.9673,2010-08-16,Open Ended Schemes,HDFC Mutual Fund,Other Scheme - Gold ETF
3330,113049,HDFC Gold Exchange Traded Fund. - Growth Option,INF179KC1981,,1871.3277,1871.3277,1871.3277,2010-08-17,Open Ended Schemes,HDFC Mutual Fund,Other Scheme - Gold ETF
3337,113049,HDFC Gold Exchange Traded Fund. - Growth Option,INF179KC1981,,1864.9274,1864.9274,1864.9274,2010-08-18,Open Ended Schemes,HDFC Mutual Fund,Other Scheme - Gold ETF


In [None]:
# Build one (Date, Scheme Name) row for every scheme and every day of the
# window, then keep the pairs that have no row in `df`.
# NOTE(review): the window is fixed at 100 days starting 2022-01-01. The
# previous version also computed each scheme's first/last date in `df` but
# never used them - confirm whether a per-scheme range was intended.
expected_days = pd.date_range(dt.datetime(2022,1,1), periods=100)

# Collect the per-scheme frames and concatenate once, instead of growing a
# frame with pd.concat on every iteration.
all_schemes_dates = pd.concat([
    pd.DataFrame({'Date': expected_days, 'Scheme Name': scheme_name})
    for scheme_name in required_schemes
])

# The index is written to the CSV as well; it comes back as 'Unnamed: 0'
# when the file is read again.
(all_schemes_dates
.merge(df,on=['Date','Scheme Name'],
    indicator=True,
    how='left')
.query('''_merge=="left_only"''')
[['Date','Scheme Name']]
.drop_duplicates()
.to_csv("Missing Dates.csv"))

In [13]:
# Reload the missing (Date, Scheme Name) pairs, discard the saved index
# column, parse the dates and add the day of week (Monday=0 ... Sunday=6).
df = (
    pd.read_csv("Missing Dates.csv")
    .drop(columns='Unnamed: 0')
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        dow=lambda frame: frame['Date'].dt.dayofweek,
    )
)
df.head(2)

Unnamed: 0,Date,Scheme Name
0,2022-01-01,HDFC Banking and PSU Debt Fund - Growth Option...
1,2022-01-02,HDFC Banking and PSU Debt Fund - Growth Option...
