In [21]:
# --- Imports and notebook-wide setup -------------------------------------
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as urllib2  # Python 3 urllib.request bound to the legacy "urllib2" name
from io import StringIO
import datetime as dt
import hgutils
import time
# Notebook-wide timer object; later cells call .start()/.stop()/.print() on it.
# NOTE(review): presumably the top-level "AMFI Data Scraper" timing begins here,
# i.e. before the remaining imports run - confirm against hgutils.
stopwatch = hgutils.timer("AMFI Data Scraper")
import threading
import concurrent.futures
import fastparquet
import pyarrow
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers  # bare expression; not the last line of the cell, so nothing is displayed
pio.renderers.default = "notebook_connected"  # default renderer used for plotly figures

import warnings
# Silences every warning category for the rest of the session, which also
# hides library deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

In [22]:
def date_range(start_date,end_date,frequency=1):
    """Yield dates from ``start_date`` up to and including ``end_date``.

    Parameters
    ----------
    start_date : datetime.date or datetime.datetime
        First value produced. It is always yielded, even when it is later
        than ``end_date``.
    end_date : same type as ``start_date``
        Inclusive upper bound for every value after the first one.
    frequency : int or float, default 1
        Step between consecutive values, in days. Must be positive.

    Yields
    ------
    ``start_date`` first, then the value advanced by ``frequency`` days at a
    time for as long as it is ``<= end_date``.

    Raises
    ------
    ValueError
        If ``frequency`` is zero or negative. Because this is a generator,
        the error surfaces when iteration starts, not when the function is
        called.
    """
    # A non-positive step never moves past end_date, so the loop below would
    # run forever; fail loudly instead.
    if frequency <= 0:
        raise ValueError("frequency must be a positive number of days")

    step = dt.timedelta(days=frequency)
    current_date = start_date
    yield current_date
    current_date = current_date + step
    while current_date <= end_date:
        yield current_date
        current_date = current_date + step

def generate_amfi_url(date, base_url='https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?frmdt='):
    """Build the NAV-history report URL for a single day.

    ``date`` is rendered as ``%d-%b-%Y`` (for example ``05-Jan-2020``) and
    appended to ``base_url``; the resulting string is returned.
    """
    return base_url + date.strftime("%d-%b-%Y")

def get_day_df_recursive(url, break_minutes=5, selected_schemes=None):
    """Download one semicolon-separated NAV report and return it as a DataFrame.

    The page at ``url`` is fetched, its text is parsed with ``sep=';'`` and
    three columns are derived from the non-data lines of the report:
    ``Scheme``, ``Scheme Type`` and ``AMC``.

    Parameters
    ----------
    url : str
        Address of the report to download.
    break_minutes : int or float, default 5
        Minutes to sleep before retrying after any failure.
    selected_schemes : collection of str, optional
        When given, only rows whose ``Scheme Name`` is in this collection
        are kept. ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        Rows that have a ``Scheme Name``, with the added ``Scheme``, ``AMC``
        and ``Scheme Type`` columns.

    Notes
    -----
    Any ``Exception`` (network or parsing) triggers a sleep followed by a
    recursive retry with the same arguments; there is no retry limit.
    ``KeyboardInterrupt``/``SystemExit`` are deliberately not caught so the
    wait can be aborted.
    """
    try:
        # Close the HTTP response as soon as the payload has been read.
        with urllib2.urlopen(url) as page:
            soup = BeautifulSoup(page.read())
        df = pd.read_csv(StringIO(soup.get_text()),sep=';')

        # Non-data lines of the report end up in 'Scheme Code'; start from a
        # copy of that column and blank out everything that is not a heading.
        df['Scheme'] = df['Scheme Code']
        df['AMC'] = df['Scheme Code']

        # Keep only the lines containing "Schemes" as scheme headings.
        # na=False treats missing codes as "not a heading" instead of
        # failing on the negation.
        df.loc[~df['Scheme Code'].str.contains("Schemes", na=False),"Scheme"] = None
        # Rows with a NAV are data rows, not headings.
        # NOTE(review): presumably the remaining lines are fund-house names - confirm.
        df.loc[~df['Net Asset Value'].isnull(),"AMC"] = None

        # Propagate each heading down to the data rows beneath it.
        df[['Scheme','AMC']] = df[['Scheme','AMC']].ffill()

        # "<scheme> ( <type> )" -> separate scheme and type columns.
        scheme_parts = df['Scheme'].str.split(pat="(",expand=True)
        df['Scheme Type'] = scheme_parts[1].str.replace(")","",regex=False).str.strip()
        df['Scheme'] = scheme_parts[0].str.strip()

        # Heading lines have no scheme name; drop them now that they have
        # been propagated.
        df = df.dropna(subset=['Scheme Name'])

        if selected_schemes is not None:
            df = df[df['Scheme Name'].isin(selected_schemes)]

    except Exception:
        stopwatch.start("Break Time")
        try:
            time.sleep(break_minutes*60)
            # Forward selected_schemes so a retried download is filtered the
            # same way as a first-attempt download.
            df = get_day_df_recursive(url,break_minutes,selected_schemes)
        finally:
            stopwatch.stop()
    return df

def download_daily_amfi_data(date,selected_schemes=None):
    """Download the report for ``date`` and store it as a parquet file.

    Parameters
    ----------
    date : datetime.date or datetime.datetime
        Day to download.
    selected_schemes : collection of str, optional
        Scheme names to keep. When omitted, the notebook-level
        ``required_schemes`` variable is used if it exists (this is what the
        function always used before the parameter was honoured); if that is
        not defined either, no filtering is applied.

    Side effects
    ------------
    Writes ``amfi_data_hdfc/<YYYYMonDD>.parquet`` with every column cast to
    ``str`` and prints the date label on the first day of each month.
    """
    if selected_schemes is None:
        # Backward compatibility for one-argument callers that rely on the
        # notebook-level list.
        selected_schemes = globals().get('required_schemes')

    df = get_day_df_recursive(generate_amfi_url(date),break_minutes=5,selected_schemes=selected_schemes)
    date_label = date.strftime("%Y%b%d")
    df.astype(str).to_parquet("amfi_data_hdfc/"+date_label+'.parquet')
    # Lightweight progress marker: one line per month.
    if date.day == 1:
        print(date_label)

In [23]:
stopwatch.start("Reading parquet files")
# Reads every parquet file under amfi_data/ into one frame.
# NOTE(review): download_daily_amfi_data writes to "amfi_data_hdfc/", not
# "amfi_data/" - confirm this cell is meant to read a different dataset.
df = pd.read_parquet('amfi_data/',use_nullable_dtypes=True, engine='pyarrow')
stopwatch.stop(print=True, verbose=0)

stopwatch.start("Converting to Data types")
df['Date'] = pd.to_datetime(df['Date'])
# "N.A." marks a missing NAV. Replace it with an explicit NaN rather than
# None: on pandas releases before 1.4 an explicit value=None is ignored and
# the call forward-fills, copying the previous row's NAV into the gap.
# Thousands separators are stripped before the float conversion.
df['Net Asset Value'] = (df['Net Asset Value']
        .replace("N.A.",np.nan)
        .replace(",","",regex=True)
        .astype(float))
df['Scheme Code'] = df['Scheme Code'].astype(str)
stopwatch.stop()

df.head(3)

	 Reading parquet files - 2.8 minutes


Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date,Scheme,AMC,Scheme Type
2,110147,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BG4,,10.032,10.032,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income
3,110146,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BH2,,10.2865,10.2865,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income
4,110148,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BI0,,10.0265,10.0265,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income


In [24]:
stopwatch.start("Preparing Date summary")
# NOTE: the per-fund first/last date export below is commented out, so this
# cell currently only opens and closes the "Preparing Date summary" timer
# section without doing any work.
# df['startDate'] = df['Date']
# df['endDate'] = df['Date']
# (df
# .groupby(['Scheme Code','Scheme Name'],as_index=False)
# .agg({'startDate':'min','endDate':'max'})
# .to_csv("all_funds_dates.csv",index=False))

stopwatch.stop()

In [25]:
# Builds fund_summary.csv: per fund, the first/last date and the largest
# day gap and NAV percentage jump between consecutive records.
# NOTE: this cell overwrites `df` with a filtered frame, so running it a
# second time trims one more first/last record from every fund.
stopwatch.start("Preparing Fund Summary")

stopwatch.start("Sorting Values")
# Consecutive rows must belong to the same scheme code in date order for the
# shift-based deltas below to be meaningful.
df.sort_values(by=['Scheme Code','Date'],inplace=True)
stopwatch.stop()

stopwatch.start("Preparing Additional Columns")
# First and last date on record for each (code, name) pair, broadcast back
# to every row of that pair.
df['startDate'] = df.groupby(['Scheme Code','Scheme Name'])['Date'].transform('min')
df['endDate'] = df.groupby(['Scheme Code','Scheme Name'])['Date'].transform('max')

# Pair every row with the row that follows it in the sorted frame. The shift
# is not grouped, so the final row of one scheme code is paired with the
# first row of the next code; for rows with a valid Date those pairs sit on a
# fund's endDate and are removed by the filter below.
df['next_date'] = df['Date'].shift(periods=-1)
df['day_delta'] = (df['next_date'] - df['Date']).dt.days

df['next_NAV'] = df['Net Asset Value'].shift(periods=-1)
df['NAV_delta'] = df['next_NAV'] - df['Net Asset Value']
# A missing NAV on either side is reported as "no change".
df['NAV_pct_delta'] = (df['NAV_delta'] / df['Net Asset Value']).fillna(0)
stopwatch.stop()

stopwatch.start("Filtering first and last record")
df = df[(df['Date']!=df['startDate'])&(df['Date']!=df['endDate'])]
stopwatch.stop()

stopwatch.start("Preparing Grouped df")
(df
.groupby(['AMC','Scheme Type','Scheme Code','Scheme Name'],as_index=False)
.agg({'startDate':'min','endDate':'max','day_delta':'max','NAV_pct_delta':'max'})
.to_csv("fund_summary.csv",index=False))
stopwatch.stop()

stopwatch.stop()

stopwatch.print(verbose=2)

 AMFI Data Scraper - 6.89 minutes*
	 Reading parquet files - 2.8 minutes
	 Converting to Data types - 33.96 seconds
	 Preparing Date summary - 0.03 milliseconds
	 Preparing Fund Summary - 3.52 minutes
		 Sorting Values - 1.17 minutes
		 Preparing Additional Columns - 25.66 seconds
		 Filtering first and last record - 1.45 minutes
		 Preparing Grouped df - 28.3 seconds
