In [1]:
# Imports and notebook-wide configuration for the AMFI NAV scraper.
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as urllib2
from io import StringIO
import datetime as dt
import hgutils
import time
# Notebook-wide timer shared by every cell below (its start/stop/print methods are used later).
# NOTE(review): hgutils is not shown here; presumably a project-local helper — confirm its API.
stopwatch = hgutils.timer("AMFI Data Scraper")
import threading
import concurrent.futures
# Parquet engines; 'pyarrow' is selected by name in the read_parquet call further down.
import fastparquet
import pyarrow
import numpy as np
import plotly.express as px
import plotly.io as pio
# Bare attribute access: as a non-final expression in the cell it displays nothing.
pio.renderers
pio.renderers.default = "notebook_connected"

import warnings
# Silences every warning for the rest of the session (including library deprecation warnings).
warnings.filterwarnings('ignore')

In [2]:
def date_range(start_date,end_date,frequency=1):
    """Yield dates from ``start_date`` to ``end_date`` (inclusive), ``frequency`` days apart.

    Parameters
    ----------
    start_date, end_date : datetime.date or datetime.datetime
        Bounds of the range; both ends are inclusive.
    frequency : int or float, default 1
        Step between consecutive dates, in days. Must be positive.

    Yields
    ------
    ``start_date``, ``start_date + frequency`` days, ... for as long as the
    value is ``<= end_date``. Nothing is yielded when ``start_date > end_date``.

    Raises
    ------
    ValueError
        If ``frequency`` is not positive: such a step can never move past
        ``end_date`` and would otherwise loop without terminating. Because
        this is a generator, the error surfaces when iteration starts.
    """
    if frequency <= 0:
        raise ValueError(f"frequency must be a positive number of days, got {frequency!r}")

    step = dt.timedelta(days=frequency)
    current_date = start_date
    # A single guarded loop: the first date is subject to the same bound check
    # as every later one, so an inverted range yields nothing.
    while current_date <= end_date:
        yield current_date
        current_date = current_date + step

def generate_amfi_url(date, base_url='https://portal.amfiindia.com/DownloadNAVHistoryReport_Po.aspx?frmdt='):
    """Build the download URL for one day by appending the formatted date to ``base_url``.

    ``date`` is rendered as ``%d-%b-%Y`` (e.g. ``24-Aug-2022``); ``%b`` is the
    locale's abbreviated month name.
    """
    formatted_day = format(date, "%d-%b-%Y")
    return base_url + formatted_day

def _parse_amfi_nav_text(text):
    """Parse the ';'-separated report text into a DataFrame holding only NAV rows."""
    df = pd.read_csv(StringIO(text), sep=';')

    # Header lines of the report end up in 'Scheme Code'. Start from a copy of
    # that column, blank out everything that is not a header of the wanted
    # kind, then forward-fill so every NAV row inherits the header above it.
    df['Scheme'] = df['Scheme Code']
    df['AMC'] = df['Scheme Code']

    # Only rows whose text contains "Schemes" count as scheme headers.
    df.loc[~df['Scheme Code'].str.contains("Schemes"), "Scheme"] = None
    # Rows that carry a NAV are data rows, never AMC headers.
    df.loc[~df['Net Asset Value'].isnull(), "AMC"] = None

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    df[['Scheme', 'AMC']] = df[['Scheme', 'AMC']].ffill()

    # "<scheme> (<type>)" -> separate columns. Split once and reuse both halves;
    # the type must be read before 'Scheme' is overwritten.
    scheme_parts = df['Scheme'].str.split(pat="(", expand=True)
    # regex=False: ")" is a literal character, independent of the pandas default.
    df['Scheme Type'] = scheme_parts[1].str.replace(")", "", regex=False).str.strip()
    df['Scheme'] = scheme_parts[0].str.strip()

    # Drop header/blank rows: only rows with an actual NAV are returned.
    return df.dropna(subset=['Net Asset Value'])


def get_day_df_recursive (url, break_minutes = 5, max_retries = None):
    """Download one NAV report from ``url`` and return it as a DataFrame, retrying on failure.

    Despite the (kept) name, retries are performed in a loop rather than by
    recursion, so a long outage cannot exhaust the call stack.

    Parameters
    ----------
    url : str
        Report URL to fetch.
    break_minutes : int or float, default 5
        Minutes to sleep after a failed attempt before trying again.
    max_retries : int or None, default None
        Maximum number of retries after the first attempt. ``None`` retries
        indefinitely.

    Returns
    -------
    pandas.DataFrame
        The report columns plus 'Scheme', 'AMC' and 'Scheme Type', restricted
        to rows with a non-null 'Net Asset Value'.

    Raises
    ------
    Exception
        The last download/parse error, once ``max_retries`` is exhausted.
        ``KeyboardInterrupt``/``SystemExit`` are never swallowed.
    """
    failures = 0
    while True:
        try:
            # Context manager closes the HTTP response even if read() fails.
            with urllib2.urlopen(url) as page:
                raw = page.read()
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(raw, "html.parser")
            return _parse_amfi_nav_text(soup.get_text())
        except Exception as exc:
            # Deliberately broad: an unparsable response is retried just like
            # a network error. Unlike a bare `except:`, Ctrl-C still works.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            print(f"Failed to fetch/parse {url}: {exc!r}; retrying in {break_minutes} minute(s)")

        # The timer now covers only the pause itself, not the retry after it.
        stopwatch.start("Break Time")
        try:
            time.sleep(break_minutes*60)
        finally:
            stopwatch.stop()

def download_daily_amfi_data(date):
    """Fetch the NAV report for ``date`` and store it as ``amfi_data/<YYYYMonDD>.parquet``.

    Every column is cast to ``str`` before writing. The output directory is
    created if it does not exist yet. The file stamp is printed once the file
    has been written.
    """
    import os  # local import so this block does not depend on the import cell

    df = get_day_df_recursive(generate_amfi_url(date),break_minutes=5)

    # to_parquet does not create missing parent directories; exist_ok keeps
    # this safe when several worker threads get here at the same time.
    os.makedirs("amfi_data", exist_ok=True)

    stamp = date.strftime("%Y%b%d")
    df.astype(str).to_parquet("amfi_data/"+stamp+'.parquet')
    print(stamp)

In [3]:
# Fetch a single day's report over the network and preview its first rows.
my_date = dt.date(year=2022, month=8, day=24)
df = get_day_df_recursive(
    url=generate_amfi_url(my_date),
    break_minutes=5,
)
df.head()

Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date,Scheme,AMC,Scheme Type
2,139619,Taurus Investor Education Pool - Unclaimed Div...,,,10.0,,,24-Aug-2022,Open Ended Schemes,Taurus Mutual Fund,Money Market
3,139618,Taurus Investor Education Pool - Unclaimed Red...,,,10.0,,,24-Aug-2022,Open Ended Schemes,Taurus Mutual Fund,Money Market
4,139616,Taurus Unclaimed Dividend - Growth,,,14.5066,,,24-Aug-2022,Open Ended Schemes,Taurus Mutual Fund,Money Market
5,139617,Taurus Unclaimed Redemption - Growth,,,14.5079,,,24-Aug-2022,Open Ended Schemes,Taurus Mutual Fund,Money Market
8,148921,Aditya Birla Sun Life Multi-Cap Fund-Direct Gr...,INF209KB1Y49,,12.5,,,24-Aug-2022,Open Ended Schemes,Aditya Birla Sun Life Mutual Fund,Equity Scheme - Multi Cap Fund


In [4]:
# Flip to True to (re)download every daily report; left off so a normal
# run of the notebook does not hit the network.
scraping = False

if scraping:
    start_date = dt.date(2009,1,1)
    end_date = dt.date(2022,10,8)
    stopwatch.start("read and parquet")
    try:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # executor.map only re-raises a worker's exception when its result
            # is retrieved from the returned iterator. Consuming it with list()
            # surfaces failures (e.g. a parquet write error) instead of
            # silently dropping that day's file.
            list(executor.map(download_daily_amfi_data,date_range(start_date,end_date)))
    finally:
        # Stop the timer even when a worker failed.
        stopwatch.stop(print=True,verbose=0)

In [5]:
# Load every parquet file under amfi_data/ into one frame. This rebinds `df`,
# which the single-day preview cell above also assigned.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas 2.x in favour of
# `dtype_backend` — confirm the pandas version this notebook targets.
stopwatch.start("Reading parquet files")
df = pd.read_parquet('amfi_data/',use_nullable_dtypes=True, engine='pyarrow')
stopwatch.stop(print=True, verbose=0)

# Disabled: optional filter keeping only scheme names that contain both
# "Direct" and "Growth".
# stopwatch.start("Filtering Direct and Growth options")
# df = df[(df['Scheme Name'].str.contains("Direct"))&(df['Scheme Name'].str.contains("Growth"))]
# stopwatch.stop(print=True)

stopwatch.start("Converting to Data types")
# The files were written with astype(str), so 'Date' is parsed back from text
# here; no explicit format is passed, so pandas infers it.
df['Date'] = pd.to_datetime(df['Date'])
# Disabled: 'Net Asset Value' is left as loaded rather than cast to float.
# df['Net Asset Value'] = df['Net Asset Value'].astype(float)
stopwatch.stop()
stopwatch.print(verbose=2)
df.head(3)

	 Reading parquet files - 1.02 minutes
 AMFI Data Scraper - 3.49 minutes*
	 Reading parquet files - 1.02 minutes
	 Converting to Data types - 6.12 seconds


Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date,Scheme,AMC,Scheme Type
2,110147,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BG4,,10.032,10.032,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income
3,110146,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BH2,,10.2865,10.2865,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income
4,110148,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BI0,,10.0265,10.0265,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income


In [20]:
# Write one row per (AMC, Scheme Name, Scheme, Scheme Type) with the earliest
# and latest 'Date' seen for it to schemes.csv.
# NOTE(review): this cell reads `adf`, which is only assigned by the merge cell
# that comes AFTER it in the file (execution counts In [20] vs In [16] are out
# of order). On a fresh kernel with Run All this raises NameError — move this
# cell below the merge cell.
stopwatch.start("Saving to CSV")
groupby_cols = ['AMC','Scheme Name','Scheme','Scheme Type']
# Disabled: earlier variant that grouped the unfiltered `df` by 'Scheme Name' only.
# df[['Scheme Name','Date']].groupby('Scheme Name', as_index=False).agg(['min', 'max']).reset_index().head(3)
adf[groupby_cols+['Date']].groupby(groupby_cols, as_index=False).agg(['min', 'max']).reset_index().to_csv("schemes.csv",index=False)
stopwatch.stop(print=True)

	 Saving to CSV - 17.3 seconds


In [16]:
# Inner-join the NAV rows with the (Scheme, Scheme Type) pairs flagged
# keep_flag == 1 in the lookup CSV, keeping only the flagged pairs.
keep_scheme_type_df = pd.read_csv("Scheme Types.csv")
adf = pd.merge(
    df,
    keep_scheme_type_df.loc[keep_scheme_type_df["keep_flag"] == 1],
    on=['Scheme', 'Scheme Type'],
)
adf.head()

Unnamed: 0,Scheme Code,Scheme Name,ISIN Div Payout/ISIN Growth,ISIN Div Reinvestment,Net Asset Value,Repurchase Price,Sale Price,Date,Scheme,AMC,Scheme Type,keep_flag
0,110147,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BG4,,10.032,10.032,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income,1
1,110146,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BH2,,10.2865,10.2865,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income,1
2,110148,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BI0,,10.0265,10.0265,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income,1
3,110145,Canara Robeco Interval Scheme Series2 (Quarter...,INF760K01BJ8,,10.4495,10.4495,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income,1
4,109931,Canara Robeco Interval Scheme- Series2 (Quarte...,,,10.0688,10.0688,0.0,2009-04-01,Open Ended Schemes,Canara Robeco Mutual Fund,Income,1


In [18]:
# Scratch check: appending a single item by building a new list leaves `a` untouched.
a, k = ['a', 'b'], 'h'
[*a, k]

['a', 'b', 'h']