In [10]:
import pandas as pd
import numpy as np

In [11]:
def preprocess_nav_data(df):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['NAV'] > 0]                      # remove invalid NAVs
    df = df.sort_values(['Scheme_Code', 'Date'])
    df = df.drop_duplicates(subset=['Scheme_Code', 'Date'])
    return df
def calculate_cagr(nav_series, years):
    """
    nav_series: pandas Series with Date as index and NAV as values
    years: number of years for CAGR
    """
    end_date = nav_series.index.max()
    start_date = end_date - pd.DateOffset(years=years)
    nav_series = nav_series[nav_series.index >= start_date]

    if len(nav_series) < 2:
        return np.nan

    nav_start = nav_series.iloc[0]
    nav_end = nav_series.iloc[-1]

    cagr = (nav_end / nav_start) ** (1 / years) - 1
    return cagr
def calculate_annualized_volatility(nav_series):
    returns = np.log(nav_series / nav_series.shift(1)).dropna()
    volatility = returns.std() * np.sqrt(252)
    return volatility
def calculate_sharpe_ratio(nav_series, risk_free_rate=0.06):
    returns = np.log(nav_series / nav_series.shift(1)).dropna()

    annual_return = returns.mean() * 252
    annual_volatility = returns.std() * np.sqrt(252)

    if annual_volatility == 0:
        return np.nan

    sharpe = (annual_return - risk_free_rate) / annual_volatility
    return sharpe
def calculate_max_drawdown(nav_series):
    rolling_max = nav_series.cummax()
    drawdown = (nav_series - rolling_max) / rolling_max
    max_drawdown = drawdown.min()
    return max_drawdown
def compute_fund_features(df):
    results = []

    for scheme, group in df.groupby('Scheme_Code'):
        nav_series = group.set_index('Date')['NAV']

        features = {
            'Scheme_Code': scheme,
            'CAGR_1Y': calculate_cagr(nav_series, 1),
            'CAGR_3Y': calculate_cagr(nav_series, 3),
            'CAGR_5Y': calculate_cagr(nav_series, 5),
            'Volatility': calculate_annualized_volatility(nav_series),
            'Sharpe_Ratio': calculate_sharpe_ratio(nav_series),
            'Max_Drawdown': calculate_max_drawdown(nav_series)
        }

        results.append(features)

    return pd.DataFrame(results)


In [12]:
# Load data
df_nav = pd.read_parquet("mutual_fund_nav_history.parquet")
df_nav = df_nav.reset_index()

# Preprocess
df_nav = preprocess_nav_data(df_nav)

# Compute features
fund_features_df = compute_fund_features(df_nav)

print(fund_features_df.head())

   Scheme_Code   CAGR_1Y   CAGR_3Y   CAGR_5Y  Volatility  Sharpe_Ratio  \
0       100033  0.029283  0.153772  0.128393    0.211249      0.228469   
1       100034 -0.033412  0.077517  0.057694    0.230014     -0.152078   
2       100037 -0.009447  0.000639 -0.007687    0.052591     -0.879165   
3       100038  0.052638  0.067249  0.052759    0.039747      0.466872   
4       100041  0.000000  0.000000 -0.000016    0.467013      0.074344   

   Max_Drawdown  
0     -0.672374  
1     -0.688546  
2     -0.140251  
3     -0.103922  
4     -0.002945  


calculated the features from raw time series data mutual fund nav history.

In [13]:
df_scheme = pd.read_csv("mutual_fund_data.csv")


In [14]:
df_scheme = df_scheme[
    [
        "Scheme_Code",
        "Scheme_Name",
        "AMC",
        "Scheme_Category",
        "Scheme_Min_Amt",
        "Launch_Date"
    ]
].copy()


In [15]:
def split_category(cat):
    if isinstance(cat, str) and "-" in cat:
        main, sub = cat.split("-", 1)
        return main.strip(), sub.strip()
    return cat, "Other"

df_scheme[['Category', 'Subcategory']] = df_scheme['Scheme_Category'].apply(
    lambda x: pd.Series(split_category(x))
)


In [16]:
df_scheme['Launch_Date'] = pd.to_datetime(df_scheme['Launch_Date'], errors='coerce')
today = pd.Timestamp.today()

df_scheme['Fund_Age_Years'] = (
    (today - df_scheme['Launch_Date']).dt.days / 365
)
# Convert 'Scheme_Min_Amt' to numeric, coercing errors, then handle 0s and NaNs
df_scheme['Scheme_Min_Amt'] = pd.to_numeric(df_scheme['Scheme_Min_Amt'], errors='coerce')
df_scheme['Scheme_Min_Amt'] = (
    df_scheme['Scheme_Min_Amt']
    .replace(0, np.nan) # Replace 0 with NaN for consistent handling
    .fillna(df_scheme['Scheme_Min_Amt'].median()) # Fill NaNs (from coerce and 0s) with the median
)


In [17]:
df_final = pd.merge(
    fund_features_df,
    df_scheme,
    on="Scheme_Code",
    how="inner"
)
final_columns = [
    "Scheme_Code",
    "Scheme_Name",
    "AMC",
    "Category",
    "Subcategory",
    "Scheme_Min_Amt",
    "Fund_Age_Years",
    "CAGR_1Y",
    "CAGR_3Y",
    "CAGR_5Y",
    "Volatility",
    "Sharpe_Ratio",
    "Max_Drawdown"
    # "Risk_Score", # These columns are not yet computed or merged
    # "Risk_Bucket" # and cause a KeyError. They are removed for now.
]

df_final = df_final[final_columns]

all the necessary features from mutal fund data and nav histtory together in final data frame .


In [18]:
print(df_final.head())

   Scheme_Code                                 Scheme_Name  \
0       100033  Aditya Birla Sun Life Large & Mid Cap Fund   
1       100034  Aditya Birla Sun Life Large & Mid Cap Fund   
2       100037           Aditya Birla Sun Life Income Fund   
3       100038           Aditya Birla Sun Life Income Fund   
4       100041           Aditya Birla Sun Life Liquid Fund   

                                 AMC       Category  \
0  Aditya Birla Sun Life AMC Limited  Equity Scheme   
1  Aditya Birla Sun Life AMC Limited  Equity Scheme   
2  Aditya Birla Sun Life AMC Limited    Debt Scheme   
3  Aditya Birla Sun Life AMC Limited    Debt Scheme   
4  Aditya Birla Sun Life AMC Limited    Debt Scheme   

                    Subcategory  Scheme_Min_Amt  Fund_Age_Years   CAGR_1Y  \
0          Large & Mid Cap Fund          5000.0       30.967123  0.029283   
1          Large & Mid Cap Fund          5000.0       30.967123 -0.033412   
2  Medium to Long Duration Fund          5000.0       32.115068 -

In [27]:
#one hot encoding of category and subcategory
from sklearn.preprocessing import StandardScaler

numeric_features = [
    'CAGR_1Y',
    'CAGR_3Y',
    'CAGR_5Y',
    'Volatility',
    'Sharpe_Ratio',
    'Max_Drawdown',
    'Fund_Age_Years'
]

categorical_features = ['Category', 'Subcategory']
category_ohe = pd.get_dummies(
    df_final[categorical_features],
    prefix=categorical_features
)
print(category_ohe.head())
category_scaled = category_ohe.values * 0.3   # weight factor
numeric_scaled = StandardScaler().fit_transform(
    df_final[numeric_features]
)

   Category_Debt Scheme  Category_ELSS  Category_Equity Scheme  Category_Gilt  \
0                 False          False                    True          False   
1                 False          False                    True          False   
2                  True          False                   False          False   
3                  True          False                   False          False   
4                  True          False                   False          False   

   Category_Growth  Category_Hybrid Scheme  Category_Income  Category_Liquid  \
0            False                   False            False            False   
1            False                   False            False            False   
2            False                   False            False            False   
3            False                   False            False            False   
4            False                   False            False            False   

   Category_Money Market  Catego

In [28]:
X = np.hstack([numeric_scaled, category_scaled])


In [35]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Fill NaN values in X before scaling

# Replace NaN values with the mean of each column
X_filled = np.where(np.isnan(X), np.nanmean(X, axis=0), X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filled)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X_scaled)


In [37]:
fund_index = pd.Series(
    df_final.index,
    index=df_final['Scheme_Name']
)
# If there are duplicate Scheme_Name entries, fund_index[fund_name] will return multiple indices.
# We need to ensure fund_index maps each fund name to a single, unique index.
# This keeps the first occurrence for each duplicate Scheme_Name.
fund_index = fund_index[~fund_index.index.duplicated(keep='first')]

In [38]:
def recommend_similar_funds(fund_name, top_n=5):
    idx = fund_index[fund_name]

    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores = sorted(
        similarity_scores, key=lambda x: x[1], reverse=True
    )

    top_indices = [i[0] for i in similarity_scores[1:top_n+1]]

    return df_final.iloc[top_indices][
        [
            'Scheme_Name',
            'AMC',
            'Category',
            'Subcategory',
            'CAGR_3Y',
            'Sharpe_Ratio'
        ]
    ]


In [34]:
print(recommend_similar_funds("Aditya Birla Sun Life Large & Mid Cap Fund"))

                                   Scheme_Name  \
96    Nippon India Vision Large & Mid Cap Fund   
4494               HDFC Large and Mid Cap Fund   
1979  Nippon India Vision Large & Mid Cap Fund   
73       ICICI Prudential Large & Mid Cap Fund   
4496               HDFC Large and Mid Cap Fund   

                                                    AMC       Category  \
96           Nippon Life India Asset Management Limited  Equity Scheme   
4494              HDFC Asset Management Company Limited  Equity Scheme   
1979         Nippon Life India Asset Management Limited  Equity Scheme   
73    ICICI Prudential Asset Management Company Limited  Equity Scheme   
4496              HDFC Asset Management Company Limited  Equity Scheme   

               Subcategory   CAGR_3Y  Sharpe_Ratio  
96    Large & Mid Cap Fund  0.127465     -0.189836  
4494  Large & Mid Cap Fund  0.131050     -0.069856  
1979  Large & Mid Cap Fund  0.139710     -0.025820  
73    Large & Mid Cap Fund  0.112139     -

In [40]:
print(recommend_similar_funds("HDFC Large and Mid Cap Fund"))

                                   Scheme_Name  \
4495               HDFC Large and Mid Cap Fund   
2439                   SBI LARGE & MIDCAP FUND   
2440                   SBI LARGE & MIDCAP FUND   
1980  Nippon India Vision Large & Mid Cap Fund   
1978  Nippon India Vision Large & Mid Cap Fund   

                                             AMC       Category  \
4495       HDFC Asset Management Company Limited  Equity Scheme   
2439                SBI Funds Management Limited  Equity Scheme   
2440                SBI Funds Management Limited  Equity Scheme   
1980  Nippon Life India Asset Management Limited  Equity Scheme   
1978  Nippon Life India Asset Management Limited  Equity Scheme   

               Subcategory   CAGR_3Y  Sharpe_Ratio  
4495  Large & Mid Cap Fund  0.215482      0.452390  
2439  Large & Mid Cap Fund  0.190829      0.572457  
2440  Large & Mid Cap Fund  0.190833      0.693388  
1980  Large & Mid Cap Fund  0.220987      0.473371  
1978  Large & Mid Cap Fund  0.2