# Lecture 1.3 : Aggregate information across share classes
**From:**
- *CRSP_FUNDNO* = sub-fund (share class)
- *CALDT* = calendar date at quarter end

**To:**
- *CRSP_CL_GRP* = fund of interest
- *CALDT* = calendar date at quarter end

## Objectives :
- Create sensible mapping of funds to unique IDs
- Fix outliers of TNA measures
- Aggregate information across share classes

## Import Settings

In [None]:
import os
import sys
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)
import Functions.Utilis
from settings import *

## Load Files

In [None]:
# Read the fund summary, the fundno/portno map and the monthly return file.
# Only the identifier, date and monthly TNA columns of the return file are kept.
print('Loading needed files...')
fund_summary_ActiveEq = pd.read_csv(outputPath + fund_summary_ActiveEq_name)
fund_port_map = pd.read_csv(inputPath + map_fundno_portno_name)
monthly_return = pd.read_csv(inputPath + monthly_return_name)[['crsp_fundno', 'caldt', 'mtna']]
print('Loading complete!')

## Convert Data to Correct Types

In [None]:
print('Converting columns to the correct data types...')
t0 = time.time()
# fund_port_map: parse both range boundaries and move each date to the end of
# its month (relativedelta(day=31) clamps to the last valid day of the month).
for map_col in ('begdt', 'enddt'):
    parsed = pd.to_datetime(fund_port_map[map_col])
    fund_port_map[map_col] = parsed.apply(lambda stamp: stamp + relativedelta(day=31))
# fund_summary_ActiveEq: same treatment for every column whose name ends in 'dt'
col_dt_list = [name for name in fund_summary_ActiveEq if name.endswith('dt')]
for col in col_dt_list:
    parsed = pd.to_datetime(fund_summary_ActiveEq[col])
    fund_summary_ActiveEq[col] = parsed.apply(lambda stamp: stamp + relativedelta(day=31))
# monthly_return: month-end dates, stored under the name 'monthend'
month_end = pd.to_datetime(monthly_return['caldt'])
monthly_return['caldt'] = month_end.apply(lambda stamp: stamp + relativedelta(day=31))
monthly_return.rename(columns={'caldt': 'monthend'}, inplace=True)
# mtna: the code 'T' and negative values become NaN, everything else is cast to float
monthly_return.mtna = monthly_return.mtna.apply(
    lambda raw: np.nan if (raw == 'T') or (float(raw) < 0) else float(raw))
t1 = time.time()
print('Columns converted in:', t1-t0, 's')

# Keep if caldt >= Dec 31st 1989

In [None]:
# Keep observations dated on or after 31 Dec 1989.
# .copy() makes the filtered frame independent of the original one: the frame
# is modified in place by later `.loc[...] = ...` assignments, which would
# otherwise trigger pandas' SettingWithCopyWarning.
fund_summary_ActiveEq = \
    fund_summary_ActiveEq.loc[fund_summary_ActiveEq['caldt'] >= dt.datetime(1989, 12, 31)].copy()
print('The shape of fund_summary_ActiveEq is: ', str(fund_summary_ActiveEq.shape))
print('The minimum caldt for fund_summary_ActiveEq is: ', fund_summary_ActiveEq.caldt.min())

# Correct wficn for specific fundno
Manual correction:

fundno 53319 has wficn --> 107336

fundno 64229 has wficn --> 400136

fundno in [24644, 24645, 24646, 24647, 24648, 40851, 42067, 43344, 46514, 58716, 58717, 58718] --> wficn 501561

In [None]:
# Manual wficn overrides for specific share classes.
print('Correcting wficn for specific fundnos...')
for single_fundno, fixed_wficn in ((53319, 107336), (64229, 400136)):
    fund_summary_ActiveEq.loc[fund_summary_ActiveEq.crsp_fundno == single_fundno, 'wficn'] = fixed_wficn
# All of these share classes are assigned the same wficn.
fundnos_501561 = [24644, 24645, 24646, 24647, 24648, 40851, 42067,
                  43344, 46514, 58716, 58717, 58718]
fund_summary_ActiveEq.loc[fund_summary_ActiveEq.crsp_fundno.isin(fundnos_501561), 'wficn'] = 501561
print("Observations corrected!")

## Create unique Group ID mapping between CRSP_CL_GRP and WFICN

In [None]:
print('Keeping observations for which both crsp_cl_grp & wficn are present...')
# Rows carrying both identifiers are the only ones usable to build the mapping.
has_both_ids = fund_summary_ActiveEq.wficn.notnull() & fund_summary_ActiveEq.crsp_cl_grp.notnull()
df = fund_summary_ActiveEq.loc[has_both_ids]

## UNIQUE MAP FROM CRSP_CL_GRP TO WFICN
### For each crsp_fundno, store its wficn in a dictionary (the smallest value is kept if a fundno has several)

In [None]:
# crsp_fundno -> wficn; np.unique sorts, so element 0 is the smallest wficn of the fundno
fundno_wficn_dict = df.groupby('crsp_fundno')['wficn'].apply(lambda grp: np.unique(grp)[0]).to_dict()

### Return a dataframe that associates to each crsp_cl_group all unique crsp_fundnos

In [None]:
# One row per crsp_cl_grp holding the sorted array of its distinct crsp_fundnos
fundnos_by_grp = df.groupby('crsp_cl_grp')['crsp_fundno']
df2 = fundnos_by_grp.apply(np.unique).reset_index()

### Add a column to the dataframe that uses the dictionary to match the wficns relative to the crsp_fundnos

In [None]:
# For every group, look up the wficn of each member fundno and keep the distinct values
df2['wficn'] = df2.crsp_fundno.map(
    lambda fundnos: np.unique([fundno_wficn_dict[fundno] for fundno in fundnos]))

### Keep the most frequent wficn when multiple are present

In [None]:
# Most frequent wficn among the fundnos of each crsp_cl_grp.
# Older SciPy returns the mode as a length-1 array, while releases that default
# to keepdims=False return a scalar, on which the former `[0][0]` indexing
# fails. np.atleast_1d makes the extraction work with both.
df2.wficn = df2.crsp_fundno.apply(
    lambda x: np.atleast_1d(stats.mode([fundno_wficn_dict[idx] for idx in x])[0])[0])

### Keep the observations with unique mapping

In [None]:
# crsp_cl_grp -> wficn side of the mapping (one row per crsp_cl_grp)
unique_groupid_mapping1 = df2.loc[:, ['crsp_cl_grp', 'wficn']]

## UNIQUE MAP FROM WFICN TO CRSP_CL_GRP

In [None]:
# crsp_fundno -> crsp_cl_grp (np.unique sorts, so the smallest group id is kept per fundno)
fundno_crspclgrp_dict = df.groupby('crsp_fundno')['crsp_cl_grp'].apply(np.unique).apply(lambda x: x[0]).to_dict()
# return a dataframe that associates to each wficn all unique crsp_fundnos
df3 = df.groupby(['wficn'])['crsp_fundno'].apply(np.unique).reset_index()
# add a column to the dataframe that uses the dictionary to match the crsp_cl_grps relative to the crsp_fundnos
df3['crsp_cl_grp'] = df3.crsp_fundno.apply(lambda x: np.unique([fundno_crspclgrp_dict[idx] for idx in x]))
# show example of crsp_cl_grp problem to be fixed (wrong naming convention)
b = fund_summary_ActiveEq[colList].loc[(fund_summary_ActiveEq.crsp_cl_grp == 2003338) |
                                       (fund_summary_ActiveEq.crsp_cl_grp == 2017461)].sort_values(['crsp_cl_grp',
                                                                                                    'caldt'])
# keep the most frequent crsp_cl_grp when multiple are present (reproduce Excel file with examples of the problem).
# Older SciPy returns the mode as a length-1 array, while releases that default to keepdims=False return a
# scalar, on which the former `[0][0]` indexing fails; np.atleast_1d makes the extraction work with both.
df3.crsp_cl_grp = df3.crsp_fundno.apply(
    lambda x: np.atleast_1d(stats.mode([fundno_crspclgrp_dict[idx] for idx in x])[0])[0])
# wficn -> crsp_cl_grp side of the mapping (one row per wficn)
unique_groupid_mapping2 = df3[['crsp_cl_grp', 'wficn']]
print('Map completed!')

## Create a unique two way map between crsp_cl_grp and wficn

In [None]:
# Keep only the (crsp_cl_grp, wficn) pairs that appear in both one-way maps.
# This eliminates 291 combinations (11 from wficn and 280 from crsp_cl_grp) that are wrongfully
# assigned due to the naming conventions displayed above.
unique_groupid_mapping = unique_groupid_mapping1.merge(unique_groupid_mapping2, how='inner',
                                                       on=['crsp_cl_grp', 'wficn'])

## Create unique mapping between crsp_cl_grp and wficn for funds that only existed before 1999 (no crsp_cl_grp assigned)

In [None]:
# wficns that never carry a crsp_cl_grp and are not part of the two-way map
# receive a synthetic group id, numbered consecutively from 1000001.
no_group_yet = (fund_summary_ActiveEq.crsp_cl_grp.isnull() &
                fund_summary_ActiveEq.wficn.notnull() &
                ~fund_summary_ActiveEq.wficn.isin(unique_groupid_mapping.wficn))
wficn_before = np.unique(fund_summary_ActiveEq.loc[no_group_yet, 'wficn'])
first_synthetic_id = 1000001
groupid_mapping_before = pd.DataFrame(
    {'wficn': wficn_before,
     'crsp_cl_grp': np.arange(first_synthetic_id, first_synthetic_id + len(wficn_before))})


## Generate dictionary

In [None]:
# Stack the synthetic pre-group ids on top of the two-way map, then expose the
# result as lookup dictionaries in both directions.
unique_groupid_mapping = pd.concat([groupid_mapping_before, unique_groupid_mapping], axis=0, sort=False)
wficn_crsp_cl_grp_dict = unique_groupid_mapping.set_index('wficn')['crsp_cl_grp'].to_dict()
crsp_cl_grp_wficn_dict = unique_groupid_mapping.set_index('crsp_cl_grp')['wficn'].to_dict()

# Fill missing wficn using the mapped crsp_cl_grp & viceversa

In [None]:
# Report how many identifiers are missing before any filling takes place.
print('Before there are:')
print("- ", fund_summary_ActiveEq.loc[fund_summary_ActiveEq.crsp_cl_grp.isnull()].shape[0], 'missing crsp_cl_grp')
print("-- Of which ", fund_summary_ActiveEq.loc[(fund_summary_ActiveEq.crsp_cl_grp.isnull()) &
                                                (fund_summary_ActiveEq.caldt < dt.datetime(1998, 1, 1))].shape[0],
      "are before 1998")
print("- ", fund_summary_ActiveEq.loc[fund_summary_ActiveEq.wficn.isnull()].shape[0], 'missing wficn')
print("-- Of which ", fund_summary_ActiveEq.loc[(fund_summary_ActiveEq.wficn.isnull()) &
                                                (fund_summary_ActiveEq.caldt < dt.datetime(2008, 6, 1))].shape[0],
      "are before June 2008")
# Fill the missing wficn from the row's crsp_cl_grp through Util.crsp_cl_grp2wficn.
# NOTE(review): the dictionary passed here is keyed by wficn (wficn -> crsp_cl_grp), while
# crsp_cl_grp_wficn_dict (crsp_cl_grp -> wficn) is built above and never used. Unless the helper
# searches the dictionary by value, crsp_cl_grp_wficn_dict is presumably the intended argument -
# confirm against Util.crsp_cl_grp2wficn.
fund_summary_ActiveEq.loc[fund_summary_ActiveEq.wficn.isnull(), 'wficn'] = \
    fund_summary_ActiveEq.loc[fund_summary_ActiveEq.wficn.isnull(), 'crsp_cl_grp'].apply(Util.crsp_cl_grp2wficn,
                                                                                         d=wficn_crsp_cl_grp_dict)

## Fill in missing crsp_cl_grp and Correct crsp_cl_grp where different CRSP_CL_GRPs assigned to the same fund


In [None]:
# Rows with a missing crsp_cl_grp, and every row that has a wficn, get their
# crsp_cl_grp (re)derived from the wficn through the unique mapping.
needs_group = fund_summary_ActiveEq.crsp_cl_grp.isnull() | fund_summary_ActiveEq.wficn.notnull()
fund_summary_ActiveEq.loc[needs_group, 'crsp_cl_grp'] = \
    fund_summary_ActiveEq.loc[needs_group, 'wficn'].apply(Util.wficn2crsp_cl_grp, d=wficn_crsp_cl_grp_dict)
# Report what is still missing after the fill.
print('After filling there are:')
grp_missing = fund_summary_ActiveEq.crsp_cl_grp.isnull()
wficn_missing = fund_summary_ActiveEq.wficn.isnull()
print("- ", fund_summary_ActiveEq.loc[grp_missing].shape[0], 'missing crsp_cl_grp')
print("-- Of which ",
      fund_summary_ActiveEq.loc[grp_missing & (fund_summary_ActiveEq.caldt < dt.datetime(1998, 1, 1))].shape[0],
      "are before 1998")
print("- ", fund_summary_ActiveEq.loc[wficn_missing].shape[0], 'missing wficn')
print("-- Of which ",
      fund_summary_ActiveEq.loc[wficn_missing & (fund_summary_ActiveEq.caldt < dt.datetime(2008, 9, 1))].shape[0],
      "are before September 2008")

# Use the Fund-Portfolio Map dataset to fill CRSP_PORTNOS

## Find cases for which crsp_portno does not match between our file and the fund map for the appropriate date ranges


In [None]:
# Rename the map's portno so it does not collide with our own crsp_portno after the merge.
fund_port_map.rename(columns={'crsp_portno': 'crsp_portno_map'}, inplace=True)
# Attach every map range of the same crsp_fundno to each observation ...
test = pd.merge(fund_summary_ActiveEq, fund_port_map, how='left', on=['crsp_fundno'])
# ... and keep the ranges that strictly contain the observation date.
# NOTE(review): both bounds are strict, and begdt/enddt/caldt were all moved to month end, so an
# observation dated in the first or last month of a range is excluded - confirm this is intended.
test = test[((test.begdt < test.caldt) & (test.enddt > test.caldt))]
# Keep only the rows where our portno differs from the map (a missing portno also counts as different).
test = test.loc[test.crsp_portno != test.crsp_portno_map]
print('The number of missing crsp_portnos is distributed by date as follows:')
print(test[test.crsp_portno.isnull()].caldt.value_counts())
print('The number of wrong crsp_portnos is distributed by date as follows:')
print(test[~test.crsp_portno.isnull()].caldt.value_counts())
print('Before: the number of missing crsp_portno is:', sum(fund_summary_ActiveEq.crsp_portno.isnull()))
print("-- Of which ", fund_summary_ActiveEq.loc[(fund_summary_ActiveEq.crsp_portno.isnull()) &
                                                (fund_summary_ActiveEq.caldt >= dt.datetime(2008, 6, 1))].shape[0],
      "are after May 2008")
# Reduce to the key columns plus the replacement value, ready to be merged back.
test = test[['caldt', 'crsp_fundno', 'crsp_portno_map']].reset_index(drop=True)

## Substitute the missing/wrong crsp_portnos in our file with the values in the map

In [None]:
print('Before: the number of missing crsp_portno is', fund_summary_ActiveEq.crsp_portno.isnull().sum())
# Bring in the map value per (crsp_fundno, caldt) and let it override ours wherever it exists.
fund_summary_ActiveEq = fund_summary_ActiveEq.merge(test, how='left', on=['crsp_fundno', 'caldt'])
has_map_value = fund_summary_ActiveEq['crsp_portno_map'].notnull()
fund_summary_ActiveEq.loc[has_map_value, 'crsp_portno'] = \
    fund_summary_ActiveEq.loc[has_map_value, 'crsp_portno_map']
fund_summary_ActiveEq = fund_summary_ActiveEq.drop(columns=['crsp_portno_map'])
portno_missing = fund_summary_ActiveEq.crsp_portno.isnull()
print('After: the number of missing crsp_portno is', portno_missing.sum())
print("-- Of which ",
      fund_summary_ActiveEq.loc[portno_missing & (fund_summary_ActiveEq.caldt >= dt.datetime(2008, 6, 1))].shape[0],
      "are after May 2008")

# Deal with multiple CRSP_PORTNOs per CRSP_CL_GRP-CALDT
- Keep the most frequent portno (In the homework you will explore better methods) 

### Count the occurrences of multiple portnos

In [None]:
# Number of distinct portnos per group/date, restricted to the pairs that have more than one
group_date = ['crsp_cl_grp', 'caldt']
num_portno = fund_summary_ActiveEq.groupby(group_date)['crsp_portno'].nunique()
mult_portno = num_portno.loc[num_portno > 1]
# Pull every share-class row of those pairs and collect their portnos into one list per pair
multi = mult_portno.index.to_frame(index=False)
multi = multi.merge(fund_summary_ActiveEq, how='left', on=group_date)
multiCounter = multi.groupby(group_date)['crsp_portno'].apply(list).reset_index()

### pick the most frequent crsp_portno per grp/date

In [None]:
# Most frequent portno of each group/date list.
# Older SciPy returns the mode as a length-1 array, while releases that default
# to keepdims=False return a scalar, on which the former `[0][0]` indexing
# fails. np.atleast_1d makes the extraction work with both.
multiCounter.crsp_portno = multiCounter.crsp_portno.apply(lambda x: np.atleast_1d(stats.mode(x)[0])[0])
# Anything that is not a positive number (NaN included) is treated as missing.
multiCounter.crsp_portno = [i if i > 0 else np.nan for i in multiCounter.crsp_portno]

## Make changes to the dataframe

In [None]:
if printing:
    print("- making changes to fund_summary_ActiveEq... ")
# Write the selected portno onto every share class of the affected group/date pairs.
for chosen in multiCounter.itertuples(index=False):
    same_group_date = ((fund_summary_ActiveEq.crsp_cl_grp == chosen.crsp_cl_grp) &
                       (fund_summary_ActiveEq.caldt == chosen.caldt))
    fund_summary_ActiveEq.loc[same_group_date, 'crsp_portno'] = chosen.crsp_portno
if printing:
    print("Multiple portnos taken care of!")

# Compare mtna (monthly TNA from return file) and tna_latest (quarterly)

Note: the final distribution of tna_latest is checked as follows:

fund_summary_ActiveEq.tna_latest.hist()

fund_summary_ActiveEq.tna_latest.describe(percentiles=percentilesDetailPlus)

b = fund_summary_ActiveEq[colList].sort_values(by=['tna_latest'], ascending=False)

d = c[['caldt', 'fund_name_short', 'tna_latest']]

## Create data frame for comparison

In [None]:
# Quarterly TNA from the summary file, keyed like the monthly return file
fund_tna = fund_summary_ActiveEq[['crsp_fundno', 'caldt', 'tna_latest']].rename(columns={'caldt': 'monthend'})
# Monthly TNA restricted to quarter-end months (reset_index also adds the old index as a column)
df = monthly_return.reset_index()
is_quarter_end = df.monthend.map(lambda stamp: stamp.month).isin([3, 6, 9, 12])
df = df.loc[is_quarter_end]
fund_tna = fund_tna.merge(df, how='left', on=['crsp_fundno', 'monthend'])

## Cross filling

In [None]:
# Rows where exactly one of the two TNA measures is missing are the ones cross-filling changes
tna_latest_missing = fund_tna.tna_latest.isnull()
mtna_missing = fund_tna.mtna.isnull()
chk = fund_tna[tna_latest_missing ^ mtna_missing]
if printing:
    print('Cross-filling tna_latest and mtna for missing values')
    print('- the number of observations impacted by cross-filling is:', len(chk))
# Each measure borrows the other one where it is missing; rows missing both stay missing
fund_tna.loc[tna_latest_missing, 'tna_latest'] = fund_tna.loc[tna_latest_missing, 'mtna']
fund_tna.loc[mtna_missing, 'mtna'] = fund_tna.loc[mtna_missing, 'tna_latest']

## Create variables for comparison

In [None]:
# Cross-checking for outliers & mistakes
print('Cross-checking for outliers & mistakes')
print('- creating variables for comparision')
# Absolute gap between the two TNA measures, and the same gap relative to the larger of the two.
# Computed column-wise: the earlier row-by-row loop did one label lookup per value and only
# worked when the index was exactly 0..n-1. A missing value on either side gives NaN, as before.
tna_gap = (fund_tna.mtna - fund_tna.tna_latest).abs()
fund_tna['diff_rate'] = tna_gap / np.maximum(fund_tna.mtna, fund_tna.tna_latest)
fund_tna['diff_abs'] = tna_gap
# Within-fund change of each measure versus the previous observation, plus the change that follows it
fund_tna = fund_tna.sort_values(['crsp_fundno', 'monthend'])
fund_tna['lag_mtna'] = fund_tna.groupby('crsp_fundno').mtna.shift(1)
fund_tna['mtna_change_rate'] = fund_tna.mtna/fund_tna.lag_mtna - 1
fund_tna['next_mtna_change_rate'] = fund_tna.groupby('crsp_fundno').mtna_change_rate.shift(-1)
fund_tna['lag_tna_latest'] = fund_tna.groupby('crsp_fundno').tna_latest.shift(1)
fund_tna['tna_latest_change_rate'] = fund_tna.tna_latest/fund_tna.lag_tna_latest - 1
fund_tna['next_tna_latest_change_rate'] = fund_tna.groupby('crsp_fundno').tna_latest_change_rate.shift(-1)
# Only the size of the changes matters for the checks below
cols = ['mtna_change_rate', 'next_mtna_change_rate', 'tna_latest_change_rate', 'next_tna_latest_change_rate']
fund_tna[cols] = fund_tna[cols].abs()
print('Comparison variables constructed!')

In [None]:
print('- the description of the comparison variables is:')
# Extend the existing list in place so later cells see the two extra columns as well
cols.extend(['diff_rate', 'diff_abs'])
fund_tna[cols].describe(percentiles=percDetail)

## Find crsp_fundno with mtna and tna_latest having more than 1% difference

In [None]:
print('- finding crsp_fundno with mtna and tna_latest having more than 1% difference')
# Funds with at least one observation whose relative gap exceeds 1%
gap_above_1pct = fund_tna.diff_rate > 0.01
fund_to_check = fund_tna.loc[gap_above_1pct, 'crsp_fundno'].unique()
print('Number of crsp_fundno with significantly different mtna and tna_latest:', len(fund_to_check))

## Handle large differences

In [None]:
# Resolve, fund by fund, the observations where mtna and tna_latest disagree by more than 1%.
# The loop iterates over the array bound to fund_to_check when it starts; rebinding the name
# with np.delete inside the loop only tracks the funds that remain unresolved.
for fund in fund_to_check:
    is_fund = fund_tna.crsp_fundno == fund
    df = fund_tna.loc[is_fund]
    large_gap = df.diff_rate > 0.01
    if sum(df.diff_rate.dropna() > 0.01) == sum(df.diff_rate.dropna() > np.exp(-3)):
        # Every gap above 1% is also above exp(-3) (about 5%): most values agree
        # and the rest are obviously different -> use mtna
        fund_to_check = np.delete(fund_to_check, np.where(fund_to_check == fund))
        fund_tna.loc[is_fund, 'tna_latest'] = fund_tna.loc[is_fund, 'mtna']
    elif np.all(df.loc[large_gap, ['tna_latest_change_rate', 'next_tna_latest_change_rate']] > 0.01):
        # tna_latest jumps into and out of each disputed observation -> use mtna
        fund_tna.loc[is_fund, 'tna_latest'] = fund_tna.loc[is_fund, 'mtna']
        fund_to_check = np.delete(fund_to_check, np.where(fund_to_check == fund))
    elif np.all(df.loc[large_gap, ['mtna_change_rate', 'next_mtna_change_rate']] > 0.01):
        # mtna jumps into and out of each disputed observation -> use tna_latest
        fund_tna.loc[is_fund, 'mtna'] = fund_tna.loc[is_fund, 'tna_latest']
        fund_to_check = np.delete(fund_to_check, np.where(fund_to_check == fund))
    elif np.all(df.diff_abs.dropna() < 50):
        # For the case where the max absolute difference is less than 50 millions -> use mtna
        fund_to_check = np.delete(fund_to_check, np.where(fund_to_check == fund))
        fund_tna.loc[is_fund, 'tna_latest'] = fund_tna.loc[is_fund, 'mtna']
    else:
        # The others: blank both measures on the disputed observations of THIS fund only.
        # The earlier version blanked every fund still listed in fund_to_check, which includes
        # funds the loop had not reached yet; those funds then copied NaN instead of real values
        # when one of the branches above handled them.
        fund_tna.loc[is_fund & (fund_tna.diff_rate > 0.01), ['tna_latest', 'mtna']] = np.nan
# Drop the helper columns
fund_tna = fund_tna[['crsp_fundno', 'monthend', 'tna_latest', 'mtna']]

## Save to file

In [None]:
# Persist the reconciled TNA table (no index column)
compared_tna_file = outputPath + Compared_TNA_name
fund_tna.to_csv(compared_tna_file, index=False)
print('Large differences were handled and the dataframe was saved to file!')

## Modify tna-latest in fund_summary_ActiveEq

In [None]:
# Snapshot of tna_latest before it is replaced by the reconciled values
tna_before = fund_summary_ActiveEq.tna_latest
print('- before the modification the distribution of tna_latest is:',
      tna_before.describe())
print('- before the modification the number of missing tna_latest is:',
      fund_summary_ActiveEq[tna_before.isnull()].shape[0])

In [None]:
print('Modifying tna_latest in fund_summary_ActiveEq...')
# Swap the original tna_latest for the reconciled one (mtna comes along with it)
fund_tna = fund_tna.rename(columns={'monthend': 'caldt'})
fund_summary_ActiveEq = fund_summary_ActiveEq.drop(columns=['tna_latest'])
fund_summary_ActiveEq = fund_summary_ActiveEq.merge(fund_tna, how='left', on=['crsp_fundno', 'caldt'])
print('Modification done!')

In [None]:
# Same snapshot after the replacement
tna_after = fund_summary_ActiveEq.tna_latest
print('- after the modification the distribution of tna_latest is:')
print(tna_after.describe(percentiles=percDetail))
print('- after the modification the number of missing tna_latest is:',
      fund_summary_ActiveEq[tna_after.isnull()].shape[0])

## Remove observations with TNA <= 5M (observations with missing TNA are kept)


In [None]:
# Keep observations whose tna_latest is missing or strictly above 5
keep_obs = fund_summary_ActiveEq['tna_latest'].isnull() | (fund_summary_ActiveEq['tna_latest'] > 5)
fund_summary_ActiveEq = fund_summary_ActiveEq.loc[keep_obs]
print('- the final distribution of tna_latest is:')
print(fund_summary_ActiveEq.tna_latest.describe(percentiles=percDetail))
print('- the final number of missing tna_latest is:',
      fund_summary_ActiveEq[fund_summary_ActiveEq.tna_latest.isnull()].shape[0])

# Check variables mapping and distribution

## Check wficn crsp_cl_grp mapping

In [None]:
# Count the groups that map to more than one wficn over the whole sample
wficn_per_grp = fund_summary_ActiveEq.groupby('crsp_cl_grp')['wficn'].nunique()
a = int((wficn_per_grp > 1).sum())
print('The number of instances in which a crsp_cl_grp is associated with more than one wficn is:', a)

In [None]:
# Distribution of the number of wficn per group/date, split at 1 September 2008
aa = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])['wficn'].nunique().reset_index()
split_date = dt.datetime(2008, 9, 1)
a1 = aa.loc[aa.caldt < split_date, 'wficn'].value_counts()
a2 = aa.loc[aa.caldt >= split_date, 'wficn'].value_counts()
print('The frequency of the number of wficn per crsp_cl_grp-caldt before September 2008 is:')
print(a1)
print('The frequency of the number of wficn per crsp_cl_grp-caldt from September 2008 is:')
print(a2)

In [None]:
# Among rows with both a group and a portno: wficn/date pairs spanning several groups
grp_and_portno = fund_summary_ActiveEq.crsp_cl_grp.notnull() & fund_summary_ActiveEq.crsp_portno.notnull()
f = fund_summary_ActiveEq[grp_and_portno]
f = f.groupby(['wficn', 'caldt'])['crsp_cl_grp'].nunique().reset_index()
f = f.loc[f.crsp_cl_grp > 1]
print('The number of times times in which a wficn is associated to multiple crsp_cl_grp in the same date is:',
      f.shape[0])

## Check crsp_cl_grp portno mapping

In [None]:
# Distribution of the number of portnos per group/date, split at 1 September 2008
b = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])['crsp_portno'].nunique().reset_index()
split_date = dt.datetime(2008, 9, 1)
b1 = b.loc[b.caldt < split_date, 'crsp_portno'].value_counts()
b2 = b.loc[b.caldt >= split_date, 'crsp_portno'].value_counts()
print('The frequency of the number of crsp_portno per crsp_cl_grp-caldt before September 2008 is:')
print(b1)
print('The frequency of the number of crsp_portno per crsp_cl_grp-caldt from September 2008 is:')
print(b2)

In [None]:
# Among rows with both a group and a portno: portno/date pairs spanning several groups
grp_and_portno = fund_summary_ActiveEq.crsp_cl_grp.notnull() & fund_summary_ActiveEq.crsp_portno.notnull()
d = fund_summary_ActiveEq[grp_and_portno]
d = d.groupby(['crsp_portno', 'caldt'])['crsp_cl_grp'].nunique().reset_index()
d = d.loc[d.crsp_cl_grp > 1]
print('The number of times times in which a portno is associated to multiple crsp_cl_grp in the same date is:',
      d.shape[0])
print('The frequency of the number of associated crsp_cl_grp in this subsample is:')
print(d.crsp_cl_grp.value_counts())
print('NB! THIS NUMBER SHOULD HAVE BEEN ZERO!')

## Check crsp_cl_grp fundno mapping

In [None]:
# How many share classes each group has on each date
fundnos_per_grp_date = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])['crsp_fundno'].nunique()
c = fundnos_per_grp_date.reset_index().crsp_fundno.value_counts()
print('The frequency of the number of crsp_fundno per crsp_cl_grp-caldt is:')
print(c)

In [None]:
# Among rows with both a group and a portno: fundno/date pairs spanning several groups
grp_and_portno = fund_summary_ActiveEq.crsp_cl_grp.notnull() & fund_summary_ActiveEq.crsp_portno.notnull()
e = fund_summary_ActiveEq[grp_and_portno]
e = e.groupby(['crsp_fundno', 'caldt'])['crsp_cl_grp'].nunique().reset_index()
e = e.loc[e.crsp_cl_grp > 1]
print('The number of times times in which a fundno is associated to multiple crsp_cl_grp in the same date is:',
      e.shape[0])

# Aggregate information of different share classes of the same fund for the same month into a unique observation

##  For variables that are unique across funds of the same group but vary over time:
        - adv_name mgmt_name mgr_dt mgr_name fund_name NASDAQ wficn crsp_obj_cd lipper_obj_cd

In [None]:
# Overwrite mgr_name on each group/date listed in Typos with its corrected spelling
for typo in Typos:
    same_obs = ((fund_summary_ActiveEq['crsp_cl_grp'] == typo['crsp_cl_grp']) &
                (fund_summary_ActiveEq['caldt'] == typo['caldt']))
    fund_summary_ActiveEq.loc[same_obs, 'mgr_name'] = typo['Correct Mgr_name']

In [None]:
# For each feature, count the group/date pairs holding more than one distinct non-missing value
for feature in features:
    observed = fund_summary_ActiveEq.loc[fund_summary_ActiveEq[feature].notnull()]
    df = observed.groupby(['crsp_cl_grp', 'caldt'])[feature].nunique()
    if printing:
        print('Number of observations having multiple', feature, ':', (df > 1).sum())
# Then write the exceptions of every feature to file through the project helper
for feature in features:
    Util.exceptions_tofile(fund_summary_ActiveEq, feature, outputPath)

In [None]:
# One row per group/date; each feature is collapsed with Util.mode_with_NA
by_group_date = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])
agg_fund_summary = by_group_date[features].agg(Util.mode_with_NA).reset_index()

In [None]:
# mgr_dt: earliest non-missing date of each group/date, assigned by position
# (relies on the groups coming out in the same order as the rows of agg_fund_summary)
oldest_mgr_dt = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])['mgr_dt'].apply(
    lambda dates: dates.dropna().min())
agg_fund_summary.mgr_dt = oldest_mgr_dt.reset_index(drop=True)
print('Aggregation complete.')

## Sum TNA of all CRSP_FUNDNO with the same CRSP_CL_GRP and CALDT (variable: TNA_LATEST)

Note: the final distribution of the aggregated tna_latest variable is checked as follows:
    
    agg_fund_summary.tna_latest.hist()
    
    agg_fund_summary.tna_latest.describe(percentiles=percentilesDetailPlus)
    
    b = agg_fund_summary[['caldt', 'fund_name_short', 'tna_latest']].sort_values(by=['tna_latest'], ascending=False)


In [None]:
print('Aggregating TNA information by summing it across funds per group/date...')
# Groups that have at least one observation with a missing TNA
tnamiss_grp = list(fund_summary_ActiveEq.loc[fund_summary_ActiveEq['tna_latest'].isnull()]['crsp_cl_grp'])
tnamiss_grp = list(set(tnamiss_grp))
# Drop NaN group ids (NaN is the only value that differs from itself)
tnamiss_grp = [x for x in tnamiss_grp if x == x]
print('Number of crsp_cl_grp having miss TNA: ', len(tnamiss_grp))
num_total_missing = 0
num_partial_missing = 0
tna_miss = []
for grp in tnamiss_grp:
    df = fund_summary_ActiveEq.loc[fund_summary_ActiveEq.crsp_cl_grp == grp]
    # Non-missing counts per date. The columns are selected with a list: selecting several
    # columns of a groupby with a bare tuple raises in pandas 2.0 and later.
    df2 = df.groupby('caldt')[['fund_name_short', 'tna_latest']].count()
    # Dates where fewer TNA values than fund names are available
    df2 = df2.query('fund_name_short != tna_latest')
    for month, counts in df2.iterrows():
        if counts['tna_latest'] == 0:
            tna_miss.append((grp, month, 'all missing', 1))
            num_total_missing += 1
        elif counts['fund_name_short'] != 0:
            # Share of the named funds that lack a TNA on this date
            tna_miss.append(
                (grp, month, 'partially missing', 1 - counts['tna_latest'] / counts['fund_name_short']))
            num_partial_missing += 1
tna_miss = pd.DataFrame(tna_miss, columns=['CRSP_CL_GRP', 'MONTHEND', 'TNA_MISSING STATUS', 'MISSING RATE'])

In [None]:
print('Number of crsp_cl_grp/monthend observations missing TNA for all funds: ', num_total_missing)
print('Number of crsp_cl_grp/monthend observations missing TNA for some funds: ', num_partial_missing)
# Missing rates of the partially missing group/date pairs
partial_rates = tna_miss.loc[tna_miss['TNA_MISSING STATUS'] == 'partially missing', 'MISSING RATE']
print('Average of TNA missing rate: ',
      np.mean(partial_rates))
print('Variance of TNA missing rate: ',
      np.var(partial_rates))
# Sum TNA of all CRSP_FUNDNO with the same CRSP_CL_GRP and CALDT; min_count=1 leaves the sum
# missing when every share class is missing. The result is assigned by position.
tna_by_group_date = fund_summary_ActiveEq.groupby(['crsp_cl_grp', 'caldt'])['tna_latest'].sum(min_count=1)
agg_fund_summary['tna_latest'] = tna_by_group_date.reset_index(drop=True)
print('Aggregation complete!')

In [None]:
# Summary of the aggregated TNA and the current size of the aggregated table
agg_tna_summary = agg_fund_summary.tna_latest.describe(percentiles=percDetail)
print('The distribution of the aggregated tna_latest is:')
print(agg_tna_summary)
print('The shape of agg_fund_summary after dealing with tna_latest is: ')
print(str(agg_fund_summary.shape))

## Use the oldest fund for qualitative information and fill it forward for all months
       – Variables: Fund_name, FIRST_OFFER_DT

In [None]:
# Earliest first_offer_dt of every group ...
group_oldest_info = fund_summary_ActiveEq.groupby('crsp_cl_grp')['first_offer_dt'].min().reset_index()
# ... together with the short name of one share class carrying that date (the first match is kept)
offer_names = fund_summary_ActiveEq[['crsp_cl_grp', 'first_offer_dt', 'fund_name_short']]
group_oldest_info = group_oldest_info.merge(offer_names, how='left', on=['crsp_cl_grp', 'first_offer_dt'])
group_oldest_info = group_oldest_info.drop_duplicates(['crsp_cl_grp', 'first_offer_dt'])
group_oldest_info = group_oldest_info.rename(columns={'fund_name_short': 'fund_name_first_offer'})
# Attach the same information to every date of the group
agg_fund_summary = agg_fund_summary.merge(group_oldest_info, how='left', on='crsp_cl_grp')
print('Aggregation complete.')
print('The shape of agg_fund_summary after adding the oldest qualitative info is: ')
print(str(agg_fund_summary.shape))

## Compute the weighted average of numerical variables using the one month lag of TNA_LATEST as weights
       (note that we compare caldt to tna_latest to determine if tna_latest is already a stale - lagged value):
        - Variables: per_cash per_com per_pref per_conv per_corp per_muni per_govt per_oth per_cash per_bond 
                     per_abs per_mbs per_eq_oth per_fi_oth exp_ratio turn_ratio FRONT_LOAD Rear_load Load AvEq 
                     mgmt_fee ACTUAL_12B1
        - Where TNA is available for all CRSP_CL_GRP in a given CALDT, then the normalized TNA weight is used.
          When TNA is missing then equal weights are used; If TNA is missing for some but not all funds still use
          equal weights. Keep track of the number of the exceptions and save them in a separate file
        - The same procedure is later applied to monthly returns (see returns.py)

In [None]:
print('Computing the weighted average of numerical variables using lagged TNA...')
group_date = ['crsp_cl_grp', 'caldt']
# Weighted means with tna_latest as weights, appended column-wise by position
weighted_averages = fund_summary_ActiveEq.groupby(group_date).apply(Util.weighted_mean, wtMeanList,
                                                                    w="tna_latest")
agg_fund_summary = pd.concat([agg_fund_summary, weighted_averages.reset_index(drop=True)], axis=1)
# tna_adjust: previous observation's tna_latest of the same share class ...
fund_summary_ActiveEq = fund_summary_ActiveEq.sort_values(['crsp_fundno', 'caldt'])
fund_summary_ActiveEq['tna_adjust'] = fund_summary_ActiveEq.groupby(['crsp_fundno'])['tna_latest'].shift(1)
# ... except where tna_latest is already dated before caldt, in which case it is used as is ...
already_lagged = fund_summary_ActiveEq.tna_latest_dt < fund_summary_ActiveEq.caldt
fund_summary_ActiveEq.loc[already_lagged, 'tna_adjust'] = fund_summary_ActiveEq.loc[already_lagged, 'tna_latest']
# ... and except on each share class's first date, which has no previous observation.
# GroupBy.first() takes the first non-null entry of each column separately.
df = fund_summary_ActiveEq.groupby('crsp_fundno')[['caldt', 'tna_latest']].first().reset_index()
for first_obs in df.itertuples(index=False):
    at_first_obs = ((fund_summary_ActiveEq.crsp_fundno == first_obs.crsp_fundno) &
                    (fund_summary_ActiveEq.caldt == first_obs.caldt))
    fund_summary_ActiveEq.loc[at_first_obs, 'tna_adjust'] = first_obs.tna_latest
# Weighted means with tna_adjust as weights, stored under an '_adj' suffix
weighted_averages_adj = fund_summary_ActiveEq.groupby(group_date).apply(Util.weighted_mean, wtMeanList,
                                                                        w='tna_adjust')
weighted_averages_adj.columns = [name + '_adj' for name in weighted_averages_adj.columns]
agg_fund_summary = pd.concat([agg_fund_summary, weighted_averages_adj.reset_index(drop=True)], axis=1)
print('Aggregation complete!')
print('The shape of agg_fund_summary after adding the tna_weighted variables is: ')
print(str(agg_fund_summary.shape))

## Regarding CRSP_PORTNO, there should be a unique PORTNO per CRSP_CL_GRP and monthend
- Keep the PORTNO for as many months as possible (fill missing value using neighbors)
- Also explored in the homework


# Dataset Summary

In [None]:
# Frequency of categorical variables, per observation and per distinct group
categorical_variables = ['crsp_obj_cd', 'lipper_obj_cd']
if printing:
    print('-------------------------------------------------')
    print('The final aggregated dataset has the following characteristics:')
    one_row_per_group = agg_fund_summary.drop_duplicates('crsp_cl_grp')
    for col in categorical_variables:
        print('Frequency of', col, 'in fund_summary_US_Active:')
        print(agg_fund_summary[col].value_counts())
        print(one_row_per_group[col].value_counts())
print('The number of unique crsp_cl_grp is: ', agg_fund_summary.crsp_cl_grp.nunique())


# Save Aggregate_fund_summary to file

In [None]:
if printing:
    print('-------------------------------------------------')
    print("Saving the full and aggregated datasets to file...")
# Aggregated table first, then the share-class level table (which overwrites the input file name)
for frame, file_name in ((agg_fund_summary, fund_summary_agg_ActiveEq_name),
                         (fund_summary_ActiveEq, fund_summary_ActiveEq_name)):
    frame.to_csv(outputPath + file_name, index=False)
