# Lecture 1.2 : Clean & sub-set summary data
## IDS : 
- CRSP_FUNDNO = sub-fund (share class) 
- CALDT = calendar date at quarter end

## Objectives : 
1. Apply general filters to the whole dataset
2. Subset the selected funds further to identify balanced, US active equity funds

## Import Settings

In [None]:
import os
import sys
# Add the parent directory to the module search path so that the `Functions`
# and `settings` imports below can be resolved from there (presumably both
# live one level above this notebook -- confirm against the repo layout).
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)
from Functions import Utilis as Util
# NOTE(review): later cells use `pd`, `np`, `time`, `dt`, `relativedelta` and
# constants such as `inputPath` / `outputPath` without importing them here, so
# they presumably arrive through this star import -- confirm in settings.py.
from settings import *

## Apply Filters to Entire Dataset

### Loading & Describing files

In [None]:
# Read the four raw CSV inputs and report the wall-clock time the load took.
print('Loading needed files...')
load_started = time.time()
fund_summary = pd.read_csv(inputPath + fund_summary_name, low_memory=False)
# The CRSP_FUNDNO <-> WFICN link file is the only one read as latin1.
CRSP_FUNDNO_WFICN = pd.read_csv(
    inputPath + CRSP_FUNDNO_WFICN_name,
    low_memory=False,
    encoding='latin1',
)
FRONT_LOAD = pd.read_csv(inputPath + FRONT_LOAD_name, low_memory=False)
REAR_LOAD = pd.read_csv(inputPath + REAR_LOAD_name, low_memory=False)
load_seconds = time.time() - load_started
print('Files were loaded in', load_seconds, 's!')

In [None]:
# For each identifier column: run the project's null check, then report the
# number of distinct values. A separator line precedes every identifier.
print('***DESCRIBING crsp_summary***')
print('The shape of fund_summary is:', fund_summary.shape)
for id_col in ('crsp_fundno', 'crsp_cl_grp', 'crsp_portno'):
    print('-----.....-----.....-----')
    Util.isnull_chk(fund_summary, id_col)
    print('The number of unique ' + id_col + ' is:', fund_summary[id_col].nunique())

In [None]:
# Average count of distinct share classes (crsp_fundno) and portfolios
# (crsp_portno) per class group, both per date and over the whole sample.
per_group_and_date = fund_summary.groupby(['crsp_cl_grp', 'caldt'])
per_group = fund_summary.groupby(['crsp_cl_grp'])

avg_fundno_per_date = per_group_and_date['crsp_fundno'].nunique().mean()
avg_fundno_overall = per_group['crsp_fundno'].nunique().mean()
avg_portno_per_date = per_group_and_date['crsp_portno'].nunique().mean()
avg_portno_overall = per_group['crsp_portno'].nunique().mean()

print('The average number of crsp_fundno per crsp_cl_grp per date:', avg_fundno_per_date)
print('The average number of crsp_fundno per crsp_cl_grp over time:', avg_fundno_overall)
print('-----.....-----.....-----')
print('The average number of crsp_portno per crsp_cl_grp per date:', avg_portno_per_date)
print('The average number of crsp_portno per crsp_cl_grp over time:', avg_portno_overall)

In [None]:
# Show the first eight rows as the cell output.
fund_summary.iloc[:8]

In [None]:
# Shape, column names and first ten rows of the CRSP_FUNDNO <-> WFICN table.
print('***DESCRIBING CRSP_FUNDNO_WFICN***')
print('The shape of CRSP_FUNDNO_WFICN is:', CRSP_FUNDNO_WFICN.shape)
print('The columns of CRSP_FUNDNO_WFICN are:', CRSP_FUNDNO_WFICN.columns, sep='\n')
CRSP_FUNDNO_WFICN.iloc[:10]

In [None]:
# Shape, column names and first ten rows of the front-load table.
print('***DESCRIBING FRONT_LOAD***')
print('The shape of FRONT_LOAD is:', FRONT_LOAD.shape)
print('The columns of FRONT_LOAD are:', FRONT_LOAD.columns, sep='\n')
FRONT_LOAD.iloc[:10]

In [None]:
# Shape, column names and first ten rows of the rear-load table.
print('***DESCRIBING REAR_LOAD***')
print('The shape of REAR_LOAD is:', REAR_LOAD.shape)
print('The columns of REAR_LOAD are:', REAR_LOAD.columns, sep='\n')
REAR_LOAD.iloc[:10]

### Converting data to correct type & transform dates to month end

In [None]:
# Snapshot of the raw date columns before any conversion is applied.
first_caldt = fund_summary['caldt'][0]
print('Before:', type(first_caldt), ' ', first_caldt)
print('The unique dates of fund_summary are:', fund_summary['caldt'].unique(), sep='\n')
print('The unique beddt of FRONT_LOAD are:', FRONT_LOAD['begdt'].unique(), sep='\n')

In [None]:
# Coerce tna_latest to numeric and move every '*dt' column of the three tables
# to the last day of its month.
t0 = time.time()
print('Converting columns to the correct data types & transforming dates to monthend...')
print('**This might take some time** -> Bonus points if you can do this more efficiently?')
fund_summary['tna_latest'] = pd.to_numeric(fund_summary['tna_latest'], errors='coerce')  # invalid parsing will be set to NaN

# MonthEnd(0) rolls a date forward to its month end and leaves dates that are
# already on a month end unchanged. Adding it to the whole column is
# vectorized, unlike the previous per-row `x + relativedelta(day=31)` apply.
to_month_end = pd.offsets.MonthEnd(0)
for frame in (fund_summary, FRONT_LOAD, REAR_LOAD):
    # Each table is converted using its OWN date columns; REAR_LOAD used to be
    # driven by FRONT_LOAD's column list.
    col_dt_list = [x for x in frame.columns if x.endswith('dt')]
    for col in col_dt_list:
        frame[col] = pd.to_datetime(frame[col]) + to_month_end
t1 = time.time()
print('Columns have been converted in', t1-t0, 's!')

In [None]:
# Same snapshot as before the conversion, to confirm the new type and values.
first_caldt = fund_summary['caldt'][0]
print('After:', type(first_caldt), ' ', first_caldt)
print('The unique dates of fund_summary are:', fund_summary['caldt'].unique(), sep='\n')
print('The unique beddt of FRONT_LOAD are:', FRONT_LOAD['begdt'].unique(), sep='\n')

### Keeping only observations >= Dec 31st 1989

In [None]:
# Keep only observations dated on or after 31 December 1989.
sample_start = dt.datetime(1989, 12, 31)
print('Before')
print('The mininum date in fund_summary is:', fund_summary['caldt'].min())
on_or_after_start = fund_summary['caldt'] >= sample_start
fund_summary = fund_summary[on_or_after_start]
print('After')
print('The mininum date in fund_summary is:', fund_summary['caldt'].min())

### Eliminating small funds: Keep if TNA_LATEST > 5 or TNA_LATEST is missing


In [None]:
# Diagnostics before the small-fund filter: how many share classes ever report
# TNA_LATEST <= 5, and how many rows belong to those share classes.
is_small = fund_summary['tna_latest'] <= 5
fundno_tna_latest_less5 = fund_summary.loc[is_small, 'crsp_fundno']
# Count distinct fund ids (this previously counted distinct tna_latest values).
print('The total number of CRSP_FUNDNO is:', fund_summary['crsp_fundno'].nunique())
print('The number of CRSP_FUNDNO with TNA less than 5 is:', fundno_tna_latest_less5.nunique())
print('The total number of observations is:', fund_summary.shape[0])
# This counts every observation of a fund that has at least one TNA <= 5 row,
# and prints a row count (not the full shape tuple) like the line above.
print('The number of observations with TNA less than 5 is:',
      fund_summary.loc[fund_summary['crsp_fundno'].isin(fundno_tna_latest_less5)].shape[0])

In [None]:
# Flag negative TNA as bad observations, blank them out, then keep only rows
# whose TNA_LATEST is missing or strictly greater than 5.
print('The shape of Fund_Summary before eliminating small funds is:', fund_summary.shape)
# fund_summary is the result of an earlier row filter; take an explicit copy
# so the column writes below act on an independent frame.
fund_summary = fund_summary.copy()
negative_tna = fund_summary['tna_latest'] < 0
fund_summary['bad_obs'] = 0
fund_summary.loc[negative_tna, 'bad_obs'] = 1
# np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
fund_summary.loc[negative_tna, 'tna_latest'] = np.nan
keep_rows = fund_summary['tna_latest'].isnull() | (fund_summary['tna_latest'] > 5)
fund_summary = fund_summary.loc[keep_rows]
print('The shape of Fund_Summary after eliminating small funds is:', fund_summary.shape)

### Adjusting for incubation bias: keep OBSERVATIONS (not funds) for which FIRST_OFFER_DT <= CALDT


In [None]:
# Keep an observation when the fund was already offered at that date, or when
# the first offer date is unknown.
print('The shape of Fund_Summary before adjusting for incubation bias is:', len(fund_summary))
already_offered = fund_summary['first_offer_dt'] <= fund_summary['caldt']
offer_date_missing = fund_summary['first_offer_dt'].isnull()
fund_summary = fund_summary[already_offered | offer_date_missing]
print('The shape of Fund_Summary after adjusting for incubation bias is:', len(fund_summary))

### Adjusting expenses, cash and fees for outliers s.t.
- if [turn_ratio, exp_ratio, ACTUAL_12B1] = -99.00 then [turn_ratio, exp_ratio, ACTUAL_12B1] = np.nan
- if ACTUAL_12B1 = 0 & CALDT < '01Jan1998'd then ACTUAL_12B1 = np.nan
- Set the negative ACTUAL_12B1 values as missing values
- if mgmt_fee < -50.00 then mgmt_fee = np.nan
- if per_cash < -150.00 then per_cash = np.nan
- if per_cash > 170.00 then per_cash = np.nan
- if per_com == 999.99 then per_com = 99.999;
- if per_com > 400 then per_com = np.nan

#### How do you pick these thresholds? 
- Understand the data!!
1. Consult the manual: http://www.crsp.com/files/MFDB_Guide.pdf 
2. Use your common sense
    - When you see a bunch of 99999 or -999999 it is probably a hard-coding for missing values
    - Check variables distribution
    - Manually check outliers

In [None]:
# Replace sentinel codes and implausible values with NaN, following the
# thresholds listed in the markdown cell above. Series.mask(cond) sets NaN
# where cond is True and is vectorized, unlike per-element list comprehensions.
print('Adjusting variables...')
# -99 is treated as a missing-value code for these three ratios.
for col in ('turn_ratio', 'exp_ratio', 'actual_12b1'):
    fund_summary[col] = fund_summary[col].mask(fund_summary[col] == -99)
# A zero 12b-1 fee before 1998 is treated as missing rather than a true zero.
zero_before_1998 = (fund_summary['actual_12b1'] == 0) & (fund_summary['caldt'] < dt.datetime(1998, 1, 1))
fund_summary.loc[zero_before_1998, 'actual_12b1'] = np.nan
fund_summary['actual_12b1'] = fund_summary['actual_12b1'].mask(fund_summary['actual_12b1'] < 0)
fund_summary['mgmt_fee'] = fund_summary['mgmt_fee'].mask(fund_summary['mgmt_fee'] < -50)
per_cash = fund_summary['per_cash']
fund_summary['per_cash'] = per_cash.mask((per_cash < -150) | (per_cash > 170))
# 999.99 is recoded to 99.999 first, so it survives the > 400 cut below.
per_com = fund_summary['per_com'].mask(fund_summary['per_com'] == 999.99, 99.999)
fund_summary['per_com'] = per_com.mask(per_com > 400)
print('Variables were adjusted!')
print('Number of observations having negative exp_ratio', (fund_summary['exp_ratio'] < 0).sum())
print('Number of observations having negative ACTUAL_12B1', (fund_summary['actual_12b1'] < 0).sum())
print('Number of observations having negative mgmt_fee', (fund_summary['mgmt_fee'] < 0).sum())

#### Does it make sense to have negative mgmt_fee?
- Instinctively you would probably say no BUT
- Consult the manual first: http://www.crsp.com/files/MFDB_Guide.pdf 

### Adding information about loads from separate file
Load = rear_load + front_load
Before doing that though, you need to:
1. Adjust for outliers : 
    - if REAR_LOAD = -99 then REAR_LOAD = np.nan 
    - if FRONT_LOAD = -99 then FRONT_LOAD = np.nan;
2. Adjust the begin and end dates to monthend (BEGDT, ENDDT)
3. Fill information forward at the monthly frequency
4. For each fund/month observation take the average of all available front_loads and rear_loads
    - Different share classes might have different loads

In [None]:
# Treat the -99 code as missing, then set every missing load (the recoded
# sentinels and the original gaps alike) to zero.
print('Setting -99 values to missing') 
for load_table, load_col in ((FRONT_LOAD, 'front_load'), (REAR_LOAD, 'rear_load')):
    without_sentinel = load_table[load_col].mask(load_table[load_col] == -99)
    load_table[load_col] = without_sentinel.fillna(0)
FRONT_LOAD['front_load'].head(5)

In [None]:
# Expand each FRONT_LOAD row (valid from begdt to enddt) into one row per
# month-end date inside that interval.
if printing:
    print('Expanding FRONT LOADS...')
    print('This will also take some time -> Bonus points is you can improve this?')
FRONT_LOAD['index'] = FRONT_LOAD.index
t0 = time.time()
frontload_expand_dict = {}
# Walk the two date columns once and key the result by the row's real index
# label. The previous `FRONT_LOAD['begdt'][i]` lookups were label-based, so
# they were only right for a default 0..n-1 index and cost a lookup per row.
for row_label, start, end in zip(FRONT_LOAD.index, FRONT_LOAD['begdt'], FRONT_LOAD['enddt']):
    # Months are numbered year*12 + (month - 1); the range covers begdt's
    # month through enddt's month inclusive.
    first_month = start.year * 12 + start.month - 1
    stop_month = end.year * 12 + end.month
    frontload_expand_dict[row_label] = [Util.last_day_of_month(dt.date(m // 12, m % 12 + 1, 1))
                                        for m in range(first_month, stop_month)]
# One row per original record holding its list of dates, then reshaped to
# long form: one (index, date) pair per row.
df = pd.DataFrame(pd.Series(frontload_expand_dict, name='date'), columns=['date'])
df2 = pd.DataFrame(pd.DataFrame(df.date.values.tolist(), index= df.index).stack()).reset_index()
df2 = df2.drop(['level_1'], axis = 1)
df2.columns = ['index', 'date']
# Outer merge keeps load records that produced no dates (date left missing).
FRONT_LOAD2 = pd.merge(FRONT_LOAD, df2, on = 'index', how = 'outer')
FRONT_LOAD2 = FRONT_LOAD2.drop(['index'], axis = 1)
del df
del df2
t1 = time.time()
print('Finished Expanding Front Loads in:', t1-t0, 's')

In [None]:
# Expand each REAR_LOAD row (valid from begdt to enddt) into one row per
# month-end date inside that interval.
print('Expanding REAR LOADS...')
print('Same as above, this will also take some time...')
REAR_LOAD['index'] = REAR_LOAD.index
t0 = time.time()
rearload_expand_dict = {}
# Walk the two date columns once and key the result by the row's real index
# label. The previous `REAR_LOAD['begdt'][i]` lookups were label-based, so
# they were only right for a default 0..n-1 index and cost a lookup per row.
for row_label, start, end in zip(REAR_LOAD.index, REAR_LOAD['begdt'], REAR_LOAD['enddt']):
    # Months are numbered year*12 + (month - 1); the range covers begdt's
    # month through enddt's month inclusive.
    first_month = start.year * 12 + start.month - 1
    stop_month = end.year * 12 + end.month
    rearload_expand_dict[row_label] = [Util.last_day_of_month(dt.date(m // 12, m % 12 + 1, 1))
                                       for m in range(first_month, stop_month)]
# One row per original record holding its list of dates, then reshaped to
# long form: one (index, date) pair per row.
df = pd.DataFrame(pd.Series(rearload_expand_dict, name='date'), columns=['date'])
df2 = pd.DataFrame(pd.DataFrame(df.date.values.tolist(), index= df.index).stack()).reset_index()
df2 = df2.drop(['level_1'], axis = 1)
df2.columns = ['index', 'date']
# Outer merge keeps load records that produced no dates (date left missing).
REAR_LOAD2 = pd.merge(REAR_LOAD, df2, on = 'index', how = 'outer')
REAR_LOAD2 = REAR_LOAD2.drop(['index'], axis = 1)
del df
del df2
t1 = time.time()
print('Finished Expanding Rear Loads in:', t1-t0, 's')

In [None]:
# Sort both tables for readability and compare their sizes before and after
# the monthly expansion.
print('Checking output...')
FRONT_LOAD = FRONT_LOAD.sort_values(by=['crsp_fundno', 'begdt', 'enddt'])
FRONT_LOAD2 = FRONT_LOAD2.sort_values(by=['crsp_fundno', 'date'])
for caption, table in (('Shape before expansion:', FRONT_LOAD),
                       ('Shape after expansion:', FRONT_LOAD2)):
    print(caption, table.shape)
print(FRONT_LOAD2.head(20))

In [None]:
# Average front and rear loads per fund and month end, combine them into one
# table, and compute the total load.
print('Taking the average of front and end loads by fund-monthend...')
print('This takes even longer to run!! Can you thing of an entirely different way of doing this?')
t0 = time.time()
load_keys = ['crsp_fundno', 'date']
# The built-in grouped mean is vectorized; `.apply(np.mean)` made one
# Python-level call per (fund, date) group.
agg_FRONT_LOAD = FRONT_LOAD2.groupby(load_keys)[['front_load']].mean().reset_index()
agg_REAR_LOAD = REAR_LOAD2.groupby(load_keys)[['rear_load']].mean().reset_index()
# Outer join: a fund-month may have only a front load or only a rear load.
agg_LOADs = pd.merge(agg_FRONT_LOAD, agg_REAR_LOAD, on=load_keys, how='outer')
# A side missing after the outer join counts as a zero load.
agg_LOADs['rear_load'] = agg_LOADs['rear_load'].fillna(0)
agg_LOADs['front_load'] = agg_LOADs['front_load'].fillna(0)
agg_LOADs['load'] = agg_LOADs['front_load'] + agg_LOADs['rear_load']
# Rename and convert the date key so it matches fund_summary's caldt.
agg_LOADs = agg_LOADs.rename(columns={'date': 'caldt'})
agg_LOADs['caldt'] = pd.to_datetime(agg_LOADs['caldt'])
t1 = time.time()
print('Averaging completed in', t1-t0, 's')
print(agg_LOADs.head(5))

#### Merging aggregate loads to full dataset

In [None]:
# Left-join the aggregated loads onto the fund summary by fund and date; the
# row count is printed before and after the join.
print('The shape of Fund_Summary before adding loads is:', len(fund_summary))
fund_summary = fund_summary.merge(agg_LOADs, how='left', on=['crsp_fundno', 'caldt'])
print('The shape of Fund_Summary after adding loads is:', len(fund_summary))

### Merge the WFICN identifier by CRSP_FUNDNO using the Mflinks_crsp dataset on WRDS

In [None]:
# Attach one WFICN per CRSP_FUNDNO: cast the identifier to int, keep the first
# link for any fund number that appears more than once, then left-join.
CRSP_FUNDNO_WFICN['wficn'] = CRSP_FUNDNO_WFICN['wficn'].apply(int)
ids = CRSP_FUNDNO_WFICN['crsp_fundno']
repeated_ids = ids[ids.duplicated()]
# NOTE(review): this counts link ROWS whose fund number is repeated, not the
# number of distinct fund numbers -- confirm which one the message intends.
rows_with_repeated_id = CRSP_FUNDNO_WFICN[ids.isin(repeated_ids)]
print('The number of CRSP_FUNDNOs having multiple WFICN is:', rows_with_repeated_id.shape[0])
print('Dropping duplicates...')
print('Can you do any better than just dropping duplicates??')
link_columns = ['crsp_fundno', 'wficn']
CRSP_FUNDNO_WFICN = CRSP_FUNDNO_WFICN[link_columns].drop_duplicates(
    subset=['crsp_fundno'], keep='first')
print('Duplicates dropped! Merging to fund summary...')
print('The shape of Fund_Summary before merging WFICN is: ', str(fund_summary.shape))
fund_summary = fund_summary.merge(CRSP_FUNDNO_WFICN, how='left', on='crsp_fundno')
print('The shape of Fund_Summary after merging WFICN is: ', str(fund_summary.shape))

### Dataset after applying generic filters

In [None]:
# Size and first six rows of the dataset after the generic filters.
print('The shape of fund_summary is:', fund_summary.shape)
fund_summary.iloc[:6]

## Subset the data to US Active Equity

### Subsetting according to the CRSP_OBJ_CD: 
- First character: Equity (E)
- Second character: Domestic (D)
- Third character: Non-Sector (so either C or Y, NOT S)
- Fourth character: all possible values

**Check the color-coded map at the end of the manual: http://www.crsp.com/files/MFDB_Guide.pdf**

In [None]:
# Keep rows whose CRSP objective code starts with 'EDC' or 'EDY'. Codes are
# stringified first so that missing values compare as ordinary text.
print('The number of observations in Fund_Summary before subsetting is:', fund_summary.shape[0])
wanted_prefixes = ('EDC', 'EDY')
has_wanted_prefix = [str(code).startswith(wanted_prefixes)
                     for code in fund_summary['crsp_obj_cd']]
fund_summary_US_Active = fund_summary.loc[has_wanted_prefix]
print('The number of observations in Fund_Summary after subsetting is:', fund_summary_US_Active.shape[0])

### Subsetting according to Lipper, Strategic insight and Wiesenberger codes as follows:
- indicator = 0;
- if LIPPER_OBJ_CD = 'SP' or LIPPER_OBJ_CD = 'MC' or LIPPER_OBJ_CD = 'SG' or LIPPER_OBJ_CD = 'MR' or 
     LIPPER_OBJ_CD = 'CA' or LIPPER_OBJ_CD = 'G' or LIPPER_OBJ_CD = 'GI' or LIPPER_OBJ_CD = 'LSE' or 
     LIPPER_OBJ_CD = 'EMN' or LIPPER_OBJ_CD = 'ABR' or LIPPER_OBJ_CD = 'DL' or LIPPER_OBJ_CD = 'EI' then indicator = 1 
- if LIPPER_OBJ_CD = "" and (SI_OBJ_CD = 'GMC' or SI_OBJ_CD = 'SCG' or SI_OBJ_CD = 'AGG' or SI_OBJ_CD = 'GRO' 
     or SI_OBJ_CD = 'GRI' or SI_OBJ_CD = 'ING' or SI_OBJ_CD = 'OPI') then indicator = 1

- if LIPPER_OBJ_CD = "" and SI_OBJ_CD = "" and (WBRGER_OBJ_CD = 'SGC' or WBRGER_OBJ_CD = 'G' or 
     WBRGER_OBJ_CD = 'LTG' or WBRGER_OBJ_CD = 'MCG' or WBRGER_OBJ_CD = 'GCI' or WBRGER_OBJ_CD = 'IEQ') 
     then indicator = 1
- if LIPPER_OBJ_CD = "" and SI_OBJ_CD = "" and WBRGER_OBJ_CD = "" and POLICY = 'CS' then indicator = 1
- if indicator = 1 then KEEP
- if CRSP_OBJ_CD = 'EDS' or CRSP_OBJ_CD = 'EDSU' or CRSP_OBJ_CD = 'EDYI' or CRSP_OBJ_CD = 'EDYS' then DELETE
- if WBRGER_OBJ_CD = 'BAL' or WBRGER_OBJ_CD = 'IFL' then DELETE
- if CRSP_OBJ_CD = "" and LIPPER_OBJ_CD = "" and SI_OBJ_CD = "" and WBRGER_OBJ_CD = "" and POLICY = "" 
     then DELETE

In [None]:
# Keep a row when the first available classification (Lipper, then Strategic
# Insight, then Wiesenberger, then policy) marks it as a wanted equity style;
# then drop Wiesenberger BAL/IFL rows and rows with no classification at all.
print('The number of observations in Fund_Summary before subsetting is:', fund_summary_US_Active.shape[0])
# NOTE(review): the markdown rule above also lists 'SP' among the Lipper codes
# to keep, but it is absent here -- confirm which one is intended.
LIPPER_OBJ_CD = ["MC", 'SG', 'MR', 'CA', 'G', 'GI', 'LSE', 'EMN', 'ABR', 'DL', 'EI']
SI_OBJ_CD = ['GMC', 'SCG', 'AGG', 'GRO', 'GRI', 'ING', 'OPI']
WBRGER_OBJ_CD = ['SGC', 'G', 'LTG', 'MCG', 'GCI', 'IEQ']
drop_list_WBRGER_OBJ_CD = ['BAL', 'IFL']

candidates = fund_summary_US_Active
lipper_missing = candidates['lipper_obj_cd'].isnull()
si_missing = candidates['si_obj_cd'].isnull()
wbrger_missing = candidates['wbrger_obj_cd'].isnull()

# Each fallback level applies only when every earlier code is missing.
keep_by_lipper = candidates['lipper_obj_cd'].isin(LIPPER_OBJ_CD)
keep_by_si = lipper_missing & candidates['si_obj_cd'].isin(SI_OBJ_CD)
keep_by_wbrger = lipper_missing & si_missing & candidates['wbrger_obj_cd'].isin(WBRGER_OBJ_CD)
keep_by_policy = lipper_missing & si_missing & wbrger_missing & candidates['policy'].isin(['CS'])
indicator = keep_by_lipper | keep_by_si | keep_by_wbrger | keep_by_policy

# All conditions are row-wise, so the keep and drop rules can be combined
# into a single selection.
is_balanced_or_ifl = candidates['wbrger_obj_cd'].isin(drop_list_WBRGER_OBJ_CD)
is_unclassified = lipper_missing & si_missing & wbrger_missing & candidates['policy'].isnull()
# NOTE(review): the markdown rule above also deletes CRSP_OBJ_CD 'EDYI' and
# 'EDYS'; no such exclusion is applied here -- confirm whether it should be.
fund_summary_US_Active = candidates.loc[indicator & ~is_balanced_or_ifl & ~is_unclassified]
print('The number of observations in Fund_Summary after subsetting is:', fund_summary_US_Active.shape[0])

### Excluding index funds: 
- funds with INDEX_FUND_FLAG = ‘D’;
- funds which contain any of the following in their names:
    - ‘Index’, ‘Ind’, ‘Idx’, ‘Indx’, ‘iShares’, ‘SPDR’, ‘HOLDRs’, ‘ETF’, ‘Exchange-Traded Fund’, ‘PowerShares’,‘StreetTRACKS’

In [None]:
# Drop observations flagged INDEX_FUND_FLAG == 'D'. Rows with a missing flag
# are kept, because NaN != 'D' evaluates to True.
# The message used to say "Keeping", the opposite of what the filter does.
print('Removing funds with INDEX_FUND_FLAG = D...')
print('The number of observations in Fund_Summary before subsetting is:', fund_summary_US_Active.shape[0])
not_flagged_d = fund_summary_US_Active['index_fund_flag'] != 'D'
fund_summary_US_Active = fund_summary_US_Active.loc[not_flagged_d]
print('The number of observations in Fund_Summary after subsetting is:', fund_summary_US_Active.shape[0])

In [None]:
# Drop every observation whose fund name contains, case-insensitively, any
# pattern from `eliminated_content`. Rows with a missing name are kept.
print('Removing funds with index related words in the name (list in settings)...')
print('The number of observations in Fund_Summary before subsetting is:', fund_summary_US_Active.shape[0])
lowercase_names = fund_summary_US_Active['fund_name'].str.lower()
name_is_index_like = pd.Series(False, index=fund_summary_US_Active.index)
for content in eliminated_content:
    # str.contains treats the pattern as a regular expression by default.
    name_is_index_like = name_is_index_like | lowercase_names.str.contains(content.lower(), na=False)
fund_summary_US_Active = fund_summary_US_Active.loc[~name_is_index_like]
print('The number of observations in Fund_Summary after subsetting is:', fund_summary_US_Active.shape[0])

### Keeping only open-ended funds: if ET_FLAG = 'F' or ET_FLAG = 'N' then DELETE:

In [None]:
print('The number of observations in Fund_Summary before subsetting is:', fund_summary_US_Active.shape[0])
fund_summary_US_Active = fund_summary_US_Active.loc[fund_summary_US_Active['et_flag'] != 'F']
fund_summary_US_Active = fund_summary_US_Active.loc[fund_summary_US_Active['et_flag'] != 'N']
print('The number of observations in Fund_Summary after subsetting is:', fund_summary_US_Active.shape[0])

### Eliminating observations with no fund_name, crsp_cl_grp and wficn...:

In [None]:
# Drop observations that have no fund_name AND no crsp_cl_grp AND no wficn
# (all three must be missing for a row to be removed).
print('The shape of Fund_Summary before eliminating observations with no crsp_fundno, fund_name, crsp_cl_grp & wficn is:')
# The "before" message announced a shape but never printed one.
print(fund_summary_US_Active.shape)
con7 = (fund_summary_US_Active['fund_name'].isnull()
        & fund_summary_US_Active['crsp_cl_grp'].isnull()
        & fund_summary_US_Active['wficn'].isnull())
fund_summary_US_Active = fund_summary_US_Active[~con7]
print('The shape of Fund_Summary after eliminating observations with no crsp_fundno, fund_name, crsp_cl_grp & wficn is:')
print(fund_summary_US_Active.shape)

### Eliminating Variable Annuity Underlying Funds

In [None]:
# Report the vau_fund frequencies, drop rows flagged 'Y', then discard the
# column itself.
print('The shape of Fund_Summary before eliminating Variable Annuity Underlying funds is:', fund_summary_US_Active.shape)
print('Before elimination, the frequency of Variable Annuity (Y) funds vs. the rest (N) is:')
print(fund_summary_US_Active['vau_fund'].value_counts())
not_variable_annuity = fund_summary_US_Active['vau_fund'] != 'Y'
fund_summary_US_Active = fund_summary_US_Active[not_variable_annuity].drop(['vau_fund'], axis=1)
print('The shape of Fund_Summary after eliminating Variable Annuity Underlying funds is:', fund_summary_US_Active.shape)

### Standardize Names: 
- Separate original fund_name to group_name + fund_name_short + share_class, and save the original fund_name as fund_name_long
- share_class: 
    - anything after the last ';' or '/' or ',' or ':' or '\'
    - anything after the word 'class'
    - some special case
- group_name: anything before the first ':'
- fund_name_short: the original fund_name apart from group_name and share_class

#### Separating Fund Names from Group Names and Share Classes

In [None]:
# Show one raw fund name (the row at position 100) as a running example.
print('An example:')
example_fund_name = fund_summary_US_Active['fund_name'].iloc[100]
print(example_fund_name)

##### Getting the sub-parts of the fund names...

In [None]:
# Keep the original name as fund_name_long and derive share_class,
# fund_name_short and group_name from it with the project's Util helpers.
# Only the column is renamed. The previous `index=str` argument also turned
# every row label into a string as a side effect, and `inplace=True` acted on
# a frame produced by earlier row filters.
fund_summary_US_Active = fund_summary_US_Active.rename(columns={"fund_name": "fund_name_long"})
long_names = fund_summary_US_Active['fund_name_long']
fund_summary_US_Active['share_class'] = long_names.apply(Util.get_share_class)
fund_summary_US_Active['fund_name_short'] = long_names.apply(Util.get_short_fundname)
# group_name is extracted from the intermediate short name BEFORE that short
# name is updated on the next line; the order of these two steps matters.
fund_summary_US_Active['group_name'] = fund_summary_US_Active['fund_name_short'].apply(Util.get_group_name)
fund_summary_US_Active['fund_name_short'] = fund_summary_US_Active['fund_name_short'].apply(Util.update_short_fundname)
print('An example continued...')
fund_summary_US_Active[['group_name', 'share_class', 'fund_name_short', 'fund_name_long']].iloc[100]

#### Handling naming exceptions...

In [None]:
# Apply the project's name corrections, then hard-code one rename for
# "Voyageur Mutual Funds III" inside a specific date window.
fund_summary_US_Active['group_name'] = fund_summary_US_Active['group_name'].apply(
    Util.correct_name, exceptions=exceptions, retNaN=False)

is_voyageur_iii = fund_summary_US_Active['group_name'] == "Voyageur Mutual Funds III"
after_window_start = fund_summary_US_Active['caldt'] > dt.datetime(2002, 6, 25)
within_window_end = fund_summary_US_Active['caldt'] <= dt.datetime(2009, 1, 8)
rows_to_rename = is_voyageur_iii & after_window_start & within_window_end
fund_summary_US_Active.loc[rows_to_rename, 'group_name'] = "voyageur mutual funds iii mn"
print('Corrections completed')

### Use Fund_name's group part to fill in crsp_cl_grp
#### Check how many observations don’t have CRSP_CL_GRP

In [None]:
# Report how many observations lack crsp_cl_grp: overall, up to August 1998,
# and in combination with a missing wficn or a missing group_name.
Util.isnull_chk(fund_summary_US_Active, 'crsp_cl_grp')
print('-----.....-----.....')
print('Before August 13 1998 - first date of validity of crsp_cl_grp:')
# caldt was moved to month end earlier, so the cut-off is 31 August 1998.
a = fund_summary_US_Active[fund_summary_US_Active['caldt'] <= dt.datetime(1998, 8, 31)]
Util.isnull_chk(a, 'crsp_cl_grp')
# Both identifiers must be missing; the previous filter tested wficn twice
# and never looked at crsp_cl_grp.
b = a[a['crsp_cl_grp'].isnull() & a['wficn'].isnull()]
print('Of which the ones that that don\'t have both crsp_cl_grp and wficn are:', b.shape[0])
no_group_id_or_name = (fund_summary_US_Active['crsp_cl_grp'].isnull()
                       & fund_summary_US_Active['group_name'].isnull())
print('The number of observations not having crsp_cl_grp & group_name is:',
      fund_summary_US_Active.loc[no_group_id_or_name].shape[0])

### Eliminate observations that don't have a caldt

In [None]:
# Keep only observations that carry a calendar date.
print('The shape fund_summary_US_Active before eliminating observations with no caldt is:', fund_summary_US_Active.shape)
has_caldt = fund_summary_US_Active['caldt'].notnull()
fund_summary_US_Active = fund_summary_US_Active[has_caldt]
print('The shape fund_summary_US_Active after eliminating observations with no caldt is:', fund_summary_US_Active.shape)

### Check uniqueness of fund_name & group_name for grp/month

In [None]:
# For each name column, count the (crsp_cl_grp, caldt) groups that carry more
# than one distinct name and those with a missing name. When conflicts exist,
# the offending rows are collected in `multi_fundname` for manual inspection.
print("Checking uniqness of fund_name & group_name for grp/month...")
group_keys = ['crsp_cl_grp', 'caldt']
inspect_columns = ['crsp_fundno',  'crsp_cl_grp', 'caldt', 'fund_name_short', 'fund_name_long']
for col in ['fund_name_short', 'group_name']:
    non_unique = fund_summary_US_Active.groupby(group_keys)[col].nunique()
    non_existing = fund_summary_US_Active[fund_summary_US_Active[col].isnull()].drop_duplicates(
        subset=group_keys)
    # Computed once with a vectorized sum and reused below.
    n_conflicting = (non_unique > 1).sum()
    print('Number of grp/month having multiple', col, ': ', n_conflicting)
    print('Number of grp/month not having', col, ': ', len(non_existing))
    if n_conflicting > 0:
        grp_values = fund_summary_US_Active['crsp_cl_grp']
        date_values = fund_summary_US_Active['caldt']
        # Gather the slices first and concatenate once: growing a DataFrame
        # with pd.concat inside the loop re-copies all earlier rows each time.
        conflicting_slices = [
            fund_summary_US_Active.loc[(grp_values == grp) & (date_values == date), inspect_columns]
            for grp, date in non_unique[non_unique > 1].index
        ]
        multi_fundname = pd.concat(conflicting_slices)

### Dataset summary: Providing a description of the final dataset


In [None]:
# Number of distinct values per identifier in the final dataset. A missing
# value counts as one distinct value here, because unique() keeps NaN.
id_labels_and_columns = (
    ('crsp_fundno', 'crsp_fundno'),
    ('crsp_portnos', 'crsp_portno'),
    ('crsp_cl_grp', 'crsp_cl_grp'),
    ('wficn', 'wficn'),
)
for label, id_column in id_labels_and_columns:
    distinct_values = fund_summary_US_Active[id_column].unique()
    print('The number of ' + label + ' present is:', len(distinct_values))

#### Frequency of categorical variables

In [None]:
# Frequency tables of the objective codes, using one row per crsp_cl_grp (the
# first row of each group) so funds with many observations are not overcounted.
categorical_variables = ['crsp_obj_cd', 'lipper_obj_cd']
first_row_per_group = fund_summary_US_Active.drop_duplicates('crsp_cl_grp')
for col in categorical_variables:
    print('Frequency of', col, 'in fund_summary_US_Active:')
    code_counts = first_row_per_group[col].value_counts()
    print(code_counts)
## Save Dataset

In [None]:
# Write the cleaned US active equity dataset to CSV without the row index.
print("Saving the cleaned Active Equity dataset to file...")
output_file = outputPath + fund_summary_ActiveEq_name
fund_summary_US_Active.to_csv(output_file, index=False)
print('File saved!')