In [1]:
import pandas as pd
import numpy as np
import datetime
pd.set_option('display.max_rows', 100)

In [2]:
funds_unfiltered = pd.read_csv('data/form13f.csv',sep=";")
 # preserve the original dataframe for now 

In [57]:
funds=funds_unfiltered.copy()

In [59]:
print("原始文件基金数量",len(funds["iCIK"].unique()))
# Convert date to Datetime
funds['iFILING_DATE']= pd.to_datetime(funds['iFILING_DATE'])
funds['iPERIOD_END']= pd.to_datetime(funds['iPERIOD_END'])


# Data Prcoessing requirement: Page46

funds = funds[funds['iMARKET_VALUE']!=0]
funds = funds[funds['iQTY'] != 0]
funds = funds.drop_duplicates()
funds = funds[(funds['iFILING_DATE'] >= datetime.datetime(2013, 6, 30)) & (funds['iPERIOD_END'] <= datetime.datetime(2018, 6, 30))]

print("筛完46页Data Processing还是剩基金数量:",len(funds["iCIK"].unique()))



# The fund must complete filings whintin 46d
funds['FILING_INTERVAL'] = funds['iFILING_DATE']-funds['iPERIOD_END']
funds = funds[funds['FILING_INTERVAL']<=datetime.timedelta(days = 45)] 

print("筛完45days 还剩下的基金数量:",len(funds["iCIK"].unique()))


#Fund Selection: Exhibit 3
#  One filing point for each quarter only, no amendament allowed
funds = funds[funds['iAMEND']== 0]
funds = funds[funds['iRESTATEMENT']== 0]
funds = funds[funds['iTYPE']== 0]
funds = funds.drop('iAMEND', axis=1)
funds = funds.drop('iRESTATEMENT', axis=1)
funds = funds.drop('iTYPE', axis=1)

print("筛完修改过的还剩下的基金数量:",len(funds["iCIK"].unique()))



#  The fund must have between 20-200 holdings at least one quarter
temp_df=funds.groupby(["iCIK","iPERIOD_END"]).count()
funds_names_20_200=temp_df[(temp_df["iCUSIP"]>=20)&(temp_df["iCUSIP"]<=200)].index.get_level_values(0).unique()
funds=funds[funds["iCIK"].isin(funds_names_20_200)]

print("筛完20到200只股票范围的还剩下的基金数量:",len(funds["iCIK"].unique()))

#  The fund must be valued between 100MM to 500MM

temp_df=funds.groupby(["iCIK","iPERIOD_END"]).sum()
funds_names_size=temp_df[(temp_df["iMARKET_VALUE"]>=100000000)&(temp_df["iMARKET_VALUE"]<=500000000)].index.get_level_values(0).unique()
funds=funds[funds["iCIK"].isin(funds_names_size)]
print("筛完基金规模还剩下的基金数量:",len(funds["iCIK"].unique()))

#  The fund must have lasted for a year
funds["year"]=funds["iPERIOD_END"].dt.year
temp_df=funds.groupby(["iCIK","year"])["iPERIOD_END"].apply(lambda quarters: quarters.nunique()).to_frame()
funds_names_4Qs = temp_df[temp_df["iPERIOD_END"]>=4].index.get_level_values(0).unique()
funds=funds[funds["iCIK"].isin(funds_names_4Qs)]
print("筛完“至少持续了4个季度”剩下的基金数量:",len(funds["iCIK"].unique()))



原始文件基金数量 9540
筛完46页Data Processing还是剩基金数量: 6034
筛完45days 还剩下的基金数量: 5938
筛完修改过的还剩下的基金数量: 5932
筛完20到200只股票范围的还剩下的基金数量: 3950
筛完基金规模还剩下的基金数量: 2944
筛完“至少持续了4个季度”剩下的基金数量: 1989


In [66]:
prices = pd.read_csv('data/prices.csv',sep=";")

#reducing "prices" dataframe size by selecting only revelant stocks
prices=prices[prices["pSP_CUSIP"].isin(funds["iCUSIP"])]
prices['pSP_DATE']= pd.to_datetime(prices['pSP_DATE'])
#reducing "prices" dataframe size by selecting only revelant dates
prices = prices[(prices['pSP_DATE'] >= datetime.datetime(2013, 6, 30)) & (prices['pSP_DATE'] <= datetime.datetime(2018, 6, 30))]
#Sort the row according to: Funds, Stock, Time
funds.sort_values(by=["iCIK","iCUSIP","iPERIOD_END"],ascending=True,inplace=True)

Feature X20, X21

In [67]:
look_up = prices[["pSP_CUSIP","pSP_DATE","pSP_CLOSE"]]
funds=funds.merge(look_up, left_on=["iCUSIP","iPERIOD_END"], right_on=["pSP_CUSIP","pSP_DATE"])
funds.drop(columns=["pSP_CUSIP","pSP_DATE"],inplace=True)
funds["mar_Cap"] = funds["iQTY"] * funds["pSP_CLOSE"]


#Change in Market Capital between quarters
funds["mar_Cap_change"]=funds.groupby(["iCIK","iCUSIP"])["mar_Cap"].shift()
funds["mar_Cap_change"] = funds["mar_Cap"]- funds["mar_Cap_change"]

#Change in Quantity between quarters
funds["quantity_change"]=funds.groupby(["iCIK","iCUSIP"])["iQTY"].shift()
funds["quantity_change"] = funds["iQTY"]- funds["quantity_change"]


In [69]:
#Test the result

funds[(funds["iCUSIP"]=="009158106") & (funds["iCIK"]==1388168)].sort_values(by="iPERIOD_END")

Unnamed: 0,iRECORD_ID,iCIK,iCUSIP,iPERIOD_END,iFILING_DATE,iQTY,iMARKET_VALUE,iLONG_FRACTION,FILING_INTERVAL,year,pSP_CLOSE,mar_Cap,mar_Cap_change,quantity_change
93,40550748,1388168,9158106,2013-09-30,2013-10-31,3000.0,320000.0,0.002398,31 days,2013,99.127,297381.0,,
223,49104273,1388168,9158106,2013-12-31,2014-01-24,3000.0,335000.0,0.002345,24 days,2013,103.973,311919.0,14538.0,0.0
357,49165434,1388168,9158106,2014-03-31,2014-05-07,3000.0,357000.0,0.001924,37 days,2014,110.726,332178.0,20259.0,0.0
658,47596831,1388168,9158106,2014-09-30,2014-11-12,3020.0,393000.0,0.001457,43 days,2014,121.088,365685.76,33507.76,20.0
818,50946413,1388168,9158106,2014-12-31,2015-01-30,3020.0,436000.0,0.00118,30 days,2014,134.157,405154.14,39468.38,0.0
982,52013392,1388168,9158106,2015-03-31,2015-04-20,3020.0,457000.0,0.001228,20 days,2015,140.714,424956.28,19802.14,0.0
1157,54020721,1388168,9158106,2015-06-30,2015-08-11,2220.0,329000.0,0.001284,42 days,2015,127.274,282548.28,-142408.0,-800.0
1327,54457163,1388168,9158106,2015-09-30,2015-11-05,2220.0,283000.0,0.001218,36 days,2015,118.67,263447.4,-19100.88,0.0
1470,55354162,1388168,9158106,2015-12-31,2016-01-28,2220.0,262000.0,0.001673,28 days,2015,121.023,268671.06,5223.66,0.0
1614,56256235,1388168,9158106,2016-03-31,2016-04-11,2220.0,320000.0,0.001339,11 days,2016,133.989,297455.58,28784.52,0.0


Feature X16,X17,X18

In [12]:
def annualize_rets(r):
    compounded_growth = (1+r).prod()
    n_periods = r.shape[0]
    return (compounded_growth)**(360/n_periods)-1  # Assume one year has 360 days

In [8]:
# get returns from prices
prices["return"] = prices.groupby("pSP_CUSIP")["pSP_CLOSE"].pct_change()

In [23]:
# Features 16,17,and 18 (slow)
prices["30D"]=prices.groupby(["pSP_CUSIP"],as_index= False)["return"].rolling(window=30).aggregate(annualize_rets)["return"]

In [30]:
prices["60D"]=prices.groupby(["pSP_CUSIP"],as_index= False)["return"].rolling(window=60).aggregate(annualize_rets)["return"]

In [31]:
prices["90D"]=prices.groupby(["pSP_CUSIP"],as_index= False)["return"].rolling(window=90).aggregate(annualize_rets)["return"]

In [38]:
#Test the result
prices[prices["pSP_CUSIP"]=="000307108"].head(100)

Unnamed: 0,pSP_CUSIP,pSP_EXCHANGE,pSP_TICKER,pSP_DATE,pSP_VOLUME,pSP_OPEN,pSP_HIGH,pSP_LOW,pSP_CLOSE,return,30D,60D,90D
456236,307108,N,AAC,2015-07-17,282904,43.49,43.8255,40.6,41.84,,,,
995963,307108,N,AAC,2015-04-16,93566,31.62,32.29,31.18,31.93,-0.236855,,,
1096725,307108,N,AAC,2015-04-14,149948,31.27,31.4,30.78,31.11,-0.025681,,,
1889316,307108,N,AAC,2015-04-17,96507,31.65,31.87,31.5,31.62,0.016393,,,
2113033,307108,N,AAC,2015-02-02,63592,25.71,25.71,25.0,25.59,-0.190702,,,
6690249,307108,N,AAC,2014-12-16,88336,29.77,30.57,29.06,29.4,0.148886,,,
7071264,307108,N,AAC,2015-04-15,252585,31.0,32.79,30.94,32.02,0.089116,,,
9589671,307108,N,AAC,2015-04-13,106714,32.27,32.27,30.91,31.16,-0.026858,,,
10603225,307108,N,AAC,2014-10-13,52514,18.92,19.1,18.6401,19.0,-0.390244,,,
12942546,307108,N,AAC,2014-10-16,61787,19.13,19.85,19.0,19.4,0.021053,,,
