In [1]:
import pandas as pd
import math
import time
import numpy as np
from tqdm import tqdm 
import xlrd
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import datetime as dt
from scipy.stats.mstats import winsorize

import os

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; the cleaning cells assign columns on filtered frames.
pd.options.mode.chained_assignment = None  # default='warn'

# Project root directory. Set the JAPAN_QE_PATH environment variable to run the
# notebook on another machine; otherwise the original author's location is used.
PATH = os.environ.get("JAPAN_QE_PATH") or "C:/Users/jackl/OneDrive/Documents/finance_research/japan_qe/"

# Later cells build file names with `PATH + 'sub/dir/file'`, so the root must
# end with a path separator.
if not PATH.endswith(("/", "\\")):
    PATH += "/"

In [2]:
def clean_dates(date):
    """Convert one raw date cell into a pandas timestamp (or a missing value).

    Parameters
    ----------
    date : any
        Raw cell value. It is converted with ``str()`` first, so ints, floats,
        strings, ``None`` and NaN are all accepted.

    Returns
    -------
    pandas.Timestamp, pandas.NaT or float('nan')
        * ``nan`` when the cell is a missing marker: ``'nan'``, ``'None'`` or
          a zero serial (``'0'``, ``'00'``, ``'0.0'`` ...).
        * For an all-digit string, the Excel serial date converted with
          ``xlrd.xldate_as_tuple(serial, 0)`` (datemode 0).
        * Otherwise whatever ``pd.to_datetime`` makes of the string.

    Notes
    -----
    Errors raised by ``xlrd``, ``datetime`` or ``pandas`` for values they
    cannot convert are not caught here.
    """
    text = str(date)

    # A serial that was stored as a float stringifies as e.g. '41275.0', which
    # str.isnumeric() rejects. Strip an all-zero fractional part so such cells
    # are treated exactly like the integer serial they represent.
    whole, dot, fraction = text.partition('.')
    if dot and whole.isnumeric() and fraction and not fraction.strip('0'):
        text = whole

    if text in ('nan', 'None'):
        return math.nan

    if text.isnumeric():
        serial = int(text)
        # Serial 0 is the "no date" marker; xlrd would hand back an all-zero
        # tuple that datetime cannot represent.
        if serial == 0:
            return math.nan
        return pd.to_datetime(dt.datetime(*xlrd.xldate_as_tuple(serial, 0)))

    return pd.to_datetime(text)

# Old Batch

In [4]:
# The raw files live on Dropbox (account: jackliextradropbox@gmail.com).
# Note the name change (c1, c2, c3, c4) -- presumably the original names of
# these four parts; verify against the Dropbox folder.
files = ['ownership_pt{}.pkl'.format(part) for part in range(1, 5)]

In [5]:
def _clean_ownership_frame(raw):
    """Return a cleaned copy of one raw ownership export.

    Steps, in order:
      1. use the first row as column labels;
      2. stringify every cell and turn the textual missing markers
         ('None', '(Invalid Identifier)', 'nan', 'NaN') into NaN;
      3. drop rows whose 'ticker' cell contains the text 'ticker'
         (header rows embedded in the data);
      4. cast 'holdings' and 'pct' to float;
      5. drop rows where 'holdings' and 'pct' are both NaN or both 0;
      6. parse 'date' (via clean_dates) and 'asof_date' into datetimes.

    The input frame is not modified. It must contain, after step 1, the
    columns 'ticker', 'holdings', 'pct', 'date' and 'asof_date'.
    """
    frame = raw.astype(str)
    frame.columns = raw.iloc[0]
    frame = frame.replace(['None', '(Invalid Identifier)', 'nan', 'NaN'], math.nan)

    # na=False: a ticker that became NaN above is not a header row. Without it
    # str.contains yields NaN for those rows and the `~` below raises.
    is_header = frame['ticker'].str.contains('ticker', na=False)
    frame = frame[~is_header]

    frame['holdings'] = frame['holdings'].astype(float)
    frame['pct'] = frame['pct'].astype(float)

    both_missing = frame['holdings'].isna() & frame['pct'].isna()
    both_zero = (frame['holdings'] == 0) & (frame['pct'] == 0)
    frame = frame[~(both_missing | both_zero)]

    # Parse each distinct raw date once and map the result back, instead of
    # calling clean_dates row by row. NaN cells have no dictionary entry and
    # therefore stay missing, which is also what clean_dates returns for them.
    parsed_dates = {value: clean_dates(value) for value in frame['date'].dropna().unique()}
    frame['date'] = pd.to_datetime(frame['date'].map(parsed_dates))
    frame['asof_date'] = pd.to_datetime(frame['asof_date'])
    return frame


# Load and clean every part of the old batch; the cleaned frames are
# collected in file_list for concatenation in a later cell.
file_list = []
for file in tqdm(files):
    # NOTE: read_pickle can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(PATH+'raw_data/capital_iq/'+file)
    print("Finish load:", file)

    df = _clean_ownership_frame(df)

    print('Done:', file)
    file_list.append(df)

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

Finish load: ownership_pt1.pkl


 25%|████████████████████▊                                                              | 1/4 [06:19<18:58, 379.56s/it]

Done: ownership_pt1.pkl
Finish load: ownership_pt2.pkl


 50%|█████████████████████████████████████████▌                                         | 2/4 [13:08<13:13, 396.70s/it]

Done: ownership_pt2.pkl
Finish load: ownership_pt3.pkl


 75%|██████████████████████████████████████████████████████████████▎                    | 3/4 [20:30<06:57, 417.72s/it]

Done: ownership_pt3.pkl
Finish load: ownership_pt4.pkl


100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [26:47<00:00, 401.81s/it]

Done: ownership_pt4.pkl





In [10]:
# Stack the cleaned parts of the old batch into a single frame. Each part
# keeps its own row labels, so the combined index may contain duplicates.
df_merged = pd.concat(objs=file_list, axis=0)

# Missed Companies

In [4]:
# Load the raw export for the missed companies.
# NOTE: read_pickle can execute arbitrary code; only load trusted files.
missed_ownership_file = PATH + 'raw_data/capital_iq/missed_ownership.pkl'
df = pd.read_pickle(missed_ownership_file)

In [7]:
# Clean the missed-companies export: promote the header row, normalise
# missing markers, drop embedded header rows and empty observations, and
# parse the date columns.

# Use the first row as column labels. Guarded so that re-running this cell
# (after the header rows have already been removed) does not promote a data
# row to column labels.
if 'ticker' not in df.columns:
    df.columns = df.iloc[0]

# Stringify everything, then turn the textual missing markers into NaN.
df = df.astype(str)
df = df.replace(['None', '(Invalid Identifier)', 'nan', 'NaN'], math.nan)

# Remove header rows embedded in the data. na=False: a ticker that became NaN
# above is not a header row; without it str.contains yields NaN for those rows
# and the `~` raises.
df = df[~df['ticker'].str.contains('ticker', na=False)]

# Make sure the holdings data is float.
df['holdings'] = df['holdings'].astype(float)
df['pct'] = df['pct'].astype(float)

# Throw away observations where holdings and pct are both NaN or both 0.
both_missing = df['holdings'].isna() & df['pct'].isna()
both_zero = (df['holdings'] == 0) & (df['pct'] == 0)
df = df[~(both_missing | both_zero)]

# Parse each distinct raw date once and map the result back, instead of
# calling clean_dates row by row. NaN cells have no dictionary entry and
# therefore stay missing, which is also what clean_dates returns for them.
parsed_dates = {value: clean_dates(value) for value in df['date'].dropna().unique()}
df['date'] = pd.to_datetime(df['date'].map(parsed_dates))
df['asof_date'] = pd.to_datetime(df['asof_date'])

In [13]:
# Append the missed-companies rows to the old batch. Row labels are kept
# as they are, so the combined index may contain duplicates.
df_full = pd.concat(objs=[df_merged, df], axis=0)

In [22]:
# Drop observations that report zero holdings and have no percentage at all.
zero_holdings_no_pct = df_full['holdings'].eq(0) & df_full['pct'].isna()
df_full = df_full.loc[~zero_holdings_no_pct]

In [None]:
# Save the combined, filtered ownership frame as a checkpoint.
checkpoint_file = PATH + 'checkpoint_data/ownership_jan03.pkl'
df_full.to_pickle(checkpoint_file)