# Creating Timeseries from ownership data

### This notebook joins the monthly ownership data from 2022-2024 into per-code time series with 36 monthly timestamps

In [38]:
from pathlib import Path
import sys

# Project root is taken to be two levels above the current working directory,
# so this only resolves correctly when the kernel is started from the
# notebook's own folder.
project_dir = Path().resolve().parent.parent

# Make the project root importable so the `config` module can be found.
sys.path.append(str(project_dir))
from config import PATHS
import pandas as pd
import numpy as np

In [39]:
# Directory that is searched for the monthly ownership *.txt files; the
# location comes from the project-level config (PATHS).
ownership_path = PATHS.ownership_data

In [40]:
# Read every pipe-delimited *.txt file in the ownership directory into its own
# DataFrame. Keys have the form "data_<a>_<b>", where <a> and <b> are the last
# two "-"-separated pieces of the file stem (presumably year and month --
# confirm against the actual file names).
data_dict = {}
# glob() yields paths in arbitrary, filesystem-dependent order; sorting makes
# the load order (and therefore the concatenation order) reproducible.
for file in sorted(ownership_path.glob("*.txt")):
    stem_parts = file.stem.split("-")
    variable_name = f"data_{stem_parts[-2]}_{stem_parts[-1]}"

    # Two files that map to the same key would silently overwrite each other
    # and drop a month from the series, so stop instead of losing data.
    if variable_name in data_dict:
        raise ValueError(
            f"Duplicate key {variable_name!r} derived from file {file.name}"
        )

    df = pd.read_csv(file, sep="|")

    data_dict[variable_name] = df

# Fail here with a clear message instead of later inside pd.concat.
if not data_dict:
    raise FileNotFoundError(f"No *.txt ownership files found in {ownership_path}")

In [41]:
# Stack all monthly frames into one long table, parse "Date", then order the
# rows by security code and date.
# The date strings look like "31-Jan-2022"; they must be parsed BEFORE sorting,
# otherwise the sort is lexicographic on the text (e.g. "25-Feb-2022" would
# come before "31-Jan-2022") rather than chronological.
combined_data = (
    pd.concat(data_dict.values())
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format="%d-%b-%Y"))
    .sort_values(by=["Code", "Date"])
)

In [42]:
# Split the combined table into one DataFrame per value of "Code".
# Iterating a groupby yields (key, sub-frame) pairs, which dict() consumes directly.
grouped_data = dict(iter(combined_data.groupby("Code")))



In [43]:
# Order each code's rows by date and give every series a fresh 0..n-1 index.
grouped_data = {
    code: frame.sort_values(by="Date").reset_index(drop=True)
    for code, frame in grouped_data.items()
}

In [44]:
# Number of distinct codes, i.e. how many separate time series were built.
n_series = len(grouped_data)
print(n_series)

5068


In [46]:
# Spot-check one series: first rows of the frame stored under the code "BBCA".
grouped_data["BBCA"].head()

Unnamed: 0,Date,Code,Type,Sec. Num,Price,Local IS,Local CP,Local PF,Local IB,Local ID,Local MF,Local SC,Local FD,Local OT,Total,Foreign IS,Foreign CP,Foreign PF,Foreign IB,Foreign ID,Foreign MF,Foreign SC,Foreign FD,Foreign OT,Total.1
0,2022-01-31,BBCA,EQUITY,123275000000.0,7625,2059560295,504944840,372760450,20276200,4134785105,1717340033,54803506,58818900,1775515,8925064844,626214998,2340007472,6358245202,3805632439,420481970,18477561518,1392473728,699503832,9858510117,43978631276
1,2022-02-25,BBCA,EQUITY,123275000000.0,8050,2047160895,503959140,366998050,37051000,4098164481,1555544447,53973618,57807800,1775515,8722434946,628510990,2328850243,6314962239,3791686233,420469270,18564008130,1408667529,699727532,10024379008,44181261174
2,2022-03-31,BBCA,EQUITY,123275000000.0,7975,1907679295,508526740,363391350,821000,4311821636,1489359751,64951166,56593800,2012515,8705157253,632643828,2258195599,6194151389,3839129755,420395470,18699137384,1350477926,696297632,10108109884,44198538867
3,2022-04-28,BBCA,EQUITY,123275000000.0,8125,1671213795,514286340,350131350,820000,4355117784,1372563289,67675549,56375400,1815015,8389998522,644886119,2228921005,6161211372,3795578622,420946070,19074973994,1348503690,696328132,10142348594,44513697598
4,2022-05-31,BBCA,EQUITY,123275000000.0,7750,1694796895,525070740,371802250,820000,4724008145,1458293436,75467095,52677600,2176815,8905112976,632161867,2213974494,6143623577,3840400876,321541770,18498224124,1326351437,690333132,10331971867,43998583144


In [47]:
# Optional export step (currently disabled): when uncommented, the lines below
# write one CSV per code, named "<code>_time_series.csv", under the
# "ownership-data-timeseries" folder of PATHS.outputs.
# NOTE(review): to_csv writes the row index as an extra column by default and
# does not create the target directory -- confirm both before re-enabling.
#output_directory = PATHS.outputs/"ownership-data-timeseries"
#for code, df in grouped_data.items():
#    df.to_csv(f"{output_directory}/{code}_time_series.csv")