# Revise Turtle Map

In [2]:
import pandas as pd
import re

In [3]:
df = pd.read_csv('core/turtle_map.csv')
df

Unnamed: 0,regulatory,ttl_file
0,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...
1,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
2,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
3,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
4,Kepmen_Hut_2002_8171,new_turtle_filesbn2008bn23-2008.ttl
...,...,...
24968,UU_2019_4,new_turtle_filesln2019uu4-2019bt.ttl
24969,UU_2019_5,new_turtle_filesln2019uu5-2019bt.ttl
24970,UU_2019_7,new_turtle_filesln2019uu7-2019bt.ttl
24971,UU_2019_8,new_turtle_filesln2019uu8-2019bt.ttl


In [11]:
filtered_df = df[~df['ttl_file'].str.contains('ningg')]
filtered_df

Unnamed: 0,regulatory,ttl_file
0,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...
1,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
2,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
3,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
4,Kepmen_Hut_2002_8171,new_turtle_filesbn2008bn23-2008.ttl
...,...,...
24968,UU_2019_4,new_turtle_filesln2019uu4-2019bt.ttl
24969,UU_2019_5,new_turtle_filesln2019uu5-2019bt.ttl
24970,UU_2019_7,new_turtle_filesln2019uu7-2019bt.ttl
24971,UU_2019_8,new_turtle_filesln2019uu8-2019bt.ttl


In [4]:
def replace_strings_with_regex(string):
    pattern = r'(?P<root>new_turtle_files)(?P<prefix>bn|lain-lain|ln|perda|putusan)(?P<year>\d{4})'
    modified_string = re.sub(pattern, r'\g<root>/\g<prefix>/\g<year>/', string)
    return modified_string

In [5]:
modified_df = filtered_df.copy()
modified_df['ttl_file'] = modified_df['ttl_file'].apply(lambda x: replace_strings_with_regex(x))
modified_df

Unnamed: 0,regulatory,ttl_file
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl
...,...,...
24968,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl
24969,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl
24970,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl
24971,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl


In [6]:
modified_df.to_csv("core/turtle_map_fin.csv", index=False)

# Ingest Turtle File Into Regulatory Map

In [7]:
df_1 = pd.read_csv('core/regulatory_map.csv')
df_2 = pd.read_csv('core/turtle_map_fin.csv')

df_1['a'] = df_1['regulatory'].str.lower()
df_2['a'] = df_2['regulatory'].str.lower()
df_2['file_ttl'] = df_2['ttl_file']

df_1 = df_1.drop('regulatory', axis=1)
df_2 = df_2.drop('ttl_file', axis=1)

merged_df = pd.merge(df_2, df_1, on='a', how='left')
merged_df = merged_df.drop('a', axis=1)

merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
25449,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
25450,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
25451,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
25452,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [13]:
with open('core/error.txt', 'r') as file:
    error_files = file.read().splitlines()
    
error_files

['new_parsed_files/bn/2015/bn1947-2015.json',
 'new_parsed_files/bn/2015/bn1309-2015.json',
 'new_parsed_files/bn/2015/bn498-2015.json',
 'new_parsed_files/ln/2018/ps140-2018.json',
 'new_parsed_files/ln/1960/pp0271960.json',
 'new_parsed_files/ln/1970/pp0061970.json',
 'new_parsed_files/ln/2016/pbi18-6-2016bt.json',
 'new_parsed_files/bn/2010/bn361-2010.json',
 'new_parsed_files/bn/2011/bn808-2011.json',
 'new_parsed_files/bn/2012/bn470-2012.json',
 'new_parsed_files/bn/2012/bn623-2012.json',
 'new_parsed_files/bn/2012/bn471-2012.json',
 'new_parsed_files/perda/2014/perda_kabupaten_karawang_nomor_12_tahun_2014_11e586907aaaabae811c313031373539.json',
 'new_parsed_files/perda/2013/perda_kabupaten_rokan_hulu_nomor_2_tahun_2013_11e568b63946532c894d313033323335.json',
 'new_parsed_files/perda/2012/perda_kota_bandung_nomor_10_tahun_2012_11e57639e0be146a8fe4313531373436.json',
 'new_parsed_files/bn/2015/bn1120-2015.json',
 'new_parsed_files/bn/2011/bn388-2011.json',
 'new_parsed_files/bn/201

In [24]:
duplicates = merged_df['regulatory'].value_counts()[lambda x: x > 1].index
filtered_merged_df = merged_df[merged_df['regulatory'].isin(duplicates)]
filtered_merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
10,PB_MA_Menhumham_Menkes_Mensos_Jagung_Kapolri_B...,new_turtle_files/bn/2014/bn465-2014.ttl,turtle_files/ln/2014/ps103-2014.ttl,new_1_text_files/bn/2014/pb_bn465-2014.txt,new_parsed_files/bn/2014/pb_bn465-2014.json
11,PB_MA_Menhumham_Menkes_Mensos_Jagung_Kapolri_B...,new_turtle_files/bn/2014/bn465-2014.ttl,turtle_files/ln/2014/ps103-2014.ttl,new_1_text_files/bn/2014/bn465-2014.txt,new_parsed_files/bn/2014/bn465-2014.json
39,PERKAB_BANGKA_TENGAH_2011_2,new_turtle_files/perda/2011/perda_kabupaten_ba...,turtle_files/perda/2011/perda_kabupaten_bangka...,new_1_text_files/perda/2011/perda_kabupaten_ba...,new_parsed_files/perda/2011/perda_kabupaten_ba...
40,PERKAB_BANGKA_TENGAH_2011_2,new_turtle_files/perda/2011/perda_kabupaten_ba...,turtle_files/perda/2011/perda_kabupaten_bangka...,new_1_text_files/perda/2011/perda_kabupaten_ba...,new_parsed_files/perda/2011/perda_kabupaten_ba...
45,PERKAB_BANTUL_2001_4,new_turtle_files/perda/2001/perda_kabupaten_ba...,turtle_files/perda/2001/perda_kabupaten_solok_...,new_1_text_files/perda/2001/perda_kabupaten_ba...,new_parsed_files/perda/2001/perda_kabupaten_ba...
...,...,...,...,...,...
24772,UU_1990_8,new_turtle_files/ln/1990/uu9-1990.ttl,turtle_files/ln/1990/pp14-1990.ttl,new_1_text_files/ln/1990/uu9-1990.txt,new_parsed_files/ln/1990/uu9-1990.json
24843,UU_1997_12,new_turtle_files/ln/1997/uu11-1997.ttl,turtle_files/ln/1997/pp16-1997.ttl,new_1_text_files/ln/1997/uu12-1997.txt,new_parsed_files/ln/1997/uu12-1997.json
24844,UU_1997_12,new_turtle_files/ln/1997/uu11-1997.ttl,turtle_files/ln/1997/pp16-1997.ttl,new_1_text_files/ln/1997/uu11-1997.txt,new_parsed_files/ln/1997/uu11-1997.json
25110,UU_2005_13,new_turtle_files/ln/2005/uu13-2005.ttl,turtle_files/ln/2005/pp18-2005.ttl,new_1_text_files/ln/2006/uu14-2006.txt,new_parsed_files/ln/2006/uu14-2006.json


In [25]:
duplicates = merged_df['regulatory'].value_counts()[lambda x: x > 1].index
filtered_merged_df = merged_df[~merged_df['regulatory'].isin(duplicates)]
filtered_merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
25449,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
25450,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
25451,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
25452,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [26]:
filtered_merged_df = filtered_merged_df[~filtered_merged_df['file_json'].isin(error_files)]
filtered_merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
25449,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
25450,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
25451,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
25452,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [27]:
null_counts = filtered_merged_df.isnull().sum()
print("Count of null values for each column:")
print(null_counts)

Count of null values for each column:
regulatory    0
file_ttl      0
ttl_file      0
file_txt      1
file_json     1
dtype: int64


In [13]:
filtered_merged_df = filtered_merged_df[filtered_merged_df['file_json'].isin(error_files)]
filtered_merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
284,PP_1960_27,new_turtle_files/ln/1962/pp0271962.ttl,turtle_files/ln/1960/pp0441960.ttl,new_1_text_files/ln/1960/pp0271960.txt,new_parsed_files/ln/1960/pp0271960.json
563,PP_1970_6,new_turtle_files/ln/1970/pp0461970.ttl,turtle_files/ln/1970/pp0551970.ttl,new_1_text_files/ln/1970/pp0061970.txt,new_parsed_files/ln/1970/pp0061970.json
3082,Peraturan_BI_2016_18,new_turtle_files/ln/2016/pbi18-40-2016bt.ttl,turtle_files/bn/2017/bn219-2017.ttl,new_1_text_files/ln/2016/pbi18-6-2016bt.txt,new_parsed_files/ln/2016/pbi18-6-2016bt.json
4033,Peraturan_BPOM_2010_03,new_turtle_files/bn/2010/bn470-2010.ttl,turtle_files/bn/2010/bn735-2010.ttl,new_1_text_files/bn/2010/bn361-2010.txt,new_parsed_files/bn/2010/bn361-2010.json
4061,Peraturan_BPOM_2011_03,new_turtle_files/bn/2011/bn393-2011.ttl,turtle_files/bn/2011/bn40-2011.ttl,new_1_text_files/bn/2011/bn808-2011.txt,new_parsed_files/bn/2011/bn808-2011.json
4068,Peraturan_BPOM_2012_03,new_turtle_files/bn/2013/bn122-2013.ttl,turtle_files/bn/2012/bn397-2012.ttl,new_1_text_files/bn/2012/bn470-2012.txt,new_parsed_files/bn/2012/bn470-2012.json
4071,Peraturan_BPOM_2012_03,new_turtle_files/bn/2013/bn122-2013.ttl,turtle_files/bn/2012/bn397-2012.ttl,new_1_text_files/bn/2012/bn623-2012.txt,new_parsed_files/bn/2012/bn623-2012.json
4072,Peraturan_BPOM_2012_03,new_turtle_files/bn/2013/bn122-2013.ttl,turtle_files/bn/2012/bn397-2012.ttl,new_1_text_files/bn/2012/bn471-2012.txt,new_parsed_files/bn/2012/bn471-2012.json
8755,Perkab_Siak_2013_2,new_turtle_files/perda/2013/perda_kabupaten_si...,turtle_files/perda/2013/perda_kabupaten_rokan_...,new_1_text_files/perda/2013/perda_kabupaten_ro...,new_parsed_files/perda/2013/perda_kabupaten_ro...
12286,Permen_Dagri_2017_4,new_turtle_files/bn/2018/bn173-2018.ttl,turtle_files/bn/2018/bn186-2018.ttl,new_1_text_files/bn/2017/bn196-2017.txt,new_parsed_files/bn/2017/bn196-2017.json


In [12]:
duplicates = merged_df['regulatory'].value_counts()[lambda x: x > 1].index
filtered_merged_df = merged_df[~merged_df['regulatory'].isin(duplicates)]
filtered_merged_df = filtered_merged_df[~filtered_merged_df['file_json'].isin(error_files)]
filtered_merged_df.dropna(inplace=True)
filtered_merged_df.reset_index(drop=True)

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
24325,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
24326,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
24327,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
24328,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [13]:
filtered_merged_df.to_csv("core/regulatory_map_fin.csv", index=False)

# Ingest Turtle File (Surface Info) Into Regulatory Map

In [14]:
df = pd.read_csv('core/regulatory_map_fin.csv')
df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
24325,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
24326,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
24327,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
24328,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [15]:
with open('error2.txt', 'r') as file:
    error_files = file.read().splitlines()
    
error_files

['UU_2007_39 new_turtle_files/ln/2007/uu39-2007.ttl',
 'UU_2001_20 new_turtle_files/ln/2001/uu20-2001.ttl',
 'Perprov_Sumsel_2014_1 new_turtle_files/perda/2014/perda_provinsi_sumatera_selatan_nomor_1_tahun_2014_11e4e8bf8e941cdca11c313431363535.ttl',
 'Perprov_Sumsel_2011_2 new_turtle_files/perda/2011/perda_provinsi_sumatera_selatan_nomor_2_tahun_2011_11e452bda17a20c08e56313634353133.ttl',
 'Perprov_Sumbar_2011_12 new_turtle_files/perda/2011/perda_provinsi_sumatera_barat_nomor_12_tahun_2011_11e452bdaac74430adee313634353239.ttl',
 'Perprov_Banten_2009_3 new_turtle_files/perda/2009/perda_provinsi_banten_nomor_3_tahun_2009_11e452bd826b2d80b2fa313634343231.ttl',
 'Permen_Keu_2019_87 new_turtle_files/bn/2019/bn641-2019.ttl',
 'Permen_Keu_2010_165 new_turtle_files/bn/2010/bn442-2010.ttl',
 'Permen_Keu_2009_178 new_turtle_files/bn/2009/bn434-2009.ttl',
 'Permen_KP_2012_20 new_turtle_files/bn/2012/bn1033-2012.ttl',
 'Permen_Hut_2012_33 new_turtle_files/bn/2012/bn779-2012.ttl',
 'Permen_Humham_2

In [16]:
error_files = [error.split(' ')[0] for error in error_files]
error_files

['UU_2007_39',
 'UU_2001_20',
 'Perprov_Sumsel_2014_1',
 'Perprov_Sumsel_2011_2',
 'Perprov_Sumbar_2011_12',
 'Perprov_Banten_2009_3',
 'Permen_Keu_2019_87',
 'Permen_Keu_2010_165',
 'Permen_Keu_2009_178',
 'Permen_KP_2012_20',
 'Permen_Hut_2012_33',
 'Permen_Humham_2016_67',
 'Permen_Humham_2014_27',
 'Permen_Hub_2015_187',
 'Permen_ESDM_2015_43',
 'Permen_Dag_2009_21',
 'Permen_Agama_2015_59',
 'Permen_Agama_2012_13',
 'Permen_Agama_2011_8',
 'Permen_Agama_2011_16',
 'Permen_Agama_2007_30',
 'Permen_Agama_2007_29',
 'Permen_Agama_2007_28',
 'Permen_Agama_2007_27',
 'Perkot_Tasikmalaya_2007_3',
 'Perkot_Depok_2009_2',
 'Perkab_Sleman_2013_1',
 'Perkab_Sinjai_2012_35',
 'Perkab_Purbalingga_2012_14',
 'Perkab_Magelang_2010_15',
 'Perkab_Jepara_2012_2',
 'Perkab_Jepara_2010_9',
 'Perkab_Garut_2012_11',
 'Perkab_Bantul_2007_08',
 'Perkab_Bantul_2001_23',
 'Perkab_Banggai_2009_9',
 'PP_1983_13']

In [17]:
df = df[~df['regulatory'].isin(error_files)]
df.dropna(inplace=True)
df.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)


Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_turtle_files/lain-lain/1945/uud1945perubah...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
24288,UU_2019_4,new_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
24289,UU_2019_5,new_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
24290,UU_2019_7,new_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
24291,UU_2019_8,new_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [18]:
def replace_strings_with_regex(string):
    pattern = r'(?P<root>new_turtle_files)'
    modified_string = re.sub(pattern, 'new_2_turtle_files', string)
    return modified_string

In [19]:
df['file_ttl'] = df['file_ttl'].apply(lambda x: replace_strings_with_regex(x))
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['file_ttl'] = df['file_ttl'].apply(lambda x: replace_strings_with_regex(x))


Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Amandemen_1_UUD_1945,new_2_turtle_files/lain-lain/1945/uud1945perub...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
1,Amandemen_2_UUD_1945,new_2_turtle_files/lain-lain/1945/uud1945perub...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
2,Amandemen_3_UUD_1945,new_2_turtle_files/lain-lain/1945/uud1945perub...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
3,Amandemen_4_UUD_1945,new_2_turtle_files/lain-lain/1945/uud1945perub...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
4,Kepmen_Hut_2002_8171,new_2_turtle_files/bn/2008/bn23-2008.ttl,turtle_files/bn/2008/bn29-2008.ttl,new_1_text_files/bn/2008/bn23-2008.txt,new_parsed_files/bn/2008/bn23-2008.json
...,...,...,...,...,...
24325,UU_2019_4,new_2_turtle_files/ln/2019/uu4-2019bt.ttl,turtle_files/bn/2019/bn_1235-2019.ttl,new_1_text_files/ln/2019/uu4-2019bt.txt,new_parsed_files/ln/2019/uu4-2019bt.json
24326,UU_2019_5,new_2_turtle_files/ln/2019/uu5-2019bt.ttl,turtle_files/ln/2019/pp8-2019bt.ttl,new_1_text_files/ln/2019/uu5-2019bt.txt,new_parsed_files/ln/2019/uu5-2019bt.json
24327,UU_2019_7,new_2_turtle_files/ln/2019/uu7-2019bt.ttl,turtle_files/bn/2019/bn662-2019.ttl,new_1_text_files/ln/2019/uu7-2019bt.txt,new_parsed_files/ln/2019/uu7-2019bt.json
24328,UU_2019_8,new_2_turtle_files/ln/2019/uu8-2019bt.ttl,turtle_files/bn/2019/bn_567-2019.ttl,new_1_text_files/ln/2019/uu8-2019bt.txt,new_parsed_files/ln/2019/uu8-2019bt.json


In [20]:
df.to_csv("core/regulatory_map_surface_info.csv", index=False)

# Cek

In [34]:
d = pd.read_csv('core/regulatory_list.csv')
d = d.drop_duplicates(subset='regulatory', keep=False)

df = pd.read_csv('core/turtle_map.csv')
filtered_df = df[~df['ttl_file'].str.contains('ningg')]
filtered_df

Unnamed: 0,regulatory,ttl_file
0,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...
1,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
2,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
3,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
4,Kepmen_Hut_2002_8171,new_turtle_filesbn2008bn23-2008.ttl
...,...,...
24968,UU_2019_4,new_turtle_filesln2019uu4-2019bt.ttl
24969,UU_2019_5,new_turtle_filesln2019uu5-2019bt.ttl
24970,UU_2019_7,new_turtle_filesln2019uu7-2019bt.ttl
24971,UU_2019_8,new_turtle_filesln2019uu8-2019bt.ttl


In [35]:
d = pd.read_csv('core/regulatory_list.csv')
d = d.drop_duplicates(subset='regulatory', keep=False)

df = pd.read_csv('core/turtle_map.csv')
filtered_df = df[~df['ttl_file'].str.contains('ningg')]

d['a'] = d['regulatory'].str.lower()
filtered_df['a'] = filtered_df['regulatory'].str.lower()

merged_df = pd.merge(d, filtered_df, on='a', how='inner')
merged_df = merged_df[['regulatory_x', 'ttl_file']]
merged_df.columns = ['regulatory', 'ttl_file']
merged_df.reset_index(drop=True, inplace=True)
merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['a'] = filtered_df['regulatory'].str.lower()


Unnamed: 0,regulatory,ttl_file
0,Perkab_Bantul_1994_1,new_turtle_filesperda1994perda_kabupaten_bantu...
1,Perkab_Bantul_1994_13,new_turtle_filesperda1994perda_kabupaten_bantu...
2,Perkab_Bantul_1994_5,new_turtle_filesperda1994perda_kabupaten_bantu...
3,Perkab_Bantul_1994_11,new_turtle_filesperda1994perda_kabupaten_bantu...
4,Perkab_Bantul_1994_7,new_turtle_filesperda1994perda_kabupaten_bantu...
...,...,...
24338,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...
24339,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
24340,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...
24341,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...


In [32]:
import pandas as pd

df_1 = pd.read_csv('core/regulatory_map.csv')
df_2 = pd.read_csv('core/turtle_map_tes.csv')

df_1['a'] = df_1['regulatory'].str.lower()
df_2['a'] = df_2['regulatory'].str.lower()
df_2['file_ttl'] = df_2['ttl_file']

df_1 = df_1.drop('regulatory', axis=1)
df_2 = df_2.drop('ttl_file', axis=1)

merged_df = pd.merge(df_2, df_1, on='a', how='left')
merged_df = merged_df.drop('a', axis=1)

merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Perkab_Bantul_1994_1,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/perda/1994/perda_kabupaten_bantul...,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
1,Perkab_Bantul_1994_13,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
2,Perkab_Bantul_1994_5,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
3,Perkab_Bantul_1994_11,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
4,Perkab_Bantul_1994_7,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
...,...,...,...,...,...
24338,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24339,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24340,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24341,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...


In [33]:
filtered_merged_df = merged_df[~merged_df['file_json'].isin(error_files)]
filtered_merged_df

Unnamed: 0,regulatory,file_ttl,ttl_file,file_txt,file_json
0,Perkab_Bantul_1994_1,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/perda/1994/perda_kabupaten_bantul...,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
1,Perkab_Bantul_1994_13,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
2,Perkab_Bantul_1994_5,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
3,Perkab_Bantul_1994_11,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
4,Perkab_Bantul_1994_7,new_turtle_filesperda1994perda_kabupaten_bantu...,turtle_files/ln/1994/pp0011994.ttl,new_1_text_files/perda/1994/perda_kabupaten_ba...,new_parsed_files/perda/1994/perda_kabupaten_ba...
...,...,...,...,...,...
24338,Amandemen_1_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahanp...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24339,Amandemen_3_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24340,Amandemen_4_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
24341,Amandemen_2_UUD_1945,new_turtle_fileslain-lain1945uud1945perubahank...,turtle_files/bn/2015/bn270-2015.ttl,new_1_text_files/lain-lain/1945/uud1945perubah...,new_parsed_files/lain-lain/1945/uud1945perubah...
