In [8]:
import requests
import pandas as pd


## Loading the data

In [49]:
def get_data(dataset_id, sample=None, session=None):
    """Download a data.gov.sg datastore resource into a DataFrame.

    Pages through the ``datastore_search`` endpoint until the resource is
    exhausted or at least ``sample`` rows have been collected.

    Parameters
    ----------
    dataset_id : str
        Resource identifier, sent to the API as ``resource_id``.
    sample : int, optional
        Maximum number of rows to return. ``None`` (the default) downloads
        every row.
    session : object, optional
        Any object exposing ``get(url, params=..., timeout=...)`` in the style
        of ``requests.Session``. Defaults to the ``requests`` module itself;
        supplying one allows connection reuse or an offline stub.

    Returns
    -------
    pandas.DataFrame
        The collected records with the ``_id`` column removed when present.

    Raises
    ------
    Exception
        Whatever ``raise_for_status()`` raises for a non-2xx response
        (``requests.HTTPError`` with the default client), or ``KeyError`` if
        the payload has no ``result`` entry.
    """
    url = "https://data.gov.sg/api/action/datastore_search"
    page_size = 100
    http = requests if session is None else session
    # Ask for the page size explicitly instead of relying on the server default.
    params = {'resource_id': dataset_id, 'offset': 0, 'limit': page_size}
    records = []

    while True:
        response = http.get(url, params=params, timeout=30)
        response.raise_for_status()  # fail loudly instead of parsing an error body
        result = response.json()['result']
        page = result.get('records') or []
        records.extend(page)

        # An empty page is the unambiguous end-of-data signal.
        if not page:
            break
        # Stop early once enough rows were actually received.
        if sample is not None and len(records) >= sample:
            break
        # Use the reported row count when the API provides one.
        total = result.get('total')
        if isinstance(total, int) and len(records) >= total:
            break
        # Still honour an explicit "no next page" marker.
        links = result.get('_links')
        if isinstance(links, dict) and links.get('next') is None:
            break

        # Advance by what was received, not by what was assumed.
        params['offset'] += len(page)

    full_df = pd.DataFrame(records)
    if sample is not None:
        full_df = full_df.head(sample)
    # errors='ignore' keeps an empty result from raising on the missing column.
    full_df = full_df.drop(columns=['_id'], errors='ignore')
    return full_df

In [50]:
# Resource ids of the three tables used in this notebook.
dataset_id_1 = "d_db95e15ceffaa368a043310479dc7d57"  # age with unemployment duration
dataset_id_2 = "d_a0ca632fd1d6ff841f0e47298a9ab589"  # highest education with unemployment duration
dataset_id_3 = "d_c01a3210fb10f1a52676f97498d4ec2c"  # median duration of unemployment

# Fetch each table, capped at 2000 rows.
data_1 = get_data(dataset_id_1, sample=2000)
data_2 = get_data(dataset_id_2, sample=2000)
data_3 = get_data(dataset_id_3, sample=2000)




In [51]:
# Preview the first rows of the age-by-duration table.
data_1.head()

Unnamed: 0,year,sex,age,duration,unemployed
0,2010,male,15-24,under 5,4700
1,2010,male,15-24,5 to 9,1700
2,2010,male,15-24,10 to 14,700
3,2010,male,15-24,15 to 19,200
4,2010,male,15-24,20 to 24,500


In [52]:
# Preview the first rows of the qualification-by-duration table.
data_2.head()

Unnamed: 0,year,sex,highest_qualification,duration,unemployed
0,2010,male,primary and below,under 5,2600
1,2010,male,primary and below,5 to 9,900
2,2010,male,primary and below,10 to 14,900
3,2010,male,primary and below,15 to 19,500
4,2010,male,primary and below,20 to 24,1300


In [53]:
# Preview the first rows of the median-duration table.
data_3.head()

Unnamed: 0,year,median_dur_of_unemp
0,1991,8
1,1992,6
2,1993,6
3,1994,4
4,1996,4


In [54]:
# Show which years each table covers, one array per line, before joining on `year`.
print(*(table["year"].unique() for table in (data_1, data_2, data_3)), sep="\n")

['2010' '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019'
 '2020' '2021' '2022' '2023']
['2010' '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018' '2019'
 '2020' '2021' '2022' '2023']
['1991' '1992' '1993' '1994' '1996' '1997' '1998' '1999' '2001' '2002'
 '2003' '2004' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013'
 '2014' '2015' '2016' '2017' '2018' '2019' '2020' '2021' '2022' '2023']


## Merging data

In [55]:
# Inner-join the age table with the qualification table on year and duration
# bucket, then attach the yearly median duration.
# NOTE(review): `sex` is not a join key, so each age row is paired with every
# qualification row of the same year/duration regardless of sex (hence the
# sex_x / sex_y and unemployed_x / unemployed_y columns). Confirm this pairing
# is intended before modelling on it.
merged_data = (
    data_1
    .merge(data_2, on=['year', 'duration'])
    .merge(data_3, on='year', how='inner')
)

## Preprocessing the data

In [56]:
## getting duration midpoint
def create_duration_midpoints(df, open_ended_upper=104):
    """Map each duration label in ``df["duration"]`` to a numeric midpoint.

    Recognised labels:

    * ``"under 5"``     -> 2.5
    * ``"52 and over"`` -> midpoint of 52 and ``open_ended_upper``
    * ``"<lo> to <hi>"`` -> ``(lo + hi) / 2``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``duration`` column of label strings.
    open_ended_upper : int or float, optional
        Assumed upper bound of the open-ended ``"52 and over"`` bucket.
        The default of 104 is an assumption, not something present in the data.

    Returns
    -------
    dict
        ``{label: midpoint}`` for every distinct non-missing label.

    Raises
    ------
    ValueError
        If a label does not match any of the recognised forms.
    """
    midpoints = {}
    for category in df["duration"].unique():
        # Missing labels have no midpoint; leave them unmapped rather than crash.
        if pd.isna(category):
            continue
        if category == "under 5":
            midpoint = 2.5
        elif category == "52 and over":
            midpoint = (52 + open_ended_upper) / 2
        else:
            try:
                lower, upper = map(int, category.split(" to "))
            except ValueError as exc:
                raise ValueError(
                    f"Unrecognised duration label: {category!r}"
                ) from exc
            midpoint = (lower + upper) / 2
        midpoints[category] = midpoint
    return midpoints




In [57]:
# Translate every duration label into its numeric midpoint.
midpoint_dict = create_duration_midpoints(merged_data)
merged_data['duration_midpoint'] = merged_data['duration'].map(midpoint_dict)

# Flag the open-ended top bucket with 1, since its true duration is unknown; all other rows get 0.
merged_data['censored'] = merged_data['duration'].map(
    lambda label: int(label == "52 and over")
)
merged_data.head()

Unnamed: 0,year,sex_x,age,duration,unemployed_x,sex_y,highest_qualification,unemployed_y,median_dur_of_unemp,duration_midpoint,censored
0,2010,male,15-24,under 5,4700,male,primary and below,2600,8,2.5,0
1,2010,male,15-24,under 5,4700,male,lower secondary,2200,8,2.5,0
2,2010,male,15-24,under 5,4700,male,secondary,2600,8,2.5,0
3,2010,male,15-24,under 5,4700,male,post-secondary (non-tertiary),3000,8,2.5,0
4,2010,male,15-24,under 5,4700,male,diploma and professional qualification,3200,8,2.5,0
