In [1]:
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Data Exploration using Cambodia dataset

In [3]:
cambodia_individual = pd.read_stata("data/Cambodia_individual.DTA")

In [4]:
cambodia_individual.shape

(17578, 4397)

In [5]:
variables_code = ["caseid","v101","v025", "d104", "d106", "d107", "d108",
              "v133","v190", "v501", "v502","v731", "v741", "v012", 
              "v745a","v745b", "v746", "v715", "v136", "v201", "v151", "v536", "v613", "v621", "v739"]

In [6]:
cambodia_individual = cambodia_individual[variables_code]

In [7]:
print(cambodia_individual.shape)

(17578, 25)


In [8]:
cambodia_individual['v502'].unique()

['currently in union/living with a man', 'never in union', 'formerly in union/living with a man']
Categories (3, object): ['never in union' < 'currently in union/living with a man' < 'formerly in union/living with a man']

# Data Cleaning

In [9]:
# Filter: only include data with women who has been in marriage/union
cambodia_individual['if_union'] = None
cambodia_individual.loc[(cambodia_individual['v502'] == 1) | (cambodia_individual['v501'] == 'married') | (cambodia_individual['v502'].str.contains("currently")),
'if_union'] = 1

In [10]:
cambodia_individual_filtered = cambodia_individual[cambodia_individual['if_union'] == 1]
print(cambodia_individual_filtered.shape)

(11668, 26)


In [11]:
kept_variable = ['province','age','education','wealth_index','partner_edu']

In [12]:
cambodia_individual_filtered['country'] = 'Cambodia'

In [13]:
cambodia_individual_filtered['year'] = '2014'

In [14]:
rename_col = {
    'v101': 'province',
    'v012': 'age',
    'v133': 'education',
    'v190': 'wealth_index',
    'v715': 'partner_edu',
    'v745a': 'house_ownership',
    'v745b': 'land_owenership',
    'v731': 'if_employment',
    'v741': 'employment_pay_method',
    "v136": 'num_household', 
    "v201": 'num_child',
    "v151": 'sex_head_household', 
    "v536": 'sexual_activity', 
    "v613": 'ideal_num_child', 
    "v621": 'partner_ideal_child', 
    "v739": 'money_decide_person'
}

In [15]:
cambodia_individual_filtered.rename(columns=rename_col, inplace=True)

In [16]:
# Check NA values
cambodia_individual_filtered.isna().sum()

caseid                      0
province                    0
v025                        0
d104                     8445
d106                     8445
d107                     8445
d108                     8445
education                   0
wealth_index                0
v501                        0
v502                        0
if_employment               1
employment_pay_method    2101
age                         0
house_ownership             0
land_owenership             1
v746                     2810
partner_edu                 5
num_household               0
num_child                   0
sex_head_household          0
sexual_activity             2
ideal_num_child             0
partner_ideal_child       391
money_decide_person      2810
if_union                    0
country                     0
year                        0
dtype: int64

In [17]:
# Target: if_emo_vio, if has emotional violence
cambodia_individual_filtered['if_emo_vio'] = None
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d104'] == 'yes'),'if_emo_vio'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d104'] == 'no'),'if_emo_vio'] = 0
cambodia_individual_filtered.groupby('if_emo_vio').caseid.nunique()

if_emo_vio
0    2530
1     693
Name: caseid, dtype: int64

In [18]:
# Target: if_phy_vio, if has physical violence

cambodia_individual_filtered['if_phy_vio'] = None
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d106'] == 'yes') | (cambodia_individual_filtered['d107'] == 'yes'),'if_phy_vio'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d106'] == 'no') & (cambodia_individual_filtered['d107'] == 'no'),'if_phy_vio'] = 0
cambodia_individual_filtered.groupby('if_phy_vio').caseid.nunique()

if_phy_vio
0    2766
1     457
Name: caseid, dtype: int64

In [19]:
# Target: if_phy_vio_severe, if has severe physical violence

cambodia_individual_filtered['if_phy_vio_severe'] = None
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d107'] == 'yes'),'if_phy_vio_severe'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d107'] == 'no'),'if_phy_vio_severe'] = 0
cambodia_individual_filtered.groupby('if_phy_vio_severe').caseid.nunique()

if_phy_vio_severe
0    3034
1     189
Name: caseid, dtype: int64

In [20]:
# Target: if_sex_vio, if has sexual violence

cambodia_individual_filtered['if_sex_vio'] = None
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d108'] == 'yes'),'if_sex_vio'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['d108'] == 'no'),'if_sex_vio'] = 0
cambodia_individual_filtered.groupby('if_sex_vio').caseid.nunique()

if_sex_vio
0    3092
1     131
Name: caseid, dtype: int64

In [21]:
# Target: num_vio, number of violence kinds the woman has

cambodia_individual_filtered['num_vio'] = None
cambodia_individual_filtered['num_vio'] = cambodia_individual_filtered['if_emo_vio'] + cambodia_individual_filtered['if_phy_vio'] + cambodia_individual_filtered['if_sex_vio']

cambodia_individual_filtered.groupby('num_vio').caseid.nunique()

num_vio
0    2406
1     429
2     312
3      76
Name: caseid, dtype: int64

In [22]:
# Target: if_vio, if has any of the three kinds of violence

cambodia_individual_filtered['if_vio'] = None
cambodia_individual_filtered.loc[(cambodia_individual_filtered['num_vio'] > 0),'if_vio'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['num_vio'] == 0),'if_vio'] = 0

cambodia_individual_filtered.groupby('if_vio').caseid.nunique()

if_vio
0    2406
1     817
Name: caseid, dtype: int64

In [23]:
# Features: Wealth_index_code

wealth_index_dict = {'poorest': 0,
                    'poorer': 1,
                    'middle': 2,
                    'richer': 3,
                    'richest': 4}
cambodia_individual_filtered['wealth_index_code'] = cambodia_individual_filtered['wealth_index'].replace(wealth_index_dict, inplace=False)

In [24]:
# Features: if_own_house

cambodia_individual_filtered['if_own_house'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['house_ownership'] == 'does not own'),'if_own_house'] = 0

cambodia_individual_filtered.groupby('if_own_house').caseid.nunique()

if_own_house
0    2785
1    8883
Name: caseid, dtype: int64

In [25]:
# Features: if_own_land

cambodia_individual_filtered['if_own_land'] = 1
cambodia_individual_filtered.loc[(cambodia_individual_filtered['land_owenership'] == 'does not own'),'if_own_land'] = 0

cambodia_individual_filtered.groupby('if_own_land').caseid.nunique()

if_own_land
0    3888
1    7780
Name: caseid, dtype: int64

In [26]:
# Features: if_employment_current

cambodia_individual_filtered['if_employment_current'] = 0
cambodia_individual_filtered.loc[(cambodia_individual_filtered['if_employment'] == 'have a job, but on leave last 7 days')
                                 | (cambodia_individual_filtered['if_employment'] == 'currently working'), 'if_employment_current'] = 1

cambodia_individual_filtered.groupby('if_employment_current').caseid.nunique()

if_employment_current
0    3198
1    8470
Name: caseid, dtype: int64

In [27]:
cambodia_individual_filtered.columns

Index(['caseid', 'province', 'v025', 'd104', 'd106', 'd107', 'd108',
       'education', 'wealth_index', 'v501', 'v502', 'if_employment',
       'employment_pay_method', 'age', 'house_ownership', 'land_owenership',
       'v746', 'partner_edu', 'num_household', 'num_child',
       'sex_head_household', 'sexual_activity', 'ideal_num_child',
       'partner_ideal_child', 'money_decide_person', 'if_union', 'country',
       'year', 'if_emo_vio', 'if_phy_vio', 'if_phy_vio_severe', 'if_sex_vio',
       'num_vio', 'if_vio', 'wealth_index_code', 'if_own_house', 'if_own_land',
       'if_employment_current'],
      dtype='object')

In [28]:
kept_variable = ['caseid', 'country', 'year',
                 'province','age','education', 'if_union',
                 'wealth_index','wealth_index_code', 'house_ownership', 'land_owenership', 'if_own_house', 'if_own_land',
                 'if_employment', 'if_employment_current','employment_pay_method',
                 'partner_edu', 
                 'num_household', 'num_child','sex_head_household', 'sexual_activity', 'ideal_num_child', 'partner_ideal_child', 'money_decide_person',
                'if_emo_vio', 'if_phy_vio', 'if_phy_vio_severe', 'if_sex_vio', 'if_vio', 'num_vio']

In [29]:
output_df = cambodia_individual_filtered[kept_variable]

In [30]:
output_path = "cambodia_2014_cleaned.csv"

In [31]:
output_df.to_csv(output_path)