# Normalize the UCDP/PRIO Armed Conflict dataset

In [None]:
import pandas as pd

In [None]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)

In [None]:
# conflict-year observations (one row per conflict per year, see the obs_id section below)
df = pd.read_csv("../Data/UCDP_PRIO/ucdpprio_armedconflict_api.csv")
# actor lookup table; its 'ActorID', 'Name' and 'NameFull' columns are used further down
# to attach ids to secondary parties
actors = pd.read_csv("../Data/UCDP_PRIO/Raw/actorlist.csv")

In [None]:
df.columns

In [None]:
df

In [None]:
# parse the three date columns as datetimes, then discard the dataset version column
for date_col in ('start_date', 'start_date2', 'ep_end_date'):
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')
df = df.drop(columns=['version'])

## create observation ids

each row/observation is a conflict-year

In [None]:
# observation id = "<conflict_id>-<year>"
df['obs_id'] = df['conflict_id'].astype(str).str.cat(df['year'].astype(str), sep='-')
df

## create episode ids

Conflicts have multiple episodes when a conflict calms down to fewer than 25 deaths in a year (and therefore is not recorded as an observation) and then flares up again to 25 or more deaths in a year (and therefore has another observation). This can happen multiple times.

In [None]:
# number of distinct episode start dates per conflict
episode_groups = df.groupby('conflict_id')[['start_date2']].nunique()

In [None]:
def make_num_list(x):
    """Return the consecutive integers starting at 1 and counting up to x.

    The list always contains at least 1, even when x is smaller than 1.
    """
    numbers = [1]
    # keep extending by one until the last entry has reached x
    while numbers[-1] < x:
        numbers.append(numbers[-1] + 1)
    return numbers

# turn each conflict's episode count into the list [1, 2, ..., count]
episode_groups['num_list'] = episode_groups['start_date2'].map(make_num_list)
episode_groups

In [None]:
# Expand each conflict's list of episode numbers into one row per (conflict, episode number).
episode_ids = (
    episode_groups.num_list.apply(pd.Series)
    .reset_index()
    .melt(id_vars=['conflict_id'])
    .drop(columns=['variable'])
    .dropna()  # shorter lists are padded with NaN by apply(pd.Series)
    .astype({'value': 'int'})
    # numeric sort, so that episode 10 follows episode 9 rather than episode 1
    .sort_values(by=['conflict_id', 'value'])
    .reset_index(drop=True)
)

# episode id = "<conflict_id>-<episode number>"
episode_ids['ep_id'] = episode_ids['conflict_id'].astype(str) + "-" + episode_ids['value'].astype(str)

# Keep the numeric order established above. Re-sorting on the string 'ep_id' would put
# "X-10" before "X-2", and the rows are later paired by position with episodes ordered
# by start date, so a string sort would mislabel conflicts with 10 or more episodes.
episode_ids = episode_ids.drop(columns=['value'])
episode_ids

In [None]:
# one row per distinct (conflict, episode start date), ordered by conflict then date
episode_groups2 = (
    df[['conflict_id', 'start_date2']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(start_date2=lambda frame: pd.to_datetime(frame['start_date2'], format = '%Y-%m-%d'))
    .sort_values(by=['conflict_id', 'start_date2'])
    .reset_index(drop=True)
    # renamed so it can sit next to episode_ids' own 'conflict_id' column
    .rename(columns={'conflict_id':'conflict_id2'})
)
episode_groups2

In [None]:
# Place the episode ids next to the (conflict, start date) pairs. The two frames are
# aligned on their row index, so this pairing is only right if both list the same
# conflicts in the same row order.
episode_groups3 = pd.concat([episode_ids, episode_groups2], axis=1)
episode_groups3

In [None]:
# sanity check: show any rows where the two conflict id columns disagree
episode_groups3[episode_groups3['conflict_id'] != episode_groups3['conflict_id2']]

In [None]:
# the duplicate conflict id column is no longer needed
episode_groups3 = episode_groups3.drop('conflict_id2', axis=1)
episode_groups3

In [None]:
# attach the episode id to every observation via its conflict and episode start date
df = pd.merge(df, episode_groups3, on=['conflict_id', 'start_date2'])
df

## Find columns that are functionally dependent on 'conflict_id' for the CONFLICT table

- 'incompatibility'
- 'territory_name'
- 'start_date'
- 'start_prec'
- 'location' (multivalued)
- 'gwno_loc' (multivalued)
- 'region' (multivalued)
- 'gwno_a' (multivalued)
- 'side_a' (multivalued)
- 'side_a_id' (multivalued)

note: 'gwno_b' corresponds to 'side_b', which is not dependent on the conflict id. However, 'gwno_b' is only present if 'side_b' is a state, and thus 'gwno_b' is often missing. It seems like it is functionally dependent on 'conflict_id', but this is misleading.

In [None]:
df.groupby('conflict_id')['incompatibility'].nunique().unique()

In [None]:
df.groupby('conflict_id')['territory_name'].nunique().unique()

In [None]:
df.groupby('conflict_id')['start_date'].nunique().unique()

In [None]:
df.groupby('conflict_id')['start_prec'].nunique().unique()

In [None]:
df.groupby('conflict_id')['location'].nunique().unique()

In [None]:
df.groupby('conflict_id')['region'].nunique().unique()

In [None]:
df.groupby('conflict_id')['type_of_conflict'].nunique().unique()

In [None]:
df.groupby('conflict_id')['gwno_loc'].nunique().unique()

In [None]:
df.groupby('conflict_id')['gwno_a'].nunique().unique()

In [None]:
df.groupby('conflict_id')['gwno_a_2nd'].nunique().unique()

In [None]:
df.groupby('conflict_id')['gwno_b'].nunique().unique()

In [None]:
df.groupby('conflict_id')['gwno_b_2nd'].nunique().unique()

In [None]:
df.groupby('conflict_id')['side_a'].nunique().unique()

In [None]:
df.groupby('conflict_id')['side_a_id'].nunique().unique()

In [None]:
df.groupby('conflict_id')['side_b'].nunique().unique()

In [None]:
df.groupby('conflict_id')['side_b_id'].nunique().unique()

create conflict end date based on last episode end date

In [None]:
# conflicts whose 2018 observation has ep_end == 0
is_2018 = df['year'] == 2018
episode_not_ended = df['ep_end'] == 0
ongoing = df.loc[is_2018 & episode_not_ended, 'conflict_id'].tolist()

In [None]:
# conflict end date = latest episode end date recorded for the conflict
conflict_enddates = (
    df.groupby('conflict_id')
    .agg({'ep_end_date': 'max'})
    .reset_index()
    .rename(columns={'ep_end_date': 'end_date'})
)

# Conflicts listed in `ongoing` have no end date yet. This uses a single .loc assignment:
# the chained form frame['end_date'][mask] = ... raises SettingWithCopyWarning and does
# not modify the frame at all when pandas copy-on-write is active.
conflict_enddates.loc[conflict_enddates['conflict_id'].isin(ongoing), 'end_date'] = pd.NaT
conflict_enddates

In [None]:
# CONFLICT table: one row per conflict with its conflict-level attributes and end date
conflict_df = (
    df[['conflict_id', 'start_date', 'start_prec', 'incompatibility', 'territory_name']]
    .drop_duplicates()
    .merge(conflict_enddates, on='conflict_id')
    .reset_index(drop=True)
)
conflict_df

## find columns that are functionally dependent on 'ep_id' to isolate conflict EPISODES

- 'start_date2'
- 'start_prec2'
- 'ep_end_date'
- 'ep_end_prec' (this column is blank, but it shouldn't be...)

In [None]:
df.columns

In [None]:
df.groupby('ep_id')['start_date2'].nunique().unique()

In [None]:
df.groupby('ep_id')['start_prec2'].nunique().unique()

In [None]:
df.groupby('ep_id')['ep_end_prec'].nunique().unique()

In [None]:
df.groupby('ep_id')['ep_end_date'].nunique().unique()

In [None]:
df.groupby('ep_id')['type_of_conflict'].nunique().unique()

In [None]:
df.groupby('ep_id')['cumulative_intensity'].nunique().unique()

In [None]:
df.groupby('ep_id')['intensity_level'].nunique().unique()

In [None]:
df.groupby('ep_id')['side_b'].nunique().unique()

In [None]:
# EPISODE table: the distinct combinations of the episode-level columns
episode_df = df[['ep_id', 'start_date2', 'start_prec2', 'ep_end_date', 'ep_end_prec']].drop_duplicates()
episode_df

## isolate polities/actors for PARTICIPANTS table

In [None]:
df.columns

In [None]:
# per-side column subsets, each keyed by the observation id
sideA = df.loc[:, ['obs_id', 'gwno_a', 'side_a', 'side_a_id']]
sideA2 = df.loc[:, ['obs_id', 'gwno_a_2nd', 'side_a_2nd']]
sideB = df.loc[:, ['obs_id', 'gwno_b', 'side_b', 'side_b_id']]
sideB2 = df.loc[:, ['obs_id', 'gwno_b_2nd', 'side_b_2nd']]

### side A, primary party

In [None]:
# split the ", "-separated 'gwno_a' strings into one column per entry ...
sideA_gwno_list = sideA['gwno_a'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideA_gwno = (
    pd.concat([sideA['obs_id'], sideA_gwno_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'gwno_a'})
)
sideA_gwno

In [None]:
# split the ", "-separated 'side_a' names into one column per entry ...
sideA_text_list = sideA['side_a'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideA_text = (
    pd.concat([sideA['obs_id'], sideA_text_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_a'})
)
sideA_text

In [None]:
# split the ", "-separated 'side_a_id' strings into one column per entry ...
sideA_id_list = sideA['side_a_id'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideA_id = (
    pd.concat([sideA['obs_id'], sideA_id_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_a_id'})
)
sideA_id

In [None]:
# line up code, name and id by observation and list position, then label the rows
sideA_merged = (
    sideA_gwno
    .merge(sideA_text, how="outer", on=['obs_id', 'variable'])
    .merge(sideA_id, how="outer", on=['obs_id', 'variable'])
    .assign(side="A", role="primary")
    .drop(columns = ['variable'])
    .rename(columns={'gwno_a': 'gw_id', 'side_a': 'actor_name', 'side_a_id': 'ucdp_id'})
)
sideA_merged

### side a, secondary party

In [None]:
# split the ", "-separated 'gwno_a_2nd' strings into one column per entry ...
sideA2_gwno_list = sideA2['gwno_a_2nd'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideA2_gwno = (
    pd.concat([sideA2['obs_id'], sideA2_gwno_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'gwno_a_2nd'})
)
sideA2_gwno

In [None]:
# split the ", "-separated 'side_a_2nd' names into one column per entry ...
sideA2_id_list = sideA2['side_a_2nd'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideA2_id = (
    pd.concat([sideA2['obs_id'], sideA2_id_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_a_2nd'})
)
sideA2_id

In [None]:
# pair codes with names by observation and list position, look each name up in the
# actor list to obtain its ActorID, then label the rows
sideA2_merged = (
    sideA2_gwno
    .merge(sideA2_id, how="outer", on=['obs_id', 'variable'])
    .merge(actors, how='left', left_on='side_a_2nd', right_on='Name')
    .assign(side="A", role="secondary")
    .drop(columns = ['variable', 'NameFull', 'Name'])
    .rename(columns = {'ActorID': 'ucdp_id', 'gwno_a_2nd': 'gw_id', 'side_a_2nd': 'actor_name'})
)
sideA2_merged

In [None]:
# rows whose actor name found no match in the actor list (no ucdp_id attached)
sideA2_merged[sideA2_merged['ucdp_id'].isna()]

### side b, primary party

In [None]:
# split the ", "-separated 'gwno_b' strings into one column per entry ...
sideB_gwno_list = sideB['gwno_b'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideB_gwno = (
    pd.concat([sideB['obs_id'], sideB_gwno_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'gwno_b'})
)
sideB_gwno

In [None]:
# split the ", "-separated 'side_b' names into one column per entry ...
sideB_text_list = sideB['side_b'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideB_text = (
    pd.concat([sideB['obs_id'], sideB_text_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_b'})
)
sideB_text

In [None]:
# split the ", "-separated 'side_b_id' strings into one column per entry ...
sideB_id_list = sideB['side_b_id'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideB_id = (
    pd.concat([sideB['obs_id'], sideB_id_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_b_id'})
)
sideB_id

In [None]:
# line up code, name and id by observation and list position, then label the rows
sideB_merged = (
    sideB_gwno
    .merge(sideB_text, how="outer", on=['obs_id', 'variable'])
    .merge(sideB_id, how="outer", on=['obs_id', 'variable'])
    .assign(side="B", role="primary")
    .drop(columns = ['variable'])
    .rename(columns = {'gwno_b': 'gw_id', 'side_b': 'actor_name', 'side_b_id': 'ucdp_id'})
)
sideB_merged

In [None]:
# spot-check the side B rows produced for a single observation
sideB_merged[sideB_merged['obs_id']=="11342-2012"]

### side b, secondary party

In [None]:
# split the ", "-separated 'gwno_b_2nd' strings into one column per entry ...
sideB2_gwno_list = sideB2['gwno_b_2nd'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideB2_gwno = (
    pd.concat([sideB2['obs_id'], sideB2_gwno_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'gwno_b_2nd'})
)
sideB2_gwno

In [None]:
# split the ", "-separated 'side_b_2nd' names into one column per entry ...
sideB2_id_list = sideB2['side_b_2nd'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (obs_id, position) and discard the empty slots
sideB2_id = (
    pd.concat([sideB2['obs_id'], sideB2_id_list], axis=1)
    .melt(id_vars = ['obs_id'])
    .dropna()
    .rename(columns={'value': 'side_b_2nd'})
)
sideB2_id

In [None]:
# pair codes with names by observation and list position, look each name up in the
# actor list to obtain its ActorID, then label the rows
sideB2_merged = (
    sideB2_gwno
    .merge(sideB2_id, how="outer", on=['obs_id', 'variable'])
    .merge(actors, how='left', left_on='side_b_2nd', right_on='Name')
    .assign(side="B", role="secondary")
    .drop(columns = ['variable', 'Name', 'NameFull'])
    .rename(columns = {'gwno_b_2nd': 'gw_id', 'side_b_2nd': 'actor_name', 'ActorID': 'ucdp_id'})
)
sideB2_merged

### combine all 4 participant lists

In [None]:
# PARTICIPANTS table: stack the four per-side frames and order the rows by observation.
# NOTE(review): 'ucdp_id' comes from string splits for primary parties and from the
# actor list's 'ActorID' for secondary parties, so its values may be of mixed type - confirm.
all_participants = [sideA_merged, sideA2_merged, sideB_merged, sideB2_merged]
stacked_participants = pd.concat(all_participants)
participants_df = stacked_participants.sort_values('obs_id').reset_index(drop=True)
participants_df

## separate multivalued columns - location, region - that are dependent on CONFLICT_ID

### location

note - clearly the identifiers and the names are not in the same order, so they are not matching up correctly.

In [None]:
df.columns

In [None]:
# distinct (conflict, location names, location codes) combinations
locations = df[['conflict_id', 'location', 'gwno_loc']].drop_duplicates()
locations

In [None]:
# split the ", "-separated 'location' names into one column per entry ...
location_name_list = locations['location'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (conflict_id, position) and discard the empty slots
location_name = (
    pd.concat([locations['conflict_id'], location_name_list], axis=1)
    .melt(id_vars = ['conflict_id'])
    .dropna()
    .rename(columns={'value': 'location_name'})
)
location_name

In [None]:
# split the ", "-separated 'gwno_loc' codes into one column per entry ...
location_id_list = locations['gwno_loc'].str.split(pat=", ", expand = True)
# ... then reshape to one row per (conflict_id, position) and discard the empty slots
location_id = (
    pd.concat([locations['conflict_id'], location_id_list], axis=1)
    .melt(id_vars = ['conflict_id'])
    .dropna()
    .rename(columns={'value': 'gw_id'})
)
location_id

In [None]:
# NOTE(review): `actor_mapping` is not defined in any visible cell of this notebook, so
# this cell raises NameError when the notebook is run top to bottom - confirm where it
# should come from. `location_id2` is also not used by any later visible cell.
location_id2 = location_id.drop(columns=['variable'])
location_id2 = location_id2.merge(actor_mapping, how='left', left_on='gw_id', right_on='gw_id')
location_id2

In [None]:
# LOCATION table: pair each name with the code at the same list position.
# (As noted at the top of this section, names and codes are not listed in the same
# order, so this positional pairing is not reliable.)
location_df = (
    location_name
    .merge(location_id, how="outer", on=['conflict_id', 'variable'])
    .drop(columns=['variable'])
)
location_df

In [None]:
# location names that ended up without a code after the positional pairing
location_df[location_df['gw_id'].isna()]

### region

In [None]:
df.columns

In [None]:
# distinct (conflict, region codes) combinations
regions = df[['conflict_id', 'region']].drop_duplicates()
regions

In [None]:
# lookup from region code (as split out of the string column) to region name
region_dict = {"1": "Europe", "2": "Middle East", "3": "Asia", "4": "Africa", "5": "Americas"}

# split the ", "-separated region codes into one column per entry ...
region_list = regions['region'].str.split(pat=", ", expand = True)

# ... reshape to one row per (conflict_id, region) and swap codes for names
region_df = (
    pd.concat([regions['conflict_id'], region_list], axis=1)
    .melt(id_vars = ['conflict_id'])
    .dropna()
    .rename(columns={'value': 'region'})
    .assign(region=lambda frame: frame['region'].replace(region_dict))
    .drop(columns=['variable'])
)
region_df

## Check that all columns are accounted for, put remaining variables in OBSERVATION table

In [None]:
df.columns

In [None]:
conflict_df.columns

In [None]:
episode_df.columns

In [None]:
participants_df.columns

In [None]:
location_df.columns

In [None]:
region_df.columns

In [None]:
# OBSERVATION table: the per-conflict-year columns not placed in another table.
# Take an explicit copy: columns of this frame are overwritten with labels further down,
# and assigning into a bare slice of df triggers SettingWithCopyWarning.
observation_df = df[['obs_id', 'ep_id', 'conflict_id', 'year', 'intensity_level', 'cumulative_intensity', 'type_of_conflict']].copy()
observation_df.columns

In [None]:
observation_df.cumulative_intensity.value_counts()

In [None]:
# numeric code -> label lookups for the three coded columns
type_dict = {1: "extrasystemic", 2: "interstate", 3: "internal", 4: "internationalized internal"}
intensity_dict = {1: "Minor", 2: "War"}
cum_intensity_dict = {0: "Minor", 1: "War"}

# swap each column's codes for labels; values missing from a lookup are left unchanged
for coded_column, labels in (
    ('type_of_conflict', type_dict),
    ('intensity_level', intensity_dict),
    ('cumulative_intensity', cum_intensity_dict),
):
    observation_df[coded_column] = observation_df[coded_column].replace(labels)

observation_df