In [1]:
import pandas as pd
import numpy as np

from tqdm._tqdm_notebook import tqdm_notebook
# If error -> pip install tqdm

#### 1. Data import

In [2]:
conflict_df = pd.read_csv('datasets/ucdp-prio-acd-171.csv')
conflict_df.head()

Unnamed: 0,conflictid,location,sidea,sidea2nd,side b,sidebid,sideb2nd,incomp,terr,year,...,epend,ependdate,ependprec,gwnoa,gwnoa2nd,gwnob,gwnob2nd,gwnoloc,region,version
0,200,Bolivia,Government of Bolivia,,Popular Revolutionary Movement,719,,2,,1946,...,1,1946-07-21,1.0,145,,,,145,5,17.1
1,200,Bolivia,Government of Bolivia,,MNR,720,,2,,1952,...,1,1952-04-12,1.0,145,,,,145,5,17.1
2,200,Bolivia,Government of Bolivia,,ELN,721,,2,,1967,...,1,1967-10-16,1.0,145,,,,145,5,17.1
3,201,Cambodia (Kampuchea),Government of France,,Khmer Issarak,160,,1,Cambodia,1946,...,0,,,220,,,,811,3,17.1
4,201,Cambodia (Kampuchea),Government of France,,Khmer Issarak,160,,1,Cambodia,1947,...,0,,,220,,,,811,3,17.1


#### 2. Dataframe process

- remove useless columns
- remove all "Government of..."
- remove text between parantheses
- complete/correct "terr" columns

In [3]:
# Remove useless columns
columns_to_drop = ['conflictid', 'sidebid', 'incomp', 'terr', 'startprec', 'sidea2nd',
                   'startprec2', 'epend', 'ependprec', 'gwnoa', 'gwnoa2nd', 'sideb2nd',
                   'gwnob', 'gwnob2nd', 'gwnoloc', 'version']

conflict_df.drop(columns_to_drop, axis=1, inplace=True)

# Remove columns
conflict_df.replace({'Government of ': ''}, regex=True, inplace=True)
# Remove text between parentheses 
conflict_df.location.replace({r'\([^()]+\)' : ''}, regex=True, inplace=True)
# Remove South/North from Location (ex: South Yemen or South Korea)
conflict_df.location.replace({'South ' : ''}, regex=True, inplace=True)
conflict_df.location.replace({'North ' : ''}, regex=True, inplace=True)
# But we keep South Africa
conflict_df.location.replace({'Africa' : 'South Africa'}, regex=True, inplace=True)
conflict_df.location.replace({'Central South African Republic' : 'Central African Republic'}, regex=True, inplace=True)

conflict_df.location.replace({'Congo' : 'DR Congo'}, regex=True, inplace=True)
conflict_df.location.replace({'DR DR' : 'DR'}, regex=True, inplace=True)


conflict_df.head()

Unnamed: 0,location,sidea,side b,year,intensity,cumint,type,startdate,startdate2,ependdate,region
0,Bolivia,Bolivia,Popular Revolutionary Movement,1946,2,1,3,1946-07-18,1946-07-21,1946-07-21,5
1,Bolivia,Bolivia,MNR,1952,1,1,3,1946-07-18,1952-04-09,1952-04-12,5
2,Bolivia,Bolivia,ELN,1967,1,1,3,1946-07-18,1967-03-31,1967-10-16,5
3,Cambodia,France,Khmer Issarak,1946,1,0,1,1946-08-31,1946-08-31,,3
4,Cambodia,France,Khmer Issarak,1947,1,0,1,1946-08-31,1946-08-31,,3


In [4]:
columns = ["location", "sidea", "side b", "year", "intensity", "cumint", "type", 
           "startdate", "startdate2", "ependdate", "region"]

conflict_df_tmp = pd.DataFrame(columns=columns)

for index in range(len(conflict_df)):

    locations = conflict_df.loc[index, 'location'].split(",")
    regions = conflict_df.loc[index, 'region'].split(",")
    
    if len(regions) > 1:
        
        pairs = [(locations[i], regions[i]) for i in range(len(locations))]

        for i, pair in enumerate(pairs):

            conflict_row_df = conflict_df.loc[index:index,]
            
            if i > 0: 
                conflict_row_df.set_value(index, 'location', pair[0][1:])
                conflict_row_df.set_value(index, 'region', pair[1])
                #conflict_row_df.loc[index, 'location'] = pair[0][1:]
                #conflict_row_df.loc[index, 'region']   = pair[1]
            else: 
                conflict_row_df.set_value(index, 'location', pair[0])
                conflict_row_df.set_value(index, 'region', pair[1])
                
            # Add the new line to the df
            conflict_df_tmp = conflict_df_tmp.append(conflict_row_df)
        
        # Remove initial line
        conflict_df.drop(index, inplace=True)#.reset_index(drop=True)
                  
    elif len(locations) > 1:
        conflict_df.loc[index, 'location'] = locations[0]
                
            
#test      
conflict_df = pd.concat([conflict_df,conflict_df_tmp]).reset_index(drop=True)
conflict_df = conflict_df[columns]

#### 3. Keep ex-colonies


In [5]:
colonized_df = pd.read_csv('datasets/colonies_wikipedia.csv')
colonized_countries = list(colonized_df["Colonized Country"])
colonized_df.head()

Unnamed: 0.1,Unnamed: 0,Colonized Country,ID,Day,Month,Year,Colonizer Country,URL
0,1.0,Iceland,IS,17.0,6.0,1944.0,Denmark,/wiki/Iceland
1,2.0,Canada,CA,1.0,7.0,1867.0,United Kingdom,/wiki/Canada
2,3.0,United States,US,4.0,7.0,1776.0,France,/wiki/United_States
3,4.0,Haiti,HT,1.0,1.0,1804.0,France,/wiki/Haiti
4,5.0,Benin,BJ,1.0,8.0,1960.0,France,/wiki/Benin


In [6]:
columns = ["location", "ID", "sidea", "side b", "year", "intensity", "cumint", "type", 
           "startdate", "startdate2", "ependdate", "region"]

pre_colonization_conflict_df = pd.DataFrame(columns=columns)
post_colonization_conflict_df = pd.DataFrame(columns=columns)

for index in tqdm_notebook(range(len(conflict_df))):
    country = conflict_df.loc[index, 'location']
    
    if country[-1] == ' ': country = country[:-1]
    
    if country in colonized_countries or country + " " in colonized_countries:
        indep_year = int(colonized_df[colonized_df['Colonized Country']==country]['Year'])
        start_year = int(conflict_df.loc[index, 'startdate'].split("-")[0])

        conflict_df_tmp = conflict_df.loc[index:index,]
        ID = list(colonized_df[colonized_df['Colonized Country']==country]['ID'])[0]
        conflict_df_tmp.set_value(index, 'ID', ID)
        
        if start_year > indep_year:
            post_colonization_conflict_df = post_colonization_conflict_df.append(conflict_df_tmp)
        else:
            pre_colonization_conflict_df = pre_colonization_conflict_df.append(conflict_df_tmp)
    #else:
    #    print(country, "--> NOT EX COLONY")
        
post_colonization_conflict_df.reset_index(drop=True)
post_colonization_conflict_df = post_colonization_conflict_df[columns]
post_colonization_conflict_df.to_csv("datasets/post_colonization_conflict.csv")

pre_colonization_conflict_df.reset_index(drop=True)
pre_colonization_conflict_df = pre_colonization_conflict_df[columns]
post_colonization_conflict_df.to_csv("datasets/pre_colonization_conflict.csv")

Widget Javascript not detected.  It may not be installed or enabled properly.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s





In [7]:
print("Conflict pre coloinzation :", len(pre_colonization_conflict_df))
print("Conflict post coloinzation :", len(post_colonization_conflict_df))

Conflict pre coloinzation : 374
Conflict post coloinzation : 1405
