In [169]:
# Use ML to predict whether or not a given constitution was written by a former UK colony.
# Inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2438983
# (e.g., idea to code countries as 1=former UK colony, 0=otherwise)
# Data on colonization from http://www.cepii.fr/PDF_PUB/wp/2011/wp2011-25.pdf
# Corpus of constitutions from https://www.poltext.org/en/constitutional-texts

# Data Cleaning

I'm basically starting from scratch here, just so that it's completely transparent (I'm not sweeping anything under the rug)

In [171]:
import os
import pandas as pd

In [172]:
const_files = sorted(os.listdir('constitutions'))

In [174]:
full_colonial_df = pd.read_stata('geo_cepii.dta')

In [175]:
full_colonial_df.head()

Unnamed: 0,iso2,iso3,cnum,country,pays,area,dis_int,landlocked,continent,city_en,...,lang9_2,lang9_3,lang9_4,colonizer1,colonizer2,colonizer3,colonizer4,short_colonizer1,short_colonizer2,short_colonizer3
0,AD,AND,20,Andorra,Andorre,453,8.005398,0.0,Europe,Andorra la Vella,...,,,,,,,,,,
1,AE,ARE,784,United Arab Emirates,Emirats arabes unis,83657,108.788994,0.0,Asia,Abu Dhabi,...,,,,GBR,,,,,,
2,AF,AFG,4,Afghanistan,Afghanistan,652225,303.761353,1.0,Asia,Kabul,...,Uzbek,,,,,,,GBR,,
3,AG,ATG,28,Antigua and Barbuda,Antigua-et-Barbuda,442,7.907605,0.0,America,Saint John's,...,,,,GBR,,,,,,
4,AI,AIA,660,Anguilla,Anguilla,102,3.79869,0.0,America,The Valley,...,,,,GBR,,,,,,


In [16]:
colonial_df = full_colonial_df[["country","pays","colonizer1"]]

In [18]:
colonial_df.head()

Unnamed: 0,country,pays,colonizer1
0,Andorra,Andorre,
1,United Arab Emirates,Emirats arabes unis,GBR
2,Afghanistan,Afghanistan,
3,Antigua and Barbuda,Antigua-et-Barbuda,GBR
4,Anguilla,Anguilla,GBR


In [49]:
# Now we try to match the files to countries in the dataset

In [50]:
file_df = pd.DataFrame(const_files, columns=["filename"])

In [51]:
file_df.head()

Unnamed: 0,filename
0,afghanistan2004.txt
1,albanie1998-2008.txt
2,algerie1989-2008.txt
3,allemagne1949-2010.txt
4,andorre1993.txt


In [52]:
# Remove the ".txt"
file_df["file_country"] = file_df["filename"].str.replace(".txt","")
# Replace "-" with " "
file_df["file_country"] = file_df["file_country"].str.replace("-"," ")
# Replace "_" with " "
file_df["file_country"] = file_df["file_country"].str.replace("_"," ")
# Remove numbers
file_df["file_country"] = file_df["file_country"].str.replace("\d","")
# Remove trailing whitespace
file_df["file_country"] = file_df["file_country"].str.strip()

In [53]:
file_series

Unnamed: 0,filename,file_country
0,afghanistan2004.txt,afghanistan
1,albanie1998-2008.txt,albanie
2,algerie1989-2008.txt,algerie
3,allemagne1949-2010.txt,allemagne
4,andorre1993.txt,andorre
5,angola2010.txt,angola
6,antigua-et-barbuda1981.txt,antigua et barbuda
7,arabie-saoudite1992.txt,arabie saoudite
8,argentine1853-1994.txt,argentine
9,armenie1995-2005.txt,armenie


In [54]:
# Now merge

In [59]:
# First we should lowercase the country name
colonial_df["pays"] = colonial_df["pays"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [66]:
merged_df = colonial_df.merge(file_df, left_on="pays", right_on="file_country")

In [67]:
# Remove duplicates
merged_df = merged_df.drop_duplicates(subset="country")

In [68]:
merged_df

Unnamed: 0,country,pays,colonizer1,filename,file_country
0,Andorra,andorre,,andorre1993.txt,andorre
1,United Arab Emirates,emirats arabes unis,GBR,emirats-arabes-unis1971-1972.txt,emirats arabes unis
2,Afghanistan,afghanistan,,afghanistan2004.txt,afghanistan
3,Albania,albanie,TUR,albanie1998-2008.txt,albanie
4,Angola,angola,PRT,angola2010.txt,angola
5,Argentina,argentine,ESP,argentine1853-1994.txt,argentine
6,Austria,autriche,,autriche1920.txt,autriche
7,Australia,australie,GBR,australie1900-1977.txt,australie
9,Barbados,barbade,GBR,barbade1966-2007.txt,barbade
10,Bangladesh,bangladesh,GBR,bangladesh1972-2011.txt,bangladesh


In [71]:
# Finally, make a 0/1 variable for whether or not they were a UK colony
merged_df["uk_col"] = (merged_df["colonizer1"] == "GBR").astype(int)

In [72]:
merged_df.head()

Unnamed: 0,country,pays,colonizer1,filename,file_country,uk_col
0,Andorra,andorre,,andorre1993.txt,andorre,0
1,United Arab Emirates,emirats arabes unis,GBR,emirats-arabes-unis1971-1972.txt,emirats arabes unis,1
2,Afghanistan,afghanistan,,afghanistan2004.txt,afghanistan,0
3,Albania,albanie,TUR,albanie1998-2008.txt,albanie,0
4,Angola,angola,PRT,angola2010.txt,angola,0


# Machine Learn!

In [78]:
import codecs

In [102]:
# Construct the list of texts
file_list = merged_df["filename"].tolist()

In [103]:
text_list = []
for cur_filename in file_list:
    print("Loading " + cur_filename)
    cur_filepath = os.path.join("constitutions",cur_filename)
    with codecs.open(cur_filepath, "r", "utf-8", errors="ignore") as f:
        cur_text = f.read().replace("\n"," ").replace("\r"," ")
        text_list.append(cur_text)

Loading andorre1993.txt
Loading emirats-arabes-unis1971-1972.txt
Loading afghanistan2004.txt
Loading albanie1998-2008.txt
Loading angola2010.txt
Loading argentine1853-1994.txt
Loading autriche1920.txt
Loading australie1900-1977.txt
Loading barbade1966-2007.txt
Loading bangladesh1972-2011.txt
Loading bulgarie1991-2007.txt
Loading burundi2005.txt
Loading bolivie2009.txt
Loading bahamas1973.txt
Loading bhoutan2008.txt
Loading botswana_1996.txt
Loading belize1981-2010.txt
Loading canada1867-1982.txt
Loading suisse1999-2011.txt
Loading chili1980-2012.txt
Loading cameroun1972-1996.txt
Loading chine1982-2004.txt
Loading colombie1991-2011.txt
Loading cuba1976-2003.txt
Loading chypre1960-1996.txt
Loading allemagne1949-2010.txt
Loading djibouti1992-2010.txt
Loading danemark1953.txt
Loading dominique1978-1984.txt
Loading equateur2008.txt
Loading estonie1992-2007.txt
Loading egypte2011.txt
Loading espagne1978-2011.txt
Loading ethiopie1995.txt
Loading finlande1999-2011.txt
Loading fidji1990-1997.tx

In [176]:
text_list[1][:500]

'United Arab Emirates  THE PROVISIONAL CONSTITUTION OF THE UNITED ARAB EMIRATES July 18, 1971, amended 1972  We, the Rulers of the Emirates of Abu Dhabi, Dubai, Sharjah, Ajman, Umm Al Qawain and Fujairah;  Whereas it is out desire and the desire of the people of our Emirates to establish a Union between these Emirates, to promote a better life, more enduring stability and a higher international status for the Emirates and their people;  Desiring to create closer links between the Arab Emirates in'

In [105]:
# Now make each text a cell within the DataFrame
merged_df["const_text"] = text_list

In [106]:
merged_df.head()

Unnamed: 0,country,pays,colonizer1,filename,file_country,uk_col,const_text
0,Andorra,andorre,,andorre1993.txt,andorre,0,Constitution of the Principality of Andorra ...
1,United Arab Emirates,emirats arabes unis,GBR,emirats-arabes-unis1971-1972.txt,emirats arabes unis,1,United Arab Emirates THE PROVISIONAL CONSTITU...
2,Afghanistan,afghanistan,,afghanistan2004.txt,afghanistan,0,"The Constitution of Afghanistan January 3,..."
3,Albania,albanie,TUR,albanie1998-2008.txt,albanie,0,"CONSTITUTION OF ALBANIA We, the people of ..."
4,Angola,angola,PRT,angola2010.txt,angola,0,REPUBLIC OF ANGOLA NATIONAL ASSEMBLY CON...


In [177]:
# Compute *hand-engineered* features.
# Begin by cleaning data
from gensim.parsing.preprocessing import preprocess_string
merged_df["const_preproc"] = merged_df["const_text"].apply(preprocess_string)

In [178]:
merged_df["const_preproc"] = merged_df["const_preproc"].apply(lambda x: " ".join(x))

In [116]:
merged_df.head()

Unnamed: 0,country,pays,colonizer1,filename,file_country,uk_col,const_text,const_preproc
0,Andorra,andorre,,andorre1993.txt,andorre,0,Constitution of the Principality of Andorra ...,constitut princip andorra consel gener princip...
1,United Arab Emirates,emirats arabes unis,GBR,emirats-arabes-unis1971-1972.txt,emirats arabes unis,1,United Arab Emirates THE PROVISIONAL CONSTITU...,unit arab emir provision constitut unit arab e...
2,Afghanistan,afghanistan,,afghanistan2004.txt,afghanistan,0,"The Constitution of Afghanistan January 3,...",constitut afghanistan januari god graciou merc...
3,Albania,albanie,TUR,albanie1998-2008.txt,albanie,0,"CONSTITUTION OF ALBANIA We, the people of ...",constitut albania peopl albania proud awar his...
4,Angola,angola,PRT,angola2010.txt,angola,0,REPUBLIC OF ANGOLA NATIONAL ASSEMBLY CON...,republ angola nation assembl constitu assembl ...


In [117]:
# Here are the "hand-engineered" features
merged_df["const_len"] = merged_df["const_text"].apply(len)

In [125]:
merged_df["num_free"] = merged_df["const_text"].str.count("free")
merged_df["num_just"] = merged_df["const_text"].str.count("just")
merged_df["num_lib"] = merged_df["const_text"].str.count("liber")
merged_df["prop_free"] = merged_df["num_free"] / merged_df["const_len"]
merged_df["prop_just"] = merged_df["num_just"] / merged_df["const_len"]
merged_df["prop_lib"] = merged_df["num_lib"] / merged_df["const_len"]

In [126]:
merged_df.head()

Unnamed: 0,country,pays,colonizer1,filename,file_country,uk_col,const_text,const_preproc,const_len,num_freedom,num_free,num_just,num_lib,prop_free,prop_just,prop_lib
0,Andorra,andorre,,andorre1993.txt,andorre,0,Constitution of the Principality of Andorra ...,constitut princip andorra consel gener princip...,62244,28,28,11,5,0.00045,0.000177,8e-05
1,United Arab Emirates,emirats arabes unis,GBR,emirats-arabes-unis1971-1972.txt,emirats arabes unis,1,United Arab Emirates THE PROVISIONAL CONSTITU...,unit arab emir provision constitut unit arab e...,67760,9,9,5,7,0.000133,7.4e-05,0.000103
2,Afghanistan,afghanistan,,afghanistan2004.txt,afghanistan,0,"The Constitution of Afghanistan January 3,...",constitut afghanistan januari god graciou merc...,69973,14,14,9,3,0.0002,0.000129,4.3e-05
3,Albania,albanie,TUR,albanie1998-2008.txt,albanie,0,"CONSTITUTION OF ALBANIA We, the people of ...",constitut albania peopl albania proud awar his...,87562,42,42,9,9,0.00048,0.000103,0.000103
4,Angola,angola,PRT,angola2010.txt,angola,0,REPUBLIC OF ANGOLA NATIONAL ASSEMBLY CON...,republ angola nation assembl constitu assembl ...,180777,80,80,37,8,0.000443,0.000205,4.4e-05


In [144]:
# Split the data into training and test data [by splitting the indices]
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

In [145]:
print("Training size: " + str(len(train_df)))
print("Test size: " + str(len(test_df)))

Training size: 96
Test size: 24


In [146]:
feature_vars = ["const_len","prop_free","prop_just","prop_lib"]
label_var = "uk_col"
train_features = train_df[feature_vars]
train_labels = train_df[label_var]
test_features = test_df[feature_vars]
test_labels = test_df[label_var].tolist()

In [154]:
from sklearn.naive_bayes import GaussianNB
def naiveBayesClassify(train_features, train_labels, test_features, test_labels):
    gnb = GaussianNB()
    y_pred = gnb.fit(train_features, train_labels).predict(test_features)
    print("Predicted labels: " + str(y_pred))
    print("Actual test labels: " + str(test_labels))
    num_test_obs = len(test_df)
    mislabeled_obs = (test_labels != y_pred).sum()
    accuracy = 1 - (mislabeled_obs/num_test_obs)
    print(str(mislabeled_obs) + " mislabeled obs out of " + str(num_test_obs) + " total test observations")
    print("Accuracy = " + str(accuracy))
naiveBayesClassify(train_features, train_labels, test_features, test_labels)

Predicted labels: [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
Actual test labels: [0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
7 mislabeled obs out of 24 total test observations
Accuracy = 0.708333333333


In [156]:
# Now we try fancier Ngram features
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

In [164]:
train_counts = count_vect.fit_transform(train_df["const_preproc"]).toarray()
test_counts = count_vect.transform(test_df["const_preproc"]).toarray()

In [165]:
print(train_counts.shape)
print(test_counts.shape)

(96, 17177)
(24, 17177)


In [166]:
naiveBayesClassify(train_counts, train_labels, test_counts, test_labels)

Predicted labels: [0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0]
Actual test labels: [0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]
3 mislabeled obs out of 24 total test observations
Accuracy = 0.875


In [167]:
import numpy as np
# Baseline 1: random guessing
random_test_labels = np.random.choice([0,1], size=(len(test_df)))

In [168]:
mislabeled_random = (random_test_labels != y_pred).sum()
accuracy = 1 - (mislabeled_random/num_test_obs)

array([1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0])