In [1]:
import os
import re
import csv
import sys
import random
import ast
from time import sleep
from pandas import DataFrame, to_numeric
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics, tree, cross_validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RandomizedLogisticRegression

csv.field_size_limit(sys.maxsize)

131072

## Load and PreProcess

In [2]:
# read in full csv
recs = []
with open('data/upr-info-issues.csv','r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        recs.append(row)
len(recs) # 41066 

41066

In [3]:
# remove general / other recs
bad = []
for i, rec in enumerate(recs):
    for key in ['General', 'Other']:
        if rec[key] == '1':
            bad.append(i)
        del rec[key]
len(bad)

1611

In [4]:
recs_sub = [i for j, i in enumerate(recs) if j not in bad]
len(recs_sub)

39455

In [5]:
# turn to dataframe
data = DataFrame(recs_sub)
data.shape

(39455, 53)

In [7]:
data

Unnamed: 0,Asylum.seekers...refugees,CP.rights...general,Civil.society,Corruption,Counter.terrorism,Death.penalty,Detention,Development,Disabilities,ESC.rights...general,...,Rights.of.the.Child,Sexual.Orientation.and.Gender.Identity,Special.procedures,Technical.assistance.and.cooperation,Text,Torture.and.other.CID.treatment,Trafficking,Treaty.bodies,UPR.process,Women.s.rights
0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,Consider the possibility of acceding to the In...,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,Establish a national institution to promote an...,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,Consider the possibility of establishing a nat...,0,0,0,0,0
3,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,Consider the possibility of ratifying the Inte...,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"Continue with the efforts to prevent, punish a...",0,0,0,0,1
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,Accede to the Second Optional Protocol to the ...,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,Improve conditions in Antigua and Barbuda's pr...,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,Consider abolishing the death penalty,0,0,0,0,0
8,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,Consider acceding to the International Covenan...,1,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,Consider taking necessary measures aimed at pr...,1,0,0,0,0


In [6]:
# get text and target
text = data['Text'].values
target = data.drop('Text', 1) # drop text
target = target.apply(lambda x: to_numeric(x, errors='ignore')) # turn to numeric
target = np.array(target) # turn into numpy array

In [8]:
target

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# get training + test data
train_text, test_text, train_target, test_target = cross_validation.train_test_split(
    text, target, test_size=0.2, random_state=40)
len(train_text)

31564

In [14]:
# get target (label) names
target_names = list(data.drop('Text', 1).columns.values)
target_names

['Asylum.seekers...refugees',
 'CP.rights...general',
 'Civil.society',
 'Corruption',
 'Counter.terrorism',
 'Death.penalty',
 'Detention',
 'Development',
 'Disabilities',
 'ESC.rights...general',
 'Elections',
 'Enforced.disappearances',
 'Environment',
 'Extrajudicial.executions',
 'Freedom.of.association.and.peaceful.assembly',
 'Freedom.of.movement',
 'Freedom.of.opinion.and.expression',
 'Freedom.of.religion.and.belief',
 'Freedom.of.the.press',
 'HIV...Aids',
 'Human.rights.defenders',
 'Human.rights.education.and.training',
 'Human.rights.violations.by.state.agents',
 'Impunity',
 'Indigenous.peoples',
 'Internally.displaced.persons',
 'International.humanitarian.law',
 'International.instruments',
 'Justice',
 'Labour',
 'Migrants',
 'Minorities',
 'NHRI',
 'National.plan.of.action',
 'Poverty',
 'Public.security',
 'Racial.discrimination',
 'Right.to.education',
 'Right.to.food',
 'Right.to.health',
 'Right.to.housing',
 'Right.to.land',
 'Right.to.water',
 'Rights.of.the.Ch

## Pipelines

In [15]:
# build a pipeline - SVC
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', OneVsRestClassifier(LinearSVC(random_state=0)))
                     ])

In [16]:
# fit using pipeline
clf = text_clf.fit(train_text, train_target)

In [37]:
# # build a pipeline - TREE
# from sklearn.pipeline import Pipeline
# text_clf_tree = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', OneVsRestClassifier(tree.DecisionTreeClassifier()))
#                      ])

In [38]:
# fit using pipeline
# clf_tree = text_clf_tree.fit(train_text, train_target)

## Predicting

In [17]:
# predict
predicted = clf.predict(test_text)
clf.score(test_text, test_target) 

0.76682296286909135

In [26]:
predicted == test_target

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ..., 
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ..., False,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]], dtype=bool)

In [18]:
# mean agreement
np.mean(predicted == test_target)

0.99414864061296704

In [25]:
predicted

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# print labels
for doc, labels in zip(list(test_text[:50]), predicted[:50]):
    print('%r => %s' % (doc, ", ".join(list(np.array(target_names)[labels==1]))))

'Ratify the Convention on the Elimination of All Forms of Discrimination against Women' => International.instruments, Women.s.rights
'Promptly approve legislation establishing an NHRI in full compliance with the Paris Principles' => NHRI
'Establish an independent national human rights institution in accordance with the Paris Principles.' => NHRI
'Accede to the Rome Statute of the International Criminal Court' => International.instruments, Justice
'Ratify the International Convention for the Protection of All Persons from Enforced Disappearance' => Enforced.disappearances, International.instruments
'End the kidnapping of persons, whatever their country of origin may be' => 
'Continue firmly to combat the scourge of sexual violence and impunity and bring to justice those responsible for grave human rights and international humanitarian law violations' => Impunity, International.humanitarian.law, Women.s.rights
'Intensify its efforts and measures to strengthen the rule of law' => Justice


In [20]:
# print metrics
print(metrics.classification_report(test_target, predicted,
    target_names=target_names)) 

                                              precision    recall  f1-score   support

                   Asylum.seekers...refugees       0.97      0.89      0.93       140
                         CP.rights...general       0.89      0.76      0.82       118
                               Civil.society       0.93      0.87      0.90       159
                                  Corruption       1.00      0.93      0.96        44
                           Counter.terrorism       1.00      0.53      0.69        47
                               Death.penalty       1.00      0.95      0.98       357
                                   Detention       0.94      0.87      0.91       487
                                 Development       0.87      0.68      0.76       183
                                Disabilities       0.96      0.94      0.95       290
                        ESC.rights...general       0.88      0.82      0.85       240
                                   Elections       0.

In [21]:
metrics.confusion_matrix(test_target, predicted, labels=target_names)

ValueError: multilabel-indicator is not supported

In [155]:
## cross validation
scores = cross_validation.cross_val_score(
   text_clf, text, target, cv=5)
scores

array([ 0.74135091,  0.73552148,  0.74071727,  0.72880497,  0.74832087])

In [156]:
from sklearn.metrics import coverage_error
coverage_error(test_target, predicted)

11.443923457103029

## Parameters

In [22]:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
              'tfidf__use_idf': (True, False),
}

In [158]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [159]:
gs_clf = gs_clf.fit(train_text, train_target)

In [160]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [161]:
score

0.73400076035990369

In [162]:
gs_clf.grid_scores_

[mean: 0.73020, std: 0.00459, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True},
 mean: 0.73400, std: 0.00256, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True},
 mean: 0.72288, std: 0.00275, params: {'vect__ngram_range': (1, 3), 'tfidf__use_idf': True},
 mean: 0.71623, std: 0.00140, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False},
 mean: 0.72500, std: 0.00309, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False},
 mean: 0.72266, std: 0.00339, params: {'vect__ngram_range': (1, 3), 'tfidf__use_idf': False}]