In [90]:
import os
import re
import csv
import sys
import random
import ast
from time import sleep
from pandas import DataFrame, to_numeric
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics, tree, cross_validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

csv.field_size_limit(sys.maxsize)

9223372036854775807

## Load and PreProcess

In [91]:
# read in full csv
recs = []
with open('data/upr.csv','r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        recs.append(row)
len(recs) # 41066 

41066

In [92]:
# turn into a list
for i in recs:
    issues = i['Issue'].split(',')
    i['Issue'] = [x for x in issues if x != 'Other' and x != 'General']       

In [95]:
# test
issues = []
for i in recs:
    for issue in i['Issue']:
        if issue in issues:
            pass
        else:
            issues.append(issue)
len(issues)

52

In [96]:
# remove empty texts
rec_sub = [i for i in recs if i['Issue']]
print("Number of recs:", len(rec_sub))

Number of recs: 39475


In [97]:
# turn to dataframe
data = DataFrame(rec_sub)
print(data.shape)

(39475, 2)


In [110]:
# get text and label data
text = data['Text'].values
labels = data['Issue'].values

In [117]:
# binarize labels
mlb = MultiLabelBinarizer()
labels_binary = mlb.fit_transform(labels)
print(labels_binary)

[[0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [118]:
# get training + test data
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    text, labels_binary, test_size=0.2, random_state=40)
print("Number of training data observations:", len(X_train))

Number of training data observations: 31580


In [121]:
# get target (label) names
label_names = list(mlb.classes_)
print(label_names)

['Asylum-seekers - refugees', 'CP rights - general', 'Civil society', 'Corruption', 'Counter-terrorism', 'Death penalty', 'Detention', 'Development', 'Disabilities', 'ESC rights - general', 'Elections', 'Enforced disappearances', 'Environment', 'Extrajudicial executions', 'Freedom of association and peaceful assembly', 'Freedom of movement', 'Freedom of opinion and expression', 'Freedom of religion and belief', 'Freedom of the press', 'HIV - Aids', 'Human rights defenders', 'Human rights education and training', 'Human rights violations by state agents', 'Impunity', 'Indigenous peoples', 'Internally displaced persons', 'International humanitarian law', 'International instruments', 'Justice', 'Labour', 'Migrants', 'Minorities', 'NHRI', 'National plan of action', 'Poverty', 'Public security', 'Racial discrimination', 'Right to education', 'Right to food', 'Right to health', 'Right to housing', 'Right to land', 'Right to water', 'Rights of the Child', 'Sexual Orientation and Gender Identi

## Pipelines

In [122]:
# build a pipeline - SVC
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', OneVsRestClassifier(LinearSVC(random_state=0)))
                     ])

In [123]:
# fit using pipeline
clf = text_clf.fit(X_train, y_train)

## Predicting

In [124]:
# predict
predicted = clf.predict(X_test)
clf.score(X_test, y_test) 

0.7584547181760608

In [126]:
# mean agreement
np.mean(predicted == y_test)

0.99388853704876501

In [131]:
for doc, label in zip(list(X_test[:50]), predicted[:50]):
    print('%r => %s' % (doc, ", ".join(list(np.array(label_names)[label==1]))))

"Take stronger measures to combat discrimination in both the public and private sectors while promoting greater women's participation at the highest levels of decision-making" => Women's rights
'Submit all pending reports to the respective United Nations treaty bodies, namely, the Committee on Economic, Social and Cultural Rights, the Human Rights Committee and the Committee on the Rights of the Child' => ESC rights - general, Treaty bodies
'Ratify CRPD' => Disabilities, International instruments
'Ensure that the new Constitution fully guarantees the right to freedom of religion or belief and the right to equality and non-discrimination in line with international standards' => Freedom of religion and belief
'Establish a moratorium on executions with a view to abolishing the death penalty' => Death penalty
'Provide effective guarantee for the rights of Roma in the fields of education, employment and housing' => Minorities, Right to education, Right to housing
'Continue intensifying effo

In [133]:
# print metrics
print(metrics.classification_report(y_test, predicted,
    target_names=label_names)) 

                                              precision    recall  f1-score   support

                   Asylum-seekers - refugees       0.95      0.80      0.87       120
                         CP rights - general       0.88      0.79      0.83       115
                               Civil society       0.93      0.88      0.91       155
                                  Corruption       0.95      0.85      0.90        47
                           Counter-terrorism       1.00      0.65      0.79        26
                               Death penalty       0.99      0.94      0.96       379
                                   Detention       0.93      0.89      0.91       471
                                 Development       0.80      0.58      0.67       179
                                Disabilities       0.96      0.95      0.96       283
                        ESC rights - general       0.90      0.80      0.85       228
                                   Elections       0.

In [136]:
## cross validation
scores = cross_validation.cross_val_score(
   text_clf, text, labels_binary, cv=5)
scores

array([ 0.74122863,  0.73552882,  0.7409753 ,  0.72868904,  0.74756175])

In [156]:
from sklearn.metrics import coverage_error
coverage_error(y_test, predicted)

11.443923457103029

## Parameters

In [137]:
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
              'tfidf__use_idf': (True, False),
}

In [138]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [139]:
gs_clf = gs_clf.fit(X_train, y_train)

In [140]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

tfidf__use_idf: True
vect__ngram_range: (1, 2)


In [141]:
score

0.7328372387587081

In [142]:
gs_clf.grid_scores_

[mean: 0.72970, std: 0.00508, params: {'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)},
 mean: 0.73284, std: 0.00197, params: {'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)},
 mean: 0.72217, std: 0.00214, params: {'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)},
 mean: 0.71482, std: 0.00466, params: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)},
 mean: 0.72289, std: 0.00223, params: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)},
 mean: 0.72131, std: 0.00241, params: {'tfidf__use_idf': False, 'vect__ngram_range': (1, 3)}]