## Compiling the complete UN dataset with input text, training classifiers with scikit-learn

In [1]:
import glob 

# working dir: UN/
# read all files in directory into a list, where 1 file = 1 list element
list_of_meetings = []
for filename in glob.glob('corpus/meeting_records_clean_final/*.txt'):
     with open(filename, 'r') as f:
         list_of_meetings.append(f.read())
         f.close()

In [2]:
len(list_of_meetings) # length of list should equal number of files = 1236 

1236

In [3]:
# clean text data within the list
list_of_meetings = [x.replace('\n', ' ') for x in list_of_meetings]
list_of_meetings = [x.replace('_', ' ') for x in list_of_meetings]

In [4]:
# create pandas Series from list 
import pandas as pd
meetingseries = pd.Series(list_of_meetings)

In [5]:
# create DF from Series
meetingframe = meetingseries.to_frame('meeting_text') # create df, name column "meeting_text"

In [6]:
# set index for dataframe merge
meetingframe['id'] = meetingframe.index

In [7]:
# read in working CSV file
# working dir: UN/
recs = pd.read_table('clean_records_copy.csv', sep=',')

In [8]:
# set index for df merge
recs['id'] = recs.index

In [9]:
# Merge on 'id' column
full_data = pd.merge(recs, meetingframe)

In [10]:
# drop 'id' column
full_data.drop('id', axis=1, inplace=True)

In [11]:
# explore & check dataframe
full_data.head()

Unnamed: 0,record,day,press_release,topic,year,category,outcome,region,record_number,meeting_text
0,S/PV.3326,6-Jan,SC/5770,Rwanda,1994,0,S/RES/893 (1994),4,3326,ÔªøUNITED\r NATIONS\r S\r Security Council\r P...
1,S/PV.3329,14-Jan,SC/5775,South Africa,1994,0,S/RES/894 (1994),4,3329,ÔªøUNITED\r NATIONS\r S\r Security Council\r P...
2,S/PV.3331,28-Jan,SC/5779,Middle East UNIFIL,1994,0,S/RES/895 (1994),3,3331,UNITED SNATIONS Security Council PROVISIONAL S...
3,S/PV.3332,31-Jan,SC/5780,Georgia,1994,0,S/RES/896 (1994),5,3332,UNITED SNATIONS Security Council PROVISIONAL S...
4,S/PV.3334,4-Feb,SC/5782,Somalia,1994,1,S/RES/897 (1994),4,3334,UNITEDUNITED SNATIONSNATIONS Security Council ...


## Training a Naive Bayes classifier using scikit-learn

In [12]:
# Train test split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(full_data.meeting_text, full_data.category, random_state=1)

In [13]:
X_train.shape # 927

(927,)

In [14]:
X_test.shape # 309

(309,)

In [15]:
# tokenizing text
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
vect = CountVectorizer(ngram_range=(5,5), stop_words = 'english', min_df=2) # set ngram range of 5
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)

In [18]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)

In [19]:
y_pred_class = nb.predict(test_dtm)

In [20]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class) # 0.75

0.75728155339805825

In [25]:
# 57% for ngram_range 1,2
# 66% with ngram_range 2,3 
# 69% with ngram_range 2,5
# 74% with ngram_range 2,5, stopwords included
# 70% with ngram_range 1,5, stopwords included
# 75% with ngram_range 5,5, stopwords included

In [26]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[126,  63],
       [ 12, 108]])

In [27]:
y_pred_prob = nb.predict_proba(test_dtm)[:, 1]
print y_pred_prob # to check
metrics.roc_auc_score(y_test, y_pred_prob) # 0.82

[  2.04026494e-205   1.40734966e-003   1.00000000e+000   9.97027111e-001
   1.00000000e+000   2.58601125e-098   3.98003291e-013   1.54813887e-052
   1.05754053e-045   1.00000000e+000   1.00000000e+000   9.83396225e-028
   1.43234278e-031   1.00000000e+000   1.00000000e+000   1.14944079e-009
   1.50890175e-067   1.00000000e+000   2.14226955e-089   9.81912299e-001
   1.00000000e+000   1.00000000e+000   8.50817877e-017   1.00000000e+000
   4.60969995e-038   1.00000000e+000   1.00000000e+000   1.00000000e+000
   4.65402516e-076   1.00000000e+000   1.00000000e+000   1.29945896e-084
   1.00000000e+000   1.00000000e+000   1.00000000e+000   2.89966014e-010
   1.00000000e+000   2.01657568e-010   2.72060552e-005   2.33892765e-063
   1.00000000e+000   8.53149977e-043   1.75974405e-265   1.00000000e+000
   2.67130560e-017   1.63627894e-015   8.89755056e-039   1.12216060e-047
   1.00000000e+000   1.45634535e-066   1.64615626e-044   1.39859585e-216
   1.00000000e+000   9.99544386e-001   1.00000000e+

0.82028218694885358

In [28]:
# false negatives
X_test[y_test < y_pred_class]

1041    United Nations S/PV.6512 Security Council Sixt...
737     United Nations S/PV.5401 Security Council Sixt...
581     United Nations S/PV.4819 Security Council Fift...
694     United Nations S/PV.5247 Security Council Sixt...
419     United Nations S/PV.4268 Security Council Fift...
940     United Nations S/PV.6116 Security Council Sixt...
1079    United Nations S/PV.6644 Security Council Sixt...
1024    United Nations S/PV.6451 Security Council Sixt...
897     United Nations S/PV.5946 Security Council Sixt...
527     United Nations S/PV.4667 Security Council Fift...
576     United Nations S/PV.4807 Security Council Fift...
868     United Nations S/PV.5829 Security Council Sixt...
873     United Nations S/PV.5848 Security Council Sixt...
909     United Nations S/PV.5992 Security Council Sixt...
1126    This record contains the text of speeches deli...
1189    United Nations S/PV.7086 asdfSecurity Council ...
771     United Nations S/PV.5507 Security Council Sixt...
741     United

In [29]:
# false positives
X_test[y_test > y_pred_class]

# more false negatives, fewer false positives
# higher specificity, lower sensitivity

169    United Nations S/PV.3700 96-86353 (E) *9686353...
309    United Nations S/PV.3965 99-85015 (E) This rec...
350    United Nations S/PV.4050 99-86016 (E) This rec...
49     United Nations S/PV.3430 94-86467 (E) This rec...
335    United Nations S/PV.4014 99-85468 (E) This rec...
6      United Nations S/PV.3343 94-85240 (E) This rec...
47     United Nations S/PV.3413 94-86158 (E) This rec...
422    United Nations S/PV.4282 Security Council Fift...
267    United Nations S/PV.3894 98-85490 (E) This rec...
325    United Nations S/PV.3999 99-85348 (E) This rec...
348    United Nations S/PV.4045 99-85792 (E) This rec...
134    United Nations S/PV.3619 96-85051 (E) This rec...
Name: meeting_text, dtype: object

## Most informative features:

In [30]:
# Code source: Stack Overflow
# http://stackoverflow.com/questions/11116697/how-to-get-most-informative-features-for-scikit-learn-classifiers
# Used to get most informative features for linear models in scikit-learn
# Step 1. Define function
def show_most_informative_features(vectorizer, classifer, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(nb.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)

In [31]:
# Step 2. Call function using our parameters:
print show_most_informative_features(vect, nb)

	-11.9945	000 people lost lives endless		-5.1121	kingdom great britain northern ireland
	-11.9945	04 2014 meeting called order		-5.1141	united kingdom great britain northern
	-11.9945	07 2014 meeting called order		-5.6974	great britain northern ireland united
	-11.9945	07 2014 president floor representative		-5.8568	northern ireland united states america
	-11.9945	10 20 meeting called order		-5.8568	britain northern ireland united states
	-11.9945	10 20 new york president		-6.0258	sent signature member delegation concerned
	-11.9945	10 2013 meeting called order		-6.0284	security council corrections submitted original
	-11.9945	10 despite present quiet israeli		-6.0284	records security council corrections submitted
	-11.9945	10 president like inform council		-6.0309	text printed official records security
	-11.9945	10 year capacity building plan		-6.0309	printed official records security council
	-11.9945	10 years security council äôs		-6.0309	official records security council correction

## Using logistic regression:

In [33]:
''' Logistic Regression '''

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(train_dtm, y_train)
y_pred_class = logreg.predict(test_dtm)
print metrics.accuracy_score(y_test, y_pred_class) # 84%

0.847896440129
