In [30]:
#!/Users/jessica/anaconda/lib/python3.5

import sklearn
import pandas as pd
import numpy as np

In [31]:
debating_data = pd.read_csv("/Users/jessica/GitHub/data-science/data/debatingmotions_sorted.csv")
header = debating_data.columns.values
debating_data = np.array(debating_data)
debating_data.shape

(1080, 22)

In [32]:
debating_data[:total_labelled,16]

array(['THBT social disgust is sufficient justification for censoring art.',
       'THR the Rise of the Rom-Com Genre.',
       'THBT after 50 years of ownership the state should have the right to repossess all works of painting or sculpture for the purpose of public display',
       'THW introduce a tax on "low brow" entertainment in order to subsidise "high brow" art.',
       'TH regrets the rise of art that celebrates gaining material wealth',
       'THW prohibit private ownership of art deemed to be culturally or historically significant',
       'Assuming there are actors who fit the criteria: THIS HOUSE BELIEVES That film directors have a moral obligation to cast actors who share key lived experiences with the characters they play.',
       'THS Chris Rock’s continued commitment to host the Oscars',
       'TH would prohibit businesses from taking a public position on social issues',
       'THW disallow exclusive private arbitration for contracts between corporations and indi

In [33]:
# Sheet variables
total_labelled = 188
train_size = 120
test_size = total_labelled - train_size
motion_col = 16
infoslide_col = 17
category_col_start = 18
category_col_end_plus_one = 21

In [34]:
# Shuffle labelled data 
labelled_data = debating_data[:total_labelled,:]
unlabelled_data = debating_data[total_labelled:,:]

# Check unlabelled data really is unlabelled
print('Is this unlabelled?', '\n', unlabelled_data[0], '\n')

# Shuffle data
np.random.shuffle(labelled_data)
np.random.shuffle(unlabelled_data)
print(labelled_data[:3])

Is this unlabelled? 
 ['2016-01-23' 'IoNA' 'United Kingdom' 0 'York IV' 'Bethany Garry'
 'Jennie Hope' 'Nissim Massarano' nan nan nan nan nan nan '2' '2'
 'This house believes that the Labour Party should have worked to rehabilitate Tony Blair’s image in its campaigning, prior to the 2015 General Election.'
 nan nan nan nan nan] 

[['2016-01-22' 'North America and Canada' 'Canada' 0 'NorthAms'
  'Veenu Goswami' 'Alex Mechanick' 'Matt Summers' 'Aislin Flynn' nan nan
  nan nan nan '4' '4' 'THW ban eminent domain rights' nan 'Freedoms' nan
  nan nan]
 ['2014-08-19' 'Europe' 'Zagreb, Croatia' 2 'Zagreb EUDC'
  'Rebecca Meredith' 'Milan Vignjevic' 'Alon van Dam' 'Ashish Kumar'
  'Engin Arikan' 'Maja Cimerman' 'Sella Nevo' 'Tomas Beerthuis' nan '2' '2'
  'This house supports two-party political systems over multi-party political systems.'
  nan 'Politics' nan nan nan]
 ['2015-08-03' 'Europe' 'Austria' 2 'EUDC' 'Christine Simpson'
  'Michael Shapira' 'Amanda Moorghen' 'Emilia Carlqvist' 'Gavi

### Convert labels to binary vectors

In [35]:
Z = labelled_data[:,category_col_start:category_col_end_plus_one]

In [36]:
labels = [
'Art and Culture',
'Business',
'Criminal Justice System',
'Development',
'Economics','Education',
'Environment',
'Family',
'Feminism',
'Freedoms',
'Funny',
'International Relations',
'LGBT+',
'Media',
'Medical Ethics',
'Minority Communities',
'Morality',
'Politics',
'Religion',
'Science and Technology',
'Security, War and Military',
'Social Policy',
'Social Movements',
'Sports',
'Terrorism',
'The Human Experience'
]

len_labels = len(labels)
print(len_labels)

26


In [37]:
Y_vector = np.zeros(shape=(total_labelled,len_labels))
for i in range(total_labelled):
    for j in range(category_col_end_plus_one - category_col_start):
        for k in range(len_labels):
            if Z[i,j] == labels[k]:
                Y_vector[i,k] = 1

In [38]:
# Transform Ys into binary
Y_dict = dict.fromkeys(labels)
for i in range(len_labels):
    Y_dict[labels[i]] = Y_vector[:,i]

In [39]:
Y_dict['Art and Culture']

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0

In [40]:
# Choose category to train for
chosen_element = 'International Relations'
Y_for_element = Y_dict[chosen_element]

In [41]:
# Configure training set

X = labelled_data[:train_size,motion_col]
Y = Y_for_element[:train_size]

In [42]:
X

array(['THW ban eminent domain rights',
       'This house supports two-party political systems over multi-party political systems.',
       'TH would introduce a mandatory term of at least 1 year of non-military national service for 18year olds',
       'TH regrets the decline in oil prices over the last 18 months',
       'THBT when faced with the choice between spending discretionary income on non-essentials or giving to charity, individuals should always choose to give to charity',
       'THBT judges should deliberately misinterpret constitutions to protect important liberties',
       'THW allow corporations to use hackers to retaliate against cyberattacks where the state seems unwilling or unable to do so',
       'THBT after 50 years of ownership the state should have the right to repossess all works of painting or sculpture for the purpose of public display',
       'THBT developing countries should collectivelly agree to radically redistributive tax policies. ',
       'This 

In [43]:
# Test set

X_test = labelled_data[train_size:total_labelled,motion_col]
Y_test = Y_for_element[train_size:total_labelled]

In [44]:
# Check that data is in form we expect
X[99]

'THBT the government should only fund further education courses that have high graduate employment rates'

In [45]:
Y

array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.])

## Extract Features 

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

(120, 803)

In [47]:
count_vect.vocabulary_.get(u'house')

346

### Refinements to occurrence count
1. **tf**: To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

2. **tf-idf** (Term Frequency times Inverse Document Frequnecy) Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.




In [48]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(120, 803)

In [49]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(120, 803)

## Training a classifier

In [50]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y)

In [51]:
# Predict the outcome on a new document
docs_new = ['thw criminalise drugs', 'THW ban endangered animals',
           'ban drug taking in sports', 'THBT Saudi Arabia should nationalise its oil industry']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))

'thw criminalise drugs' => 0.0
'THW ban endangered animals' => 0.0
'ban drug taking in sports' => 0.0
'THBT Saudi Arabia should nationalise its oil industry' => 1.0


## Building a Pipeline

In [52]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

In [53]:
# Hard-coded array

category_clfs = [text_clf, text_clf, text_clf, text_clf, 
                 text_clf, text_clf, text_clf, text_clf,
                 text_clf, text_clf, text_clf, text_clf,
                 text_clf, text_clf, text_clf, text_clf,
                 text_clf, text_clf, text_clf, text_clf,
                 text_clf, text_clf, text_clf, text_clf,
                 text_clf, text_clf]            

In [54]:
for i in range(len_labels):
    category_clfs[i] = text_clf.fit(X, Y_dict[labels[i]][:train_size])

In [55]:
# Create dictionary of category classifiers
category_clfs_dict = dict.fromkeys(labels)
for i in range(len_labels):
    category_clfs_dict[labels[i]] = text_clf.fit(X, Y_dict[labels[i]][:train_size])

In [56]:
## Evaluation of the performance on the test set
import numpy as np

for category_to_test in labels:
    predicted = category_clfs_dict[category_to_test].predict(X_test)
    print(category_to_test, np.mean(predicted == Y_dict[category_to_test][train_size:total_labelled]))

Art and Culture 0.941176470588
Business 0.941176470588
Criminal Justice System 0.882352941176
Development 0.941176470588
Economics 0.779411764706
Education 0.941176470588
Environment 0.955882352941
Family 0.985294117647
Feminism 0.941176470588
Freedoms 0.955882352941
Funny 1.0
International Relations 0.735294117647
LGBT+ 0.941176470588
Media 0.941176470588
Medical Ethics 0.970588235294
Minority Communities 0.955882352941
Morality 0.926470588235
Politics 0.823529411765
Religion 0.941176470588
Science and Technology 0.955882352941
Security, War and Military 0.838235294118
Social Policy 0.911764705882
Social Movements 0.955882352941
Sports 0.985294117647
Terrorism 0.970588235294
The Human Experience 0.970588235294


Note hard-coded train and test size in performance evaluation cell.

# Predict categories for unlabelled motions