In [32]:
#!/Users/jessica/anaconda/lib/python3.5

import sklearn
import pandas as pd
import numpy as np

In [33]:
debating_data = pd.read_csv("data/debating_motions_aug9.csv")
header = debating_data.columns.values
debating_data = np.array(debating_data)
debating_data.shape

(1080, 22)

In [34]:
header

array(['Date', 'Circuit', 'Country', 'International', 'Tournament', 'CA_1',
       'CA_2', 'CA_3', 'CA_4', 'CA_5', 'CA_6', 'CA_7', 'CA_8',
       'Event_Link', 'Round_Code', 'Round', 'Motion', 'Infoslide',
       'Topic_Area_1', 'Topic_Area_2', 'Topic_Area_3',
       'Topic_Area_Specific_1'], dtype=object)

In [35]:
debating_data[0,18:21]

array(['Environment', 'Economics', nan], dtype=object)

In [36]:
debating_data[0,16:18]

array(['THW allow the trophy hunting of endangered animals',
       'Trophy hunting is the selective and regulated hunting of wild animals, for example lions. Typically, it takes place on reservations, particularly in Africa and North America.'], dtype=object)

In [37]:
motion_and_infoslide = [debating_data[0,16], debating_data[0,17]]
' '.join(motion_and_infoslide)

'THW allow the trophy hunting of endangered animals Trophy hunting is the selective and regulated hunting of wild animals, for example lions. Typically, it takes place on reservations, particularly in Africa and North America.'

In [None]:
### Convert labels to binary vectors

In [87]:
Z = debating_data[:150,18:21]

In [75]:
Z[0]

array(['Environment', 'Economics', nan], dtype=object)

In [85]:
labels = [
'Art and Culture',
'Business',
'Criminal Justice System',
'Development',
'Economics','Education',
'Environment',
'Family',
'Feminism',
'Freedoms',
'Funny',
'International Relations',
'LGBT+',
'Media',
'Medical Ethics',
'Minority Communities'
'Morality'
'Politics'
'Religion'
'Science and Technology'
'Security, War and Military'
'Social Policy'
'Social Movements'
'Sports'
'Terrorism'
'The Human Experience'
]

len(labels)

16

In [118]:
Y_vector = np.zeros(shape=(150,16))
for i in range(150):
    for j in range(3):
        for k in range(16):
            if Z[i,j] == labels[k]:
                Y_vector[i,k] = 1

In [119]:
# Transform Ys into binary
Y_dict = dict.fromkeys(labels)
for i in range(16):
    Y_dict[labels[i]] = Y_vector[:,i]

In [120]:
Y_dict['Art and Culture']

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [134]:
# Choose category to train for
chosen_element = 'International Relations'
Y_for_element = Y_dict[chosen_element]

In [135]:
# Configure training set

X = debating_data[:100,16]
# Previous:
# Y = debating_data[:100,18]
Y = Y_for_element[:100]

In [136]:
# Test set

X_test = debating_data[100:150,16]
Y_test = Y_for_element[100:150]

In [137]:
# Check that data is in form we expect
X[99]

'TH regrets the decline of secular pan-Arab nationalism'

## Extracting features from text files
Turn text content into numerical feature vectors.

### Bags of words
1. assign a fixed integer id to each word occurring in any document of the training set (for instance by building a dictionary from words to integer indices).
2. for each document #i, count the number of occurrences of each word w and store it in X[i, j] as the value of feature #j where j is the index of word w in the dictionary

In [138]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

(100, 707)

In [139]:
count_vect.vocabulary_.get(u'house')

303

### Refinements to occurrence count
1. **tf**: To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

2. **tf-idf** (Term Frequency times Inverse Document Frequnecy) Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.




In [140]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(100, 707)

In [141]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(100, 707)

## Training a classifier

In [142]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, Y)

In [143]:
# Predict the outcome on a new document
docs_new = ['thw criminalise drugs', 'THW ban endangered animals',
           'ban drug taking in sports', 'THBT Saudi Arabia should nationalise its oil industry']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, category))

'thw criminalise drugs' => 0.0
'THW ban endangered animals' => 0.0
'ban drug taking in sports' => 0.0
'THBT Saudi Arabia should nationalise its oil industry' => 1.0


In [144]:
## Building a Pipeline

In [145]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                    ])

In [146]:
text_clf = text_clf.fit(X, Y)

In [147]:
## Evaluation of the performance on the test set
import numpy as np

predicted = text_clf.predict(X_test)
np.mean(predicted == Y_test)

0.76000000000000001