In [1]:
# Imports

from sklearn import preprocessing
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# its helpers now live in sklearn.model_selection. Bind the replacement module to the
# old name so calls such as cross_validation.cross_val_score(...) keep working, and
# fall back to the legacy module on installs that predate model_selection.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
########## STEP 1: DATA IMPORT AND PREPROCESSING ##########

# Read the training data. Each line is split on '|'; the first field is used as the bill
# title and the second as its category.
# The `with` block guarantees the file handle is closed once the lines have been read.
with open('../data/bills_training.txt', 'r') as training_file:
    training = [line.strip().split('|') for line in training_file]

# Keep only rows that actually contain a category (at least two fields). Filtering once and
# deriving both lists from the same rows keeps them aligned: the first title in `text`
# corresponds to the first category in `labels`, and so on.
labeled_rows = [t for t in training if len(t) > 1]
text = [t[0] for t in labeled_rows]
labels = [t[1] for t in labeled_rows]

# Scikit-learn models want the categories as numbers, not strings.
# LabelEncoder performs that transformation.
encoder = preprocessing.LabelEncoder()
# Fit the encoder on the category strings and get back an array of integer codes,
# one per entry in `labels`.
correct_labels = encoder.fit_transform(labels)

In [3]:
########## STEP 2: FEATURE EXTRACTION ##########

# CountVectorizer combines tokenization and occurrence counting in a single class:
#   * Tokenization breaks a stream of text into tokens. With the default settings a token
#     is a word of two or more alphanumeric characters; whitespace and punctuation act as
#     separators.
#   * Occurrence counting tallies how many times each token appears in each document.
# stop_words='english' drops the words on scikit-learn's built-in English stop word list
# before counting.
vectorizer = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary from the bill titles and returns the document-term
# matrix as a sparse matrix: one row per title in `text`, one column per vocabulary token,
# and each cell holding the number of times that token occurs in that title
# (0 when the token is absent).
data = vectorizer.fit_transform(raw_documents=text)


In [4]:
########## STEP 3: MODEL BUILDING ##########

# DecisionTreeClassifier predicts the value of a target variable by learning simple
# decision rules inferred from the data features.
# Tree construction considers features in a random order, so ties between equally good
# splits can be broken differently from run to run. Fixing random_state makes the fitted
# tree, and any score computed from it, reproducible.
RANDOM_SEED = 0
model = DecisionTreeClassifier(random_state=RANDOM_SEED)

# Fit the model on the feature matrix to predict the encoded labels.
# fit() returns the estimator itself, so fit_model refers to the same object as model.
fit_model = model.fit(data, correct_labels)


In [5]:
########## STEP 4: EVALUATION ##########

# Evaluate the model with 5-fold cross-validation (cv=5):
#   * the data is split into five sections (folds);
#   * a fresh copy of the model is trained on four folds and tested on the remaining one;
#   * this is repeated until every fold has served as the test set once.
# `scores` ends up holding one score per fold.
scores = cross_validation.cross_val_score(
    estimator=model,
    X=data,
    y=correct_labels,
    cv=5,
)

# Report the mean score and a +/- band of two standard deviations across the folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))




Accuracy: 0.65 (+/- 0.05)


In [6]:
########## STEP 5: APPLYING THE MODEL ##########

# New, unseen bill titles to classify
docs_new = ["Public postsecondary education: executive officer compensation.",
            "An act to add Section 236.3 to the Education code, related to the pricing of college textbooks.",
            "Political Reform Act of 1974: campaign disclosures.",
            "An act to add Section 236.3 to the Penal Code, relating to human trafficking."
        ]

# Turn the new titles into count features using the vocabulary the vectorizer has already
# learned (transform, not fit_transform, so the columns line up with the training matrix).
test_data = vectorizer.transform(docs_new)

# Predict all samples in one call. predict() expects a 2-D (n_samples, n_features) input;
# handing it a single 1-D row is rejected by recent scikit-learn releases. Passing the
# whole sparse matrix also avoids converting it to a dense array on every loop iteration.
predicted_codes = model.predict(test_data)

# encoder.classes_ holds the original category strings, indexed by integer code.
# Indexing it with a length-1 slice of the predictions yields a one-element array, so each
# line prints in the form "title -> ['Category']".
for position, doc in enumerate(docs_new):
    label = encoder.classes_[predicted_codes[position:position + 1]]
    print('%s -> %s' % (doc, label))
   

Public postsecondary education: executive officer compensation. -> ['Education']
An act to add Section 236.3 to the Education code, related to the pricing of college textbooks. -> ['Education']
Political Reform Act of 1974: campaign disclosures. -> ['Campaign Finance and Election Issues']
An act to add Section 236.3 to the Penal Code, relating to human trafficking. -> ['Crime']


