In [1]:
import os
import re
import csv
import sys
import random
import ast
from time import sleep
from pandas import DataFrame, to_numeric
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics, tree, cross_validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

# Lift the csv module's per-field size cap as far as the platform allows, so
# rows with very long text fields can be parsed. The csv module keeps this
# limit in a C long; where that is narrower than sys.maxsize the call raises
# OverflowError, so halve the candidate until it is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2



131072

## Load and Preprocess

In [2]:
# Read every row of the csv into a list of dicts keyed by the header row.
# newline='' hands line-ending handling to the csv module, which the csv docs
# require so that newlines embedded in quoted fields are parsed correctly.
with open('data/upr.csv', 'r', newline='') as csvfile:
    recs = list(csv.DictReader(csvfile))
len(recs) # 41066 at the last recorded run

41066

In [3]:
# Replace each record's comma-separated 'Issue' string with a list of labels,
# leaving out the catch-all categories.
EXCLUDED_ISSUES = ('Other', 'General')
for rec in recs:
    raw = rec['Issue']
    # On a re-run the value is already a list; only split raw strings so the
    # cell can be executed more than once without raising.
    tokens = raw.split(',') if isinstance(raw, str) else raw
    cleaned = [tok.strip() for tok in tokens]
    # Drop empty tokens too: ''.split(',') yields [''], which would otherwise
    # make a record with no issues look as if it had one (blank) label.
    rec['Issue'] = [tok for tok in cleaned if tok and tok not in EXCLUDED_ISSUES]

In [4]:
# sanity check: collect the distinct issue labels in first-seen order
seen_labels = set()
issues = []
for rec in recs:
    for label in rec['Issue']:
        if label not in seen_labels:
            seen_labels.add(label)
            issues.append(label)
len(issues)

52

In [5]:
# keep only the records whose 'Issue' entry is non-empty
rec_sub = [rec for rec in recs if len(rec['Issue']) > 0]
print("Number of recs:", len(rec_sub))

Number of recs: 39475


In [6]:
# load the filtered records into a DataFrame and show its (rows, columns)
data = DataFrame(data=rec_sub)
print(data.shape)

(39475, 2)


In [7]:
# pull the 'Text' and 'Issue' columns out as numpy arrays
text, labels = (data[column].values for column in ('Text', 'Issue'))

In [8]:
# encode the per-record label lists as a 0/1 indicator matrix
mlb = MultiLabelBinarizer()
mlb.fit(labels)
labels_binary = mlb.transform(labels)
print(labels_binary)

[[0 1 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [9]:
# hold out 20% of the documents for testing; the fixed random_state keeps the
# split the same from run to run
split = cross_validation.train_test_split(
    text, labels_binary, test_size=0.2, random_state=40)
X_train, X_test, y_train, y_test = split
print("Number of training data observations:", len(X_train))

Number of training data observations: 31580


In [None]:
# class names learned by the binarizer, as a plain list for reporting
label_names = [name for name in mlb.classes_]
print(label_names)

## Pipelines

In [None]:
# SVC pipeline: uni+bigram counts -> tf-idf weighting -> one-vs-rest linear SVM
from sklearn.pipeline import Pipeline
svc_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(random_state=0))),
]
text_clf = Pipeline(svc_steps)

In [None]:
# fit using pipeline
# Trains text_clf on the training split and binds the result to `clf`.
# NOTE(review): scikit-learn's fit conventionally returns the estimator itself,
# so `clf` is presumably the same object as `text_clf` — confirm.
clf = text_clf.fit(X_train, y_train)

## Predicting

In [None]:
# predict
# `predicted` holds the fitted pipeline's output for the held-out texts and is
# reused by the evaluation cells below.
predicted = clf.predict(X_test)
# The cell's displayed value is the pipeline's own score on the test split.
# NOTE(review): for classifiers scikit-learn documents score() as mean
# accuracy; with a 0/1 indicator target that is presumably exact-match
# (subset) accuracy, and it presumably re-runs prediction — confirm.
clf.score(X_test, y_test)

In [None]:
# share of individual (document, label) cells where prediction equals truth
(predicted == y_test).mean()

In [None]:
# show the first 50 test documents next to the labels predicted for them
label_array = np.array(label_names)
for doc, label_row in zip(X_test[:50], predicted[:50]):
    # boolean mask picks the names of the labels predicted as 1 for this doc
    assigned = label_array[label_row == 1]
    print('%r => %s' % (doc, ", ".join(assigned)))

In [None]:
# per-label classification report on the test split
report = metrics.classification_report(
    y_test, predicted, target_names=label_names)
print(report)

In [None]:
# 5-fold cross-validation of the SVC pipeline over all texts and labels
scores = cross_validation.cross_val_score(
    estimator=text_clf, X=text, y=labels_binary, cv=5)
scores

In [None]:
from sklearn.metrics import coverage_error
# coverage_error ranks labels by score, so its second argument must be the
# continuous per-label decision values. Passing the thresholded 0/1
# predictions makes most labels tie and inflates the reported coverage.
y_score_test = clf.decision_function(X_test)
coverage_error(y_test, y_score_test)

## Parameters

In [None]:
from sklearn.grid_search import GridSearchCV
# search space, keyed as <pipeline step name>__<parameter name>
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__use_idf=(True, False),
)

In [None]:
# grid search over `parameters` for the SVC pipeline; n_jobs=-1 runs in parallel
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)

In [None]:
# Run the grid search on the training split only; the test split stays unseen.
# NOTE(review): fit presumably returns the search object itself, so the
# re-assignment mainly suppresses the cell's repr output — confirm.
gs_clf = gs_clf.fit(X_train, y_train)

In [None]:
# Read the winning combination and its score from the fitted search object
# instead of taking a manual max over grid_scores_. This also avoids assigning
# to `_`, which IPython uses for the last cell output.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
# print the chosen value for every parameter that was searched, in name order
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
# grid-search score of the best parameter combination, bound in the cell above
score

In [None]:
# Raw grid-search results, presumably one entry per parameter combination tried.
# NOTE(review): grid_scores_ belongs to the legacy sklearn.grid_search API;
# newer scikit-learn releases expose cv_results_ instead — confirm the
# installed version before relying on this attribute.
gs_clf.grid_scores_