In [8]:
#http://sujitpal.blogspot.com/2013/08/sentence-genre-classification-using.html

import matplotlib.pyplot as plt
import re
from textblob import TextBlob
import pandas
import sklearn
import cPickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.learning_curve import learning_curve
from sklearn.ensemble import RandomForestClassifier
import pickle

In [9]:
def generate_xy(texts, labels):
  """Vectorise raw documents into TF-IDF features and pair them with labels.

  The documents are tokenised and counted with ``CountVectorizer`` (English
  stop words removed), then re-weighted with ``TfidfTransformer`` using
  L2 normalisation. The pipeline is fitted on ``texts`` itself.

  Parameters
  ----------
  texts : iterable of str
      Raw documents to vectorise.
  labels : array-like
      One label per document; handed back unchanged.

  Returns
  -------
  (X, labels)
      ``X`` is the output of ``pipeline.fit_transform(texts)``; ``labels``
      is the argument that was passed in.
  """
  pipeline = Pipeline([
    ("count", CountVectorizer(stop_words='english', min_df=0.0,
              binary=False)),
    ("tfidf", TfidfTransformer(norm="l2"))
  ])
  X = pipeline.fit_transform(texts)
  # Return the caller's labels instead of relying on a notebook-level `y`
  # that may be undefined or belong to a different dataset.
  return X, labels

In [10]:
# https://radimrehurek.com/data_science_python/

def split_into_lemmas(message):
    """Lower-case a message and return the lemma of each of its words.

    Parameters
    ----------
    message : bytes or text
        A UTF-8 encoded byte string, or an already-decoded text string.

    Returns
    -------
    list
        ``word.lemma`` for every entry in ``TextBlob(message).words``.
    """
    # Decode only byte strings: decoding text that is already unicode raises
    # a TypeError, so decoded input is used as-is.
    if isinstance(message, bytes):
        message = message.decode('utf8')
    words = TextBlob(message.lower()).words
    # for each word, take its "base form" = lemma
    return [word.lemma for word in words]

# Create dataset that we interface using pandas
# Raw string: "<" and ">" need no escaping, and this avoids invalid "\<"
# string escapes. Matches the same tags as before (e.g. "<abc>").
TAG_PATTERN = re.compile(r"<[a-z]+>")

dataset = []
with open('../data/label-abstract-100.txt') as f:
    # Records are separated by a blank line.
    data = f.read().split('\n\n')
    for example in data:
        # Splitting on the tags yields: text before the first tag, then the
        # text following each tag in order.
        components = TAG_PATTERN.split(example)
        # Three tagged fields are read below, so at least four parts are
        # required; shorter chunks (e.g. a trailing empty one) are skipped
        # rather than raising IndexError.
        if len(components) > 3:
            idnum = components[1].strip()
            category = components[2].strip()
            # Abstracts may span several lines; flatten them to one.
            abstract = components[3].strip().replace('\n', ' ')
            dataset.append((idnum, category, abstract))
df = pandas.DataFrame(data=dataset, columns=['ID', 'Category', 'Abstract'])

In [11]:
# Inputs are the abstract texts, targets are their category labels.
X, y = df.Abstract, df.Category
# NOTE: generate_xy fits the TF-IDF pipeline on every abstract, i.e. before
# the hold-out split below is made.
X, y = generate_xy(X, y)
# Keep 10% of the rows aside for testing; the fixed seed makes the split
# repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    random_state=42,
    test_size=0.1,
)

In [12]:
def train_model(X, y):
  """Fit a random forest on (X, y), print training diagnostics, return it.

  Parameters
  ----------
  X : feature matrix accepted by ``RandomForestClassifier.fit``.
  y : array-like of class labels, one per row of ``X``.

  Returns
  -------
  The fitted ``RandomForestClassifier``.

  Side effects
  ------------
  Prints the confusion matrix and classification report computed on the
  training data itself, followed by the out-of-bag accuracy.
  """
  model = RandomForestClassifier(n_jobs=10, n_estimators=1000, oob_score = True, random_state=25)
  model.fit(X, y)
  # reports
  ypred = model.predict(X)
  # Single-argument print(...) behaves the same as a print statement on
  # Python 2 and is also valid on Python 3.
  print("Confusion Matrix (Train):")
  print(confusion_matrix(y, ypred))
  print("Classification Report (Train)")
  print(classification_report(y, ypred))
  # The metrics above are measured on the rows the forest was fitted on.
  # The out-of-bag score (requested via oob_score=True) is estimated from
  # samples each tree did not see, so report it as well.
  print("OOB Score (Train): %.4f" % model.oob_score_)
  return model

def test_model(X, y, trained_model):
  model = trained_model
  if y is not None:
    # reports
    ypred = model.predict(X)
    print "Confusion Matrix (Test)"
    print confusion_matrix(y, ypred)
    print "Classification Report (Test)"
    print classification_report(y, ypred)

In [13]:
# Fit on the training split, then print the reports for the held-out split.
model = train_model(X=Xtrain, y=ytrain)
test_model(X=Xtest, y=ytest, trained_model=model)

Confusion Matrix (Train):
[[84  0  0 ...,  0  0  0]
 [ 0 88  0 ...,  0  0  0]
 [ 0  0 90 ...,  0  0  0]
 ..., 
 [ 0  0  0 ..., 87  0  0]
 [ 0  0  0 ...,  0 87  0]
 [ 0  0  0 ...,  0  0  1]]
Classification Report (Train)
                    precision    recall  f1-score   support

          astro-ph       1.00      1.00      1.00        84
   cond-mat.dis-nn       1.00      1.00      1.00        88
 cond-mat.mes-hall       1.00      1.00      1.00        90
 cond-mat.mtrl-sci       1.00      1.00      1.00        91
    cond-mat.other       1.00      1.00      1.00        86
     cond-mat.soft       1.00      1.00      1.00        89
cond-mat.stat-mech       1.00      1.00      1.00        93
   cond-mat.str-el       1.00      1.00      1.00        93
 cond-mat.supr-con       1.00      1.00      1.00        89
             cs.AI       1.00      1.00      1.00        93
             cs.AR       1.00      1.00      1.00        95
             cs.CC       1.00      1.00      1.00        97

In [14]:
# p = model.predict_proba(Xtest)
# auc = sklearn.metrics.auc(ytest, p[:,1])