<h2>Load data</h2>
The code is a modified version of the code in <a href="http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html">this</a> tutorial.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV and work on its underlying NumPy array
df = pd.read_csv("../data/wikipedia_300.csv")
np_data = df.values

# The last column holds the class labels; everything before it is the input.
# The input slice is flattened to a 1-D vector, which is what the
# vectorizer in the next cell is given.
Y_raw = np_data[:, -1]
X_raw = np_data[:, :-1].ravel()

# Encode the class label strings as integers
encoder = LabelEncoder()
Y = encoder.fit_transform(Y_raw)

print("Examples: {}".format(len(X_raw)))
print("Possible categories:", np.unique(Y))

Examples: 300
Possible categories: [0 1]


<h2>Convert to bag of words</h2>

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the raw documents and build the document-term
# count matrix in a single pass.
# Alternative: CountVectorizer(stop_words='english') filters English stop words.
count_vect = CountVectorizer()
X = count_vect.fit_transform(X_raw)
print(X.shape)

(300, 51162)


<h2>Convert from occurrences to frequencies</h2>

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer

# Rebuild the count matrix from the fitted vectorizer instead of reading it
# back from X. The original `X = transform(X)` was self-referential, so
# re-running this cell silently applied TF-IDF to an already TF-IDF-weighted
# matrix. Deriving the counts from X_raw makes the cell idempotent.
X_counts = count_vect.transform(X_raw)

# Fit the IDF weights on the counts, then re-weight them.
# NOTE(review): the IDF statistics are fit on every document, so any
# cross-validation run on this X sees weights computed from its own test folds.
tf_transformer = TfidfTransformer().fit(X_counts)
X = tf_transformer.transform(X_counts)
print(X.shape)

(300, 51162)


<h2>Function for evaluating model accuracy</h2>

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

def _print_scores(accuracy_format, y_true, y_pred):
    """Print the accuracy (rendered with ``accuracy_format``) and the confusion matrix."""
    accuracy = accuracy_score(y_true, y_pred)
    print(accuracy_format % (accuracy * 100.0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


def evaluate(model, features=None, labels=None):
    """Print training-set and 5-fold cross-validated accuracy for ``model``.

    Two reports are printed, each with an accuracy percentage and a
    confusion matrix:

    1. "Training data": ``model`` is fitted on the full data set and scored
       on the same data it was trained on.
    2. "5-fold CV": predictions come from ``cross_val_predict`` with
       ``cv=5``; the reported accuracy is computed over those pooled
       predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place on
        the full data set as a side effect.
    features : optional
        Input samples. Defaults to the notebook-level ``X`` as bound at
        call time.
    labels : optional
        Target labels. Defaults to the notebook-level ``Y`` as bound at
        call time.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Resolve defaults at call time so existing ``evaluate(model)`` calls keep
    # using whatever the notebook currently has bound to X and Y.
    if features is None:
        features = X
    if labels is None:
        labels = Y

    print("-- Training data --")
    # Fit and score on the same data: this measures fit, not generalisation.
    model.fit(features, labels)
    _print_scores("Accuracy: %.2f%%", labels, model.predict(features))

    print("")
    print("-- 5-fold CV --")
    cv_pred = cross_val_predict(model, features, labels, cv=5)
    _print_scores("Average accuracy: %.2f%%", labels, cv_pred)

<h2>Naive Bayes</h2>

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Multinomial Naive Bayes (alpha is scikit-learn's additive-smoothing parameter).
# evaluate() scores it on whatever the notebook currently has bound to X and Y.
model = MultinomialNB(alpha=0.01)
evaluate(model)

-- Training data --
Accuracy: 99.67%
Confusion Matrix:
[[149   1]
 [  0 150]]

-- 5-fold CV --
Average accuracy: 96.00%
Confusion Matrix:
[[144   6]
 [  6 144]]


<h2>SVM</h2>

In [6]:
from sklearn import svm

# Linear support vector classifier with a fixed random_state.
# evaluate() scores it on whatever the notebook currently has bound to X and Y.
model = svm.LinearSVC(random_state=42)
evaluate(model)

-- Training data --
Accuracy: 100.00%
Confusion Matrix:
[[150   0]
 [  0 150]]

-- 5-fold CV --
Average accuracy: 95.33%
Confusion Matrix:
[[140  10]
 [  4 146]]


<h2>Pipeline example</h2>

In [7]:
from sklearn.pipeline import Pipeline
# evaluate() reads the notebook-level X, and the pipeline does its own
# vectorizing, so X is rebound to the raw documents here.
# NOTE: this replaces the feature matrix the earlier cells stored in X;
# re-run the notebook from the top before re-running those model cells.
X = X_raw.ravel()

# Chain vectorizing, TF-IDF weighting and the classifier into one estimator
# so all three steps are fitted together on whatever data the pipeline is given.
model = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=.01)),
])
evaluate(model)

-- Training data --
Accuracy: 99.67%
Confusion Matrix:
[[149   1]
 [  0 150]]

-- 5-fold CV --
Average accuracy: 95.67%
Confusion Matrix:
[[143   7]
 [  6 144]]
