<h2>Load data</h2>
This code is adapted from <a href="http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html">this</a> scikit-learn tutorial.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV and work on the underlying NumPy array
df = pd.read_csv("../data/wikipedia_300.csv")
np_data = df.values

# Inputs: every column except the last, flattened to a 1-D vector
# (assumes a single text column, so there is one entry per row - TODO confirm)
X_raw = np_data[:, :-1].ravel()

# Targets: the last column holds the class label strings; encode them as integers
y_raw = np_data[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(y_raw)

# Summary of what was loaded
print("Examples: {}".format(len(X_raw)))
print("Possible categories:",np.unique(y_raw),"encoded to",np.unique(y))

Examples: 300
Possible categories: ['Games' 'Programming'] encoded to [0 1]


<h2>Convert to bag of words</h2>

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary from the raw documents and count token occurrences.
# Default settings are used here; passing stop_words='english' to
# CountVectorizer is the alternative that filters common English words.
count_vect = CountVectorizer()
X = count_vect.fit_transform(X_raw)

# (number of documents, vocabulary size)
print(X.shape)

(300, 51162)


<h2>Convert from occurrences to frequencies</h2>

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the count matrix and re-weight it in one step.
# NOTE(review): X is rebound from counts to TF-IDF values, so running this
# cell a second time without re-running the bag-of-words cell would apply
# the weighting to already-weighted data.
tf_transformer = TfidfTransformer()
X = tf_transformer.fit_transform(X)

# The shape is unchanged: only the cell values are re-weighted
print(X.shape)

(300, 51162)


<h2>Function for evaluating model accuracy</h2>

In [5]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

def _print_scores(title, y_true, y_pred):
    """Print accuracy (as a percentage), confusion matrix and classification report."""
    accuracy = accuracy_score(y_true, y_pred)
    print("%s: %.2f%%" % (title, accuracy * 100.0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))


def evaluate(model, features=None, labels=None):
    """Fit ``model`` and report its accuracy on the training data and under 5-fold CV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place on the
        full data set before the training-data report is printed.
    features : optional
        Input samples. When omitted, the notebook-level ``X`` is used.
    labels : optional
        Target labels. When omitted, the notebook-level ``y`` is used.

    Returns
    -------
    None
        All results are printed.
    """
    # Fall back to the notebook globals so existing ``evaluate(model)`` calls
    # keep working; passing the data explicitly avoids having to rebind the
    # global X when a model needs a different representation of the inputs.
    if features is None:
        features = X
    if labels is None:
        labels = y

    print("-- Training data --")
    # Train and score on the same data: an optimistic upper bound
    model.fit(features, labels)
    _print_scores("Accuracy", labels, model.predict(features))

    print("")
    print("-- 5-fold CV --")
    # Accuracy here is computed over the pooled out-of-fold predictions
    cv_pred = cross_val_predict(model, features, labels, cv=5)
    _print_scores("Average accuracy", labels, cv_pred)

<h2>Naive Bayes</h2>

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Multinomial Naive Bayes with alpha=.01, scored on whatever the
# notebook-level X and y currently hold (evaluate() reads them directly).
model = MultinomialNB(alpha=.01)
evaluate(model)

-- Training data --
Accuracy: 99.67%
Confusion Matrix:
[[149   1]
 [  0 150]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       150
           1       0.99      1.00      1.00       150

   micro avg       1.00      1.00      1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


-- 5-fold CV --
Average accuracy: 96.00%
Confusion Matrix:
[[144   6]
 [  6 144]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       150
           1       0.96      0.96      0.96       150

   micro avg       0.96      0.96      0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300



<h2>SVM</h2>

In [7]:
from sklearn import svm

# Linear support vector classifier with a fixed random_state, scored on the
# notebook-level X and y that evaluate() reads directly.
model = svm.LinearSVC(random_state=42)
evaluate(model)

-- Training data --
Accuracy: 100.00%
Confusion Matrix:
[[150   0]
 [  0 150]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       150
           1       1.00      1.00      1.00       150

   micro avg       1.00      1.00      1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


-- 5-fold CV --
Average accuracy: 95.33%
Confusion Matrix:
[[140  10]
 [  4 146]]
              precision    recall  f1-score   support

           0       0.97      0.93      0.95       150
           1       0.94      0.97      0.95       150

   micro avg       0.95      0.95      0.95       300
   macro avg       0.95      0.95      0.95       300
weighted avg       0.95      0.95      0.95       300



<h2>Pipeline example</h2>

In [8]:
from sklearn.pipeline import Pipeline
# evaluate() reads the notebook-level X, so point it at the raw documents:
# the pipeline does the counting and TF-IDF weighting itself, which means
# those steps are fitted as part of the model rather than up front.
# NOTE(review): this replaces the TF-IDF matrix previously held in X; re-run
# the vectorising cells before re-running the earlier model cells.
X = X_raw.ravel()
model = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=.01)),
])
evaluate(model)

-- Training data --
Accuracy: 99.67%
Confusion Matrix:
[[149   1]
 [  0 150]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       150
           1       0.99      1.00      1.00       150

   micro avg       1.00      1.00      1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


-- 5-fold CV --
Average accuracy: 95.67%
Confusion Matrix:
[[143   7]
 [  6 144]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       150
           1       0.95      0.96      0.96       150

   micro avg       0.96      0.96      0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300

