Here I will to show how to use linear model stochastic gradient descent on multi-class classification/discrimination

import class sklearn.linear_model.SGDClassifier

In [2]:
from sklearn import metrics
import numpy as np
import sklearn.datasets
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split



Define some functions to help us on preprocessing

In [3]:
# clear string
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

# because of sklean.datasets read a document as a single element
# so we want to split based on new line
def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        # python3, if python2, just remove list()
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

I included 6 classes in local/
1. adidas (wear)
2. apple (electronic)
3. hungry (status)
4. kerajaan (government related)
5. nike (wear)
6. pembangkang (opposition related)

In [5]:
# you can change any encoding type
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['adidas', 'apple', 'hungry', 'kerajaan', 'nike', 'pembangkang']
25292
25292


In [6]:
# bag-of-word
bow = CountVectorizer().fit_transform(trainset.data)

#tf-idf, must get from BOW first
tfidf = TfidfTransformer().fit_transform(bow)

#hashing, default n_features, probability cannot divide by negative
hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)



#### loss function got {'modified_huber', 'hinge', 'log', 'squared_hinge', 'perceptron'}

default is hinge, will give you classic SVM

perceptron in linear loss

huber and log both logistic classifier

#### penalty got {'l1', 'l2'}, to prevent overfitting

l1 = MAE (mean absolute error)

l2 = RMSE (root mean square error)

#### alpha is learning rate

#### n_iter is number of epoch

In [7]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.87131844238
             precision    recall  f1-score   support

     adidas       0.96      0.85      0.90       279
      apple       0.99      0.82      0.90       434
     hungry       0.99      0.89      0.94      1060
   kerajaan       0.87      0.85      0.86      1436
       nike       0.93      0.82      0.87       303
pembangkang       0.77      0.91      0.83      1547

avg / total       0.88      0.87      0.87      5059





In [8]:
train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)

mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.867958094485
             precision    recall  f1-score   support

     adidas       0.93      0.83      0.88       306
      apple       0.99      0.78      0.87       451
     hungry       0.99      0.91      0.95      1043
   kerajaan       0.86      0.85      0.86      1406
       nike       0.97      0.77      0.86       321
pembangkang       0.76      0.91      0.83      1532

avg / total       0.88      0.87      0.87      5059





In [9]:
train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)

mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))



accuracy validation set:  0.849970349872
             precision    recall  f1-score   support

     adidas       0.95      0.85      0.90       313
      apple       0.98      0.77      0.87       478
     hungry       0.99      0.90      0.95      1046
   kerajaan       0.84      0.82      0.83      1377
       nike       0.98      0.77      0.87       310
pembangkang       0.73      0.88      0.80      1535

avg / total       0.87      0.85      0.85      5059



Always BOW got the highest accuracy among other vectorization

Now let we use linear model to do classifers, I will use BOW as vectorizer

In [10]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

svm = SGDClassifier(penalty = 'l2', alpha = 1e-3, n_iter = 10).fit(train_X, train_Y)
predicted = svm.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.859458390986
             precision    recall  f1-score   support

     adidas       0.94      0.83      0.88       312
      apple       0.97      0.78      0.87       459
     hungry       1.00      0.89      0.94      1044
   kerajaan       0.85      0.85      0.85      1407
       nike       0.96      0.76      0.85       313
pembangkang       0.75      0.90      0.82      1524

avg / total       0.87      0.86      0.86      5059





In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

sq_hinge = SGDClassifier(loss = 'squared_hinge', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = sq_hinge.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.844040324175
             precision    recall  f1-score   support

     adidas       0.96      0.82      0.88       306
      apple       0.97      0.80      0.88       486
     hungry       0.99      0.88      0.93      1033
   kerajaan       0.88      0.78      0.82      1384
       nike       0.96      0.79      0.87       320
pembangkang       0.70      0.91      0.79      1530

avg / total       0.87      0.84      0.85      5059





In [13]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

perceptron = SGDClassifier(loss = 'perceptron', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = perceptron.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

accuracy validation set:  0.865388416683
             precision    recall  f1-score   support

     adidas       0.94      0.87      0.90       305
      apple       0.96      0.82      0.89       471
     hungry       0.96      0.90      0.93      1056
   kerajaan       0.84      0.85      0.84      1395
       nike       0.95      0.82      0.88       333
pembangkang       0.78      0.88      0.83      1499

avg / total       0.87      0.87      0.87      5059





But how to get probability of our output?

Only applicable if your loss = {'log', 'modified_huber'} because both are logistic regression

In [16]:
train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)

mod_huber = SGDClassifier(loss = 'modified_huber', 
                                  penalty = 'l2', alpha = 1e-3, 
                                  n_iter = 10).fit(train_X, train_Y)
predicted = mod_huber.predict(test_X)
print('accuracy validation set: ', np.mean(predicted == test_Y))

# print scores
print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))

# get probability for first 2 sentence in our dataset
print(trainset.data[:2])
print(trainset.target[:2])
print(mod_huber.predict_proba(bow[:2, :]))

accuracy validation set:  0.872306779996
             precision    recall  f1-score   support

     adidas       0.93      0.85      0.88       312
      apple       0.98      0.82      0.89       442
     hungry       0.98      0.90      0.94      1040
   kerajaan       0.87      0.86      0.86      1384
       nike       0.96      0.78      0.86       346
pembangkang       0.78      0.91      0.84      1535

avg / total       0.88      0.87      0.87      5059

['Najib emulating Trump in using tweets to spread his politics of fear hatred and lies', 'Ministers mooted exit may be linked to Sabah snap polls']
[5, 5]
[[ 0.          0.          0.          0.04299312  0.          0.95700688]
 [ 0.          0.03000789  0.          0.03931672  0.01663784  0.91403755]]


