In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('open/train.csv')

In [4]:
models = [('MultiNB', MultinomialNB(alpha=0.03)),
          ('Calibrated MultiNB', CalibratedClassifierCV(MultinomialNB(alpha=0.03), method='isotonic')),
          ('Calibrated BernoulliNB', CalibratedClassifierCV(BernoulliNB(alpha=0.03), method='isotonic')),
          ('Calibrated Huber', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=1e-4, max_iter=10000, tol=1e-4), method='sigmoid')),
          ('Logit', LogisticRegression(C=30))
         ]

clf = VotingClassifier(models, voting='soft', weights=[3,3,3,1,1])

In [5]:
vectorizer=TfidfVectorizer(token_pattern=r'\w{1,}', sublinear_tf=True, ngram_range=(1,2))

X_train = vectorizer.fit_transform(train.text.values)
authors = [0,1,2,3,4]

Y_train = train.author.apply(authors.index).values

In [6]:
Y_train

array([3, 2, 1, ..., 1, 3, 0], dtype=int64)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.2)

In [6]:
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('MultiNB', MultinomialNB(alpha=0.03)),
                             ('Calibrated MultiNB',
                              CalibratedClassifierCV(base_estimator=MultinomialNB(alpha=0.03),
                                                     method='isotonic')),
                             ('Calibrated BernoulliNB',
                              CalibratedClassifierCV(base_estimator=BernoulliNB(alpha=0.03),
                                                     method='isotonic')),
                             ('Calibrated Huber',
                              CalibratedClassifierCV(base_estimator=SGDClassifier(loss='modified_huber',
                                                                                  max_iter=10000,
                                                                                  tol=0.0001))),
                             ('Logit', LogisticRegression(C=30))],
                 voting='soft', weights=[3, 3, 3, 1, 1])

In [7]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [8]:
predictions = clf.predict_proba(x_test)
print ("logloss: %0.3f " % multiclass_logloss(y_test, predictions))

logloss: 0.566 


In [9]:
test = pd.read_csv('open/test_x.csv', index_col=0)
X_test = vectorizer.transform(test.text.values)
results = clf.predict_proba(X_test)
pd.DataFrame(results, index=test.index, columns=authors).to_csv('NB_result.csv')