## Классификация отзывов на IMDB

In [1]:
# отключение ворнингов
import warnings
warnings.filterwarnings('ignore')

# импорт библиотек
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
import gc
import numpy as np
import pandas as pd

In [2]:
# чтение данных

def read_data(path_to_data):
    """Load the reviews CSV and return the texts with integer-encoded labels.

    Parameters
    ----------
    path_to_data : str or path-like
        Location of a CSV file with a 'review' column and a 'sentiment'
        column whose values are 'positive' or 'negative'.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The 'review' column and the encoded 'sentiment' column, where
        'positive' is encoded as 0 and 'negative' as 1.

    Raises
    ------
    ValueError
        If the 'sentiment' column holds anything other than the two expected
        labels (missing values included).
    """
    label_codes = {'positive': 0, 'negative': 1}
    df = pd.read_csv(path_to_data)
    labels = df['sentiment'].map(label_codes)

    # Series.map replaces every value missing from the mapping with NaN.
    # Fail here with a clear message instead of handing NaN labels to the
    # splitting / counting code further down.
    unmapped = labels.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, 'sentiment'].unique().tolist()
        raise ValueError(
            "Unexpected values in 'sentiment' column: {}".format(unexpected)
        )

    return df['review'], labels

# Load the review texts (X) and their encoded sentiment labels (y).
X, y = read_data('data/IMDB Dataset.csv')

In [3]:
# Split the data into train and test parts.
# random_state pins the shuffle so the split, and every score computed from
# it, is identical after Restart & Run All; stratify=y keeps the class ratio
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=17)

# np.bincount returns counts indexed by label value: [count of 0, count of 1].
print('Number of 1/0 classes elements in train: {}'.format(np.bincount(y_train)))
print('Number of 1/0 classes elements in test: {}'.format(np.bincount(y_test)))

Number of 1/0 classes elements in train: [17500 17500]
Number of 1/0 classes elements in test: [7500 7500]


Классы сбалансированы (поровну и в обучающей, и в тестовой выборке), поэтому в качестве метрики можно использовать accuracy.

### Bag of words

In [4]:
# Turn the raw reviews into sparse bag-of-words matrices.
# Both unigrams and bigrams are counted.
cv = CountVectorizer(
    ngram_range=(1, 2),
)

# The vocabulary is learned from the training reviews only; the test reviews
# are encoded with that same fitted vocabulary.
X_train_sparse = cv.fit_transform(X_train)
X_test_sparse = cv.transform(X_test)

### Обучение моделей

In [5]:
# Two linear models are compared: logistic regression (L-BFGS solver) and
# SGDClassifier with the 'log' loss. Both share the same random_state.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before upgrading.
logit = LogisticRegression(
    solver='lbfgs',
    random_state=17,
    n_jobs=-1,
)
sgd_clf = SGDClassifier(
    loss='log',
    max_iter=100,
    random_state=17,
    n_jobs=-1,
)

In [6]:
def cross_validation(classifier, X_train, y_train, cv=3, scoring='accuracy'):
    """Cross-validate `classifier` and print the mean score over the folds.

    Parameters
    ----------
    classifier : estimator
        Object passed as-is to `cross_val_score`.
    X_train, y_train
        Features and targets passed as-is to `cross_val_score`.
    cv : int, default 3
        Forwarded to `cross_val_score` as `cv`.
    scoring : str, default 'accuracy'
        Forwarded to `cross_val_score` as `scoring`; when it is a string it
        is also used to label the printed value.

    Returns
    -------
    None
        The mean score is printed, not returned.
    """
    # type(...).__name__ yields the class name directly. Slicing
    # str(classifier) at '(' raises ValueError for any object whose repr
    # contains no parenthesis.
    clf_name = type(classifier).__name__
    # Label the number with the metric that was actually requested rather
    # than always calling it "accuracy".
    metric_name = scoring if isinstance(scoring, str) else 'score'
    cv_scores = cross_val_score(classifier, X_train, y_train, scoring=scoring, cv=cv)
    print('Average {} on CV of {} is {}'.format(metric_name, clf_name, np.mean(cv_scores)))
    
def fit_evaluate(classifier, X_train, X_test, y_train, y_test):
    """Fit `classifier` on the training data and print its test accuracy.

    Parameters
    ----------
    classifier : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.
    X_train, y_train
        Data passed to `classifier.fit`.
    X_test, y_test
        Data used for `classifier.predict` and for `accuracy_score`.

    Returns
    -------
    estimator
        The same `classifier` object, now fitted.
    """
    # type(...).__name__ yields the class name directly. Slicing
    # str(classifier) at '(' raises ValueError for any object whose repr
    # contains no parenthesis.
    clf_name = type(classifier).__name__
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print('Accuracy score of {} is {}'.format(clf_name, accuracy_score(y_test, predictions)))
    return classifier

In [7]:
# 5-fold cross-validated accuracy of the SGD model on the training features.
cross_validation(sgd_clf, X_train_sparse, y_train, cv=5, scoring='accuracy')

Average accuracy on CV of SGDClassifier is 0.8984285714285715


In [8]:
# 5-fold cross-validated accuracy of logistic regression on the training features.
cross_validation(logit, X_train_sparse, y_train, cv=5, scoring='accuracy')

Average accuracy on CV of LogisticRegression is 0.9035142857142857


In [9]:
%%time

# Fit logistic regression on the whole training set and print its test accuracy.
fit_evaluate(logit, X_train_sparse, X_test_sparse, y_train, y_test)

Accuracy score of LogisticRegression is 0.9088
CPU times: user 852 ms, sys: 836 ms, total: 1.69 s
Wall time: 57.2 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [10]:
%%time

# Fit SGDClassifier on the whole training set and print its test accuracy.
fit_evaluate(sgd_clf, X_train_sparse, X_test_sparse, y_train, y_test)

Accuracy score of SGDClassifier is 0.9064666666666666
CPU times: user 4.31 s, sys: 4.28 s, total: 8.59 s
Wall time: 2.61 s


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=17, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

Модели показали сопоставимое качество (accuracy ≈ 0.91 на тестовой выборке). При этом SGDClassifier обучился в разы быстрее: около 2.6 с против 57 с у логистической регрессии.