# Анализ тональности отзывов: sklearn - различные классификаторы

Загрузим размеченный корпус:

In [1]:
import re
import pandas as pd
import numpy as np

# Считываем данные
n = ['id', 'date', 'name', 'text', 'typr', 'rep', 'rtw', 'faw', 'stcount', 'foll', 'frien', 'listcount']
data_positive = pd.read_csv('data/positive.csv', sep=';', error_bad_lines=False, names=n, usecols=['text'])
data_negative = pd.read_csv('data/negative.csv', sep=';', error_bad_lines=False, names=n, usecols=['text'])

# Формируем сбалансированный датасет
sample_size = min(data_positive.shape[0], data_negative.shape[0])
raw_data = np.concatenate((data_positive['text'].values[:sample_size],
                           data_negative['text'].values[:sample_size]), axis=0)

labels = [1] * sample_size + [0] * sample_size


def preprocess_text(text):
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub('@[^\s]+', 'USER', text)
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

data = [preprocess_text(t) for t in raw_data]

In [2]:
from sklearn.model_selection import train_test_split

features = raw_data

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)
 
features[0]

'@first_timee хоть я и школота, но поверь, у нас то же самое :D общество профилирующий предмет типа)'

Импортируем нужные нам модули

In [5]:
%matplotlib inline
import pylab 
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from IPython.core.display import display, HTML

### Оценка качества работы разных классификаторов

In [6]:
#
# Список классификаторов
#
classifiers = [
    LogisticRegression(max_iter=200, penalty="l2"),
    SGDClassifier(loss="hinge", penalty="l2"),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3, 4)),
    RandomForestClassifier(n_estimators=60, max_depth=5),
    GradientBoostingClassifier(n_estimators=180, learning_rate=1.0, max_depth=4),
    DecisionTreeClassifier(),
    SVC(),
]

#
# Тренировка и оценка точности
#
result = []
for classifier in classifiers:
    classifier.fit(x_train, y_train)
    report = accuracy_score(x_test, classifier.predict(y_test))
    result.append({'class' : classifier.__class__.__name__, 'accuracy' : report})

display(HTML('<h2>Result</h2>'))
display(pd.DataFrame(result))

#
# Исследуем уровень важности предикторов
#
model = xgb.XGBClassifier()
model.fit(features, classes)

pylab.rcParams['figure.figsize'] = 3, 3
plt.style.use('ggplot')
pd.Series(model.feature_importances_).plot(kind='bar')
plt.title('Feature Importances')
plt.show()



ValueError: could not convert string to float: 'Эссен - город необычный ) много разного и главное подходит, местами, для славянской души http://t.co/q2H0o9YJxz'

In [5]:
%%time
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

for clf in [LogisticRegression, LinearSVC, SGDClassifier]:
    print(clf)
    print(cross_val_score(text_classifier(CountVectorizer(), TfidfTransformer(), clf()), texts, labels, cv=3).mean())

<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.7569221747363429
<class 'sklearn.svm.classes.LinearSVC'>
0.758829739138847
<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'>
0.718793270331486
Wall time: 55.5 s


### Подготовка классификатора, обученного на всех данных

In [6]:
clf_pipeline = Pipeline(
            [("vectorizer", TfidfVectorizer()),
            ("classifier", LinearSVC())]
        )

clf_pipeline.fit(texts, labels)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [7]:
print(clf_pipeline.predict(["люблю чесать собак очень плохо за ухом слона"]))

[1]


## Понижение размерности и ансамбли деревьев

In [8]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD

v = CountVectorizer()
mx = v.fit_transform(texts)
mf = TruncatedSVD(10)
u = mf.fit_transform(mx)

Wall time: 8.89 s


In [9]:
%%time
for transform in [TruncatedSVD, NMF]:
    print(transform)
    print(cross_val_score(text_classifier(CountVectorizer(), transform(n_components=10), LinearSVC()), texts, labels).mean())


<class 'sklearn.decomposition.truncated_svd.TruncatedSVD'>




0.5793715411677247
<class 'sklearn.decomposition.nmf.NMF'>
0.5760478269439249
Wall time: 2min 57s





Если задать n_components=1000:

In [10]:
%%time
print cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), LinearSVC()),
                      texts, 
                      labels
                     ).mean()

SyntaxError: invalid syntax (<unknown>, line 1)

## Ансамбли деревьев на преобразованных признаках

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [None]:
%%time
print cross_val_score(
    Pipeline([
            ("vectorizer", CountVectorizer()),
            ("transformer", TruncatedSVD(100)),
            ("classifier", RandomForestClassifier(100))
        ]),
    texts,
    labels
    )

Больше компонент и больше деревьев:

In [None]:
%%time
print(cross_val_score(text_classifier(CountVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
                      texts, 
                      labels
                     ).mean())

Tf*Idf вместо частот слов:

In [None]:
%%time
print(cross_val_score(text_classifier(TfidfVectorizer(), TruncatedSVD(n_components=1000), RandomForestClassifier(1000)),
                      texts, 
                      labels
                     ).mean())

## Совмещаем Tf*Idf и SVD

In [None]:
from sklearn.pipeline import FeatureUnion

estimators = [('tfidf', TfidfTransformer()), ('svd', TruncatedSVD(1))]
combined = FeatureUnion(estimators)

In [None]:
%%time
print(cross_val_score(
    Pipeline([
            ("vectorizer", CountVectorizer()),
            ("transformer", combined),
            ("classifier", LinearSVC())
        ]),
    texts,
    labels
    ))