In [7]:
%pylab inline

import pandas as pd
import numpy as np
import warnings

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from nltk import WordNetLemmatizer, word_tokenize, pos_tag
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from scipy.sparse import hstack

import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


In [8]:
wnl = WordNetLemmatizer()

def lemmatization_data(data):
    tokens_list = [word_tokenize(raw) for raw in data]
    tokens_list = [[wnl.lemmatize(t, 'v') for t in tokens] for tokens in tokens_list]
    
    return [' '.join(x) for x in tokens_list]

In [9]:
def submission_fit(model, X, y):
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(model, X, y, cv=kf)
    
    print(score)
    print(score.mean())
    print(score.std())
    
    model.fit(X, y)
    
    competition_data = pd.read_csv('data/products_sentiment_test.tsv', delimiter='\t')
    predictions = model.predict(lemmatization_data(competition_data.text))
    
    df = pd.DataFrame({'y': predictions})
    df.index.name = 'Id'
    df.to_csv('data/results2.csv')

In [10]:
def submission(model):
    competition_data = pd.read_csv('data/products_sentiment_test.tsv', delimiter='\t')
    predictions = model.predict(lemmatization_data(competition_data.text))
    
    df = pd.DataFrame({'y': predictions})
    df.index.name = 'Id'
    df.to_csv('data/results2.csv')

In [11]:
def get_train_and_test_data():
    train_data = pd.read_csv('data/products_sentiment_train.tsv', delimiter='\t', header=None)
    train_data.columns = ['text', 'class']

    counts = train_data['class'].value_counts()

    class_0 = int(counts[0] / 10)
    class_1 = int(counts[1] / 10)

    test_data = train_data[train_data['class'] == 0].sample(class_0)
    test_data = test_data.append(train_data[train_data['class'] == 1].sample(class_1))

    train_data = train_data.drop(test_data.index)

    X = train_data.text.values
    y = train_data['class'].values

    X = lemmatization_data(X)

    test_X = test_data.text.values
    test_y = test_data['class'].values

    test_X = lemmatization_data(test_X)
    
    return X, y, test_X, test_y

In [12]:
def get_train_data():
    train_data = pd.read_csv('data/products_sentiment_train.tsv', delimiter='\t', header=None)
    train_data.columns = ['text', 'class']

    return lemmatization_data(train_data.text.values), train_data['class'].values

Попробуем протестировать модели из списка на базовый параметрах для выявления кондидатов на дальнейшее рассмотрение

In [13]:
models = [LogisticRegression, SGDClassifier, Perceptron, PassiveAggressiveClassifier, RidgeClassifier, LinearSVC, SVC, MultinomialNB, KNeighborsClassifier, NearestCentroid, RandomForestClassifier]

In [14]:
X, y = get_train_data()

In [392]:
warnings.simplefilter("ignore")

X, y, test_X, text_y = get_train_and_test_data()

pipe = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 5), analyzer='char')), 
    ('tfidf', TfidfTransformer(norm='l2', use_idf=True)),
    ('classifier', LogisticRegression(C=10, random_state=42))
])

param_grid = [
    {
        'vectorizer__max_features': [5000, 10000, 15000, 20000, 25000]
        'vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)],
        'vectorizer__analyzer': ['word', 'char', 'char_wb'],
        'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
]

kf = KFold(n_splits=5, shuffle=True)

grid = GridSearchCV(pipe, cv=kf, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid.fit(X, y)

print(grid.best_score_)
print()
for k in grid.best_params_:
    print(k, " : ", grid.best_params_[k])


prediction = grid.best_estimator_.predict(test_X)

print("Test score", accuracy_score(test_y, prediction))

0.790116601888

Test score 0.824120603015


0.79 on test

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

0.782343142699

vectorizer__max_features  :  20000

Test score 0.788944723618

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

0.786785119378

classifier__C  :  10

vectorizer__analyzer  :  char

vectorizer__ngram_range  :  (1, 5)

Test score 0.819095477387

In [373]:
grid.grid_scores_

[mean: 0.77290, std: 0.01281, params: {'vectorizer__max_features': 5000},
 mean: 0.77624, std: 0.01111, params: {'vectorizer__max_features': 10000},
 mean: 0.77957, std: 0.01156, params: {'vectorizer__max_features': 15000},
 mean: 0.78234, std: 0.01514, params: {'vectorizer__max_features': 20000},
 mean: 0.77901, std: 0.01444, params: {'vectorizer__max_features': 25000}]

In [394]:
warnings.simplefilter("ignore")

X, y, test_X, text_y = get_train_and_test_data()

pipe = Pipeline([
    ('vectorizer', CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 5), analyzer='char', max_features=20000)), 
    ('tfidf', TfidfTransformer(norm='l2', use_idf=True)),
    ('classifier', LinearSVC(C=1, random_state=42))
])

param_grid = [
    {
        'vectorizer__max_features': [5000, 10000, 15000, 20000, 25000]
        'vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4), (1, 5)],
        'vectorizer__analyzer': ['word', 'char', 'char_wb'],
        'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
]

kf = KFold(n_splits=5, shuffle=True)

grid = GridSearchCV(pipe, cv=kf, n_jobs=1, param_grid=param_grid, scoring='accuracy')
grid.fit(X, y)

print(grid.best_score_)
print()
for k in grid.best_params_:
    print(k, " : ", grid.best_params_[k])


prediction = grid.best_estimator_.predict(test_X)

print("Test score", accuracy_score(test_y, prediction))

0.772348695169

Test score 0.809045226131


0.79 on test

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

0.782343142699

vectorizer__max_features  :  20000

Test score 0.78391959799

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

0.789006107718

classifier__C  :  1

vectorizer__analyzer  :  char

vectorizer__ngram_range  :  (1, 5)

Test score 0.748743718593

In [389]:
submission(grid.best_estimator_)

### Выводы:

Лучший счет на нестовой выборке 0.815 что является мягко говоря не очень хорошим результатом.
Модель явно тербует доработки. Так же возможно что исходные данные требуют более детельного рассмотрения.
Но на данный момент принято решение отложить этот вопрос.