In [0]:
import pandas as pd
import numpy as np
import scipy.optimize as sc
import scipy.sparse
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import datetime
from tqdm import tqdm_notebook

In [80]:
train_data = pd.read_csv('new_train.csv', index_col=0, na_filter=False)
train_data.head()

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,картин,гобел размер 139х84см,1000.0,19
1,стул прессова кож,прод недор стул светл прессова кож стильн ножк...,1250.0,22
2,домашн мин бан,мин бан мб мин саун предназнач принят теплов п...,13000.0,37
3,эксклюзивн коллекц книг трансаэр подарок,прод эксклюзивн коллекц книг выпущен ограничен...,4000.0,43
4,ноутбук aser,прода ноутбук acer e5 511c2ta купл конц ноябр ...,19000.0,1


In [81]:
test_data = pd.read_csv('new_test.csv', index_col=0, na_filter=False)
test_data.head()

Unnamed: 0_level_0,title,description,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
489517,стоик журнальн стал,прод журнальн столик изготавлива стол полирова...,10000.0
489518,iphon 64gb,телефон хорош состоян комплект гарант салон со...,12500.0
489519,утеплител,теплопел лидер тепл толщин утеплител 20 мм 30 ...,250.0
489520,пальт демисезон,прод пальт женск букл отличн состоян длин изде...,1700.0
489521,samsung syncmaster t200n,условн рабоч проблем панел настройк монитор пе...,1000.0


In [82]:
categories = pd.read_csv('new_categories.csv', index_col=0)
categories.head()

Unnamed: 0_level_0,name,subname,subsubname,subsubsubname
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Бытовая электроника,Телефоны,iPhone,0
1,Бытовая электроника,Ноутбуки,0,0
2,Бытовая электроника,Телефоны,Samsung,0
3,Бытовая электроника,Планшеты и электронные книги,Планшеты,0
4,Бытовая электроника,"Игры, приставки и программы",Игровые приставки,0


Применим векторизацию текста Tfidf. Обучение:

In [83]:
description_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.1)
description_vectorizer.fit(train_data['description'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.1, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [84]:
title_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.1)
title_vectorizer.fit(train_data['title'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.1, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

Функция, которая соединяет векторизированные текстовые фичи и добавляет шкалированный price.

In [0]:
def CreateMatrix(data):
    scaler = MinMaxScaler()
    strings = list(data['description'])
    X1 = description_vectorizer.transform(strings)
    strings = list(data['title'])
    X2 = title_vectorizer.transform(strings)
    price = np.reshape(list(data['price']), (-1, 1))
    X3 = scipy.sparse.csr_matrix(scaler.fit_transform(price))
    return scipy.sparse.hstack([X1, X2, X3])

In [0]:
X = CreateMatrix(train_data)

In [87]:
X

<489517x499033 sparse matrix of type '<class 'numpy.float64'>'
	with 23155214 stored elements in COOrdinate format>

Разобьем данные на обучающую и тестовую выборки

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, train_data['category_id'], test_size=0.33)

Применим несколько линейных классификаторов с разными loss-функциями

In [89]:
clf_linear_log = linear_model.SGDClassifier(loss='log', alpha=0.000002, tol = 0.0001, verbose=1, n_jobs=2)
clf_linear_log.fit(X_train, y_train)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1-- Epoch 1

Norm: 67.10, NNZs: 498618, Bias: -6.934424, T: 327976, Avg. loss: 0.013929
Total training time: 0.46 seconds.
-- Epoch 2
Norm: 75.59, NNZs: 498618, Bias: -6.583935, T: 327976, Avg. loss: 0.008966
Total training time: 0.49 seconds.
-- Epoch 2
Norm: 61.81, NNZs: 498618, Bias: -6.925384, T: 655952, Avg. loss: 0.008373
Total training time: 0.92 seconds.
-- Epoch 3
Norm: 69.43, NNZs: 498618, Bias: -6.666651, T: 655952, Avg. loss: 0.005422
Total training time: 0.97 seconds.
-- Epoch 3
Norm: 60.38, NNZs: 498618, Bias: -6.952255, T: 983928, Avg. loss: 0.007867
Total training time: 1.37 seconds.
-- Epoch 4
Norm: 68.31, NNZs: 498618, Bias: -6.662677, T: 983928, Avg. loss: 0.005245
Total training time: 1.44 seconds.
-- Epoch 4
Norm: 60.00, NNZs: 498618, Bias: -6.946579, T: 1311904, Avg. loss: 0.007694
Total training time: 1.81 seconds.
-- Epoch 5
Norm: 67.49, NNZs: 498618, Bias: -6.690947, T: 1311904, Avg. loss: 0.005158
Total training time: 1.90 seconds.
-- Epoch 5
Norm: 59

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.9min


Norm: 125.23, NNZs: 498618, Bias: -4.250195, T: 327976, Avg. loss: 0.054819
Total training time: 0.50 seconds.
-- Epoch 2
Norm: 78.09, NNZs: 498618, Bias: -5.725332, T: 1311904, Avg. loss: 0.009695
Total training time: 1.93 seconds.
-- Epoch 5
Norm: 115.23, NNZs: 498618, Bias: -4.275612, T: 655952, Avg. loss: 0.031176
Total training time: 1.00 seconds.
-- Epoch 3
Norm: 77.52, NNZs: 498618, Bias: -5.766517, T: 1639880, Avg. loss: 0.009582
Total training time: 2.42 seconds.
-- Epoch 6
Norm: 112.41, NNZs: 498618, Bias: -4.385624, T: 983928, Avg. loss: 0.029089
Total training time: 1.50 seconds.
-- Epoch 4
Norm: 77.37, NNZs: 498618, Bias: -5.758311, T: 1967856, Avg. loss: 0.009487
Total training time: 2.91 seconds.
-- Epoch 7
Norm: 111.14, NNZs: 498618, Bias: -4.344625, T: 1311904, Avg. loss: 0.028257
Total training time: 1.99 seconds.
-- Epoch 5
Norm: 76.97, NNZs: 498618, Bias: -5.781539, T: 2295832, Avg. loss: 0.009412
Total training time: 3.39 seconds.
-- Epoch 8
Norm: 110.41, NNZs: 498

[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  2.2min finished


SGDClassifier(alpha=2e-06, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=2, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.0001,
       validation_fraction=0.1, verbose=1, warm_start=False)

In [90]:
y_pred_linear_log = clf_linear_log.predict(X_test)
accuracy_score(y_test, y_pred_linear_log)

0.8898793495149838

In [114]:
clf_linear_hinge = linear_model.SGDClassifier(loss='hinge', alpha=0.000005, tol = 0.001, verbose=1, n_jobs=2)
clf_linear_hinge.fit(X_train, y_train)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1-- Epoch 1

Norm: 36.45, NNZs: 21281, Bias: -2.692035, T: 327976, Avg. loss: 0.010767
Total training time: 0.30 seconds.
-- Epoch 2
Norm: 42.03, NNZs: 31401, Bias: -2.431941, T: 327976, Avg. loss: 0.007331
Total training time: 0.31 seconds.
-- Epoch 2
Norm: 30.82, NNZs: 28328, Bias: -2.209275, T: 655952, Avg. loss: 0.005524
Total training time: 0.56 seconds.
-- Epoch 3
Norm: 35.90, NNZs: 40759, Bias: -2.022025, T: 655952, Avg. loss: 0.003113
Total training time: 0.57 seconds.
-- Epoch 3
Norm: 29.01, NNZs: 32804, Bias: -1.994368, T: 983928, Avg. loss: 0.004809
Total training time: 0.81 seconds.
-- Epoch 4
Norm: 33.45, NNZs: 45123, Bias: -1.863073, T: 983928, Avg. loss: 0.002580
Total training time: 0.82 seconds.
-- Epoch 4
Norm: 28.40, NNZs: 35990, Bias: -1.863281, T: 1311904, Avg. loss: 0.004474
Total training time: 1.07 seconds.
-- Epoch 5
Norm: 32.65, NNZs: 48372, Bias: -1.781697, T: 1311904, Avg. loss: 0.002336
Total training time: 1.07 seconds.
-- Epoch 5
Norm: 27.75, NNZ

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.2min


Norm: 44.29, NNZs: 43933, Bias: -1.886634, T: 327976, Avg. loss: 0.013602
Total training time: 0.40 seconds.
-- Epoch 2
Norm: 65.15, NNZs: 89206, Bias: -1.636458, T: 327976, Avg. loss: 0.042720
Total training time: 0.41 seconds.
-- Epoch 2
Norm: 38.14, NNZs: 56698, Bias: -1.612635, T: 655952, Avg. loss: 0.007139
Total training time: 0.80 seconds.
-- Epoch 3
Norm: 55.56, NNZs: 113824, Bias: -1.447554, T: 655952, Avg. loss: 0.023272
Total training time: 0.80 seconds.
-- Epoch 3
Norm: 36.00, NNZs: 63717, Bias: -1.510001, T: 983928, Avg. loss: 0.006077
Total training time: 1.19 seconds.
-- Epoch 4
Norm: 52.20, NNZs: 128720, Bias: -1.341779, T: 983928, Avg. loss: 0.020359
Total training time: 1.20 seconds.
-- Epoch 4
Norm: 35.19, NNZs: 68965, Bias: -1.448826, T: 1311904, Avg. loss: 0.005606
Total training time: 1.58 seconds.
-- Epoch 5
Norm: 50.89, NNZs: 139505, Bias: -1.282272, T: 1311904, Avg. loss: 0.018995
Total training time: 1.59 seconds.
-- Epoch 5
Norm: 34.60, NNZs: 73218, Bias: -1.

[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  1.4min finished


SGDClassifier(alpha=5e-06, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=2, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=1, warm_start=False)

In [115]:
y_pred_linear_hinge = clf_linear_hinge.predict(X_test)
accuracy_score(y_test, y_pred_linear_hinge)

0.8920645532713057

In [93]:
clf_linear_huber = linear_model.SGDClassifier(loss='modified_huber', alpha=0.00001, tol = 0.0001,
                                              verbose=1, n_jobs=2, learning_rate='constant', eta0=0.01)
clf_linear_huber.fit(X_train, y_train)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1-- Epoch 1

Norm: 8.34, NNZs: 264113, Bias: -0.932344, T: 327976, Avg. loss: 0.022331
Total training time: 0.40 seconds.
Norm: 10.06, NNZs: 301830, Bias: -0.918425, T: 327976, Avg. loss: 0.023700
Total training time: 0.40 seconds.
-- Epoch 2
-- Epoch 2
Norm: 11.96, NNZs: 303558, Bias: -0.963340, T: 655952, Avg. loss: 0.008843
Total training time: 0.77 seconds.
-- Epoch 3
Norm: 9.58, NNZs: 267104, Bias: -0.967917, T: 655952, Avg. loss: 0.010914
Total training time: 0.78 seconds.
-- Epoch 3
Norm: 13.21, NNZs: 304212, Bias: -0.992001, T: 983928, Avg. loss: 0.007129
Total training time: 1.14 seconds.
-- Epoch 4
Norm: 10.49, NNZs: 268728, Bias: -0.993182, T: 983928, Avg. loss: 0.009725
Total training time: 1.17 seconds.
-- Epoch 4
Norm: 14.19, NNZs: 304608, Bias: -1.007236, T: 1311904, Avg. loss: 0.006199
Total training time: 1.52 seconds.
-- Epoch 5
Norm: 11.33, NNZs: 269861, Bias: -1.009290, T: 1311904, Avg. loss: 0.008990
Total training time: 1.56 seconds.
-- Epoch 5
Norm: 14.9

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  4.1min


Norm: 17.40, NNZs: 365059, Bias: -1.003482, T: 2951784, Avg. loss: 0.008822
Total training time: 3.69 seconds.
-- Epoch 10
Norm: 9.92, NNZs: 424459, Bias: -0.850459, T: 327976, Avg. loss: 0.061534
Total training time: 0.45 seconds.
-- Epoch 2
Norm: 17.91, NNZs: 365332, Bias: -1.014404, T: 3279760, Avg. loss: 0.008528
Total training time: 4.10 seconds.
-- Epoch 11
Norm: 13.54, NNZs: 434857, Bias: -0.883532, T: 655952, Avg. loss: 0.042249
Total training time: 0.89 seconds.
-- Epoch 3
Norm: 18.40, NNZs: 365586, Bias: -1.016792, T: 3607736, Avg. loss: 0.008270
Total training time: 4.48 seconds.
-- Epoch 12
Norm: 16.07, NNZs: 436150, Bias: -0.892345, T: 983928, Avg. loss: 0.037739
Total training time: 1.29 seconds.
-- Epoch 4
Norm: 18.86, NNZs: 365847, Bias: -1.022540, T: 3935712, Avg. loss: 0.008037
Total training time: 4.88 seconds.
-- Epoch 13
Norm: 18.10, NNZs: 436622, Bias: -0.908187, T: 1311904, Avg. loss: 0.035004
Total training time: 1.72 seconds.
-- Epoch 5
Norm: 19.28, NNZs: 36604

[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  4.9min finished


SGDClassifier(alpha=1e-05, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
       l1_ratio=0.15, learning_rate='constant', loss='modified_huber',
       max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=2,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=1, warm_start=False)

In [94]:
y_pred_linear_huber = clf_linear_huber.predict(X_test)
accuracy_score(y_test, y_pred_linear_huber)

0.8931354888232709

Определим предсказание модели, как голосование среди обученных классификаторов:

In [0]:
Y = np.array([y_pred_linear_log, y_pred_linear_hinge, y_pred_linear_huber])

In [0]:
def voting(array):
    counter = Counter()
    for x in array:
        counter[x] += 1
    return counter.most_common()[0][0]
def ensemble(X):
    ans = np.copy(X[0, :])
    for i in range(len(ans)):
        ans[i] = voting(X[:, i])
    return ans

Итоговая точность на тестовой выборке

In [118]:
y_pred_ens = ensemble(np.array(Y))
accuracy_score(y_test, y_pred_ens)

0.8933892943587077

Применим классификатор к тестовому файлу

In [0]:
X_test_data = CreateMatrix(test_data)

In [0]:
y_test_pred_linear_log = clf_linear_log.predict(X_test_data)
y_test_pred_linear_hinge = clf_linear_hinge.predict(X_test_data)
y_test_pred_linear_huber = clf_linear_huber.predict(X_test_data)

In [0]:
Y_test_data = np.array([y_test_pred_linear_log, y_test_pred_linear_hinge, y_test_pred_linear_huber])

In [0]:
y_pred_test_data = ensemble(Y_test_data)

In [0]:
test_data['category_id'] = y_pred_test_data

In [113]:
test_data.head()

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
489517,стоик журнальн стал,прод журнальн столик изготавлива стол полирова...,10000.0,22
489518,iphon 64gb,телефон хорош состоян комплект гарант салон со...,12500.0,0
489519,утеплител,теплопел лидер тепл толщин утеплител 20 мм 30 ...,250.0,15
489520,пальт демисезон,прод пальт женск букл отличн состоян длин изде...,1700.0,33
489521,samsung syncmaster t200n,условн рабоч проблем панел настройк монитор пе...,1000.0,13


In [0]:
test_data.to_csv('updated_test.csv')