## Best model for now: 

model = RandomForestClassifier(n_estimators=100, class_weight= None, criterion='entropy', max_features='sqrt', random_state=123)

[0.99115753 0.9919968  0.99133321 0.99133818 0.9915882 ]
avg: 0.9914827862478924

Train: 0.9999779010926789
Test: 0.9925608234263946

## Импорт библиотек, назначение стоп-слов и констант

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords

%matplotlib inline

# Plot defaults for the notebook: 'ggplot' style sheet and a (12, 8) figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# Russian stop words from NLTK; tokens found in this set are dropped during tokenisation.
stops = set(stopwords.words('russian'))
# Path the trained word vectors (model.wv) are saved to.
KEYED_VECTORS_FILE = 'w2v_allwords_model'
# Path of the full Word2Vec model that prepare_w2v loads, trains and saves back.
WORD2VEC_FILE = "word2vec_all_words.model"
# Number of components per word vector; also the number of feature columns per text field.
embedding_dim = 100
# One-time bootstrap, kept commented out: creates an empty model and saves it to
# WORD2VEC_FILE. prepare_w2v loads that file, so it has to exist before training.
# w2v = Word2Vec(min_count=1, size=embedding_dim, workers=4)
# w2v.save(WORD2VEC_FILE)
# del w2v

## Функции для предобработки данных

In [None]:
# Anything from '<' up to the nearest following '>'. DOTALL lets '.' match
# line breaks, so a tag that is wrapped over several lines is removed too.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def striphtml(data):
    """Return ``data`` with HTML tags removed.

    Args:
        data: Text that may contain HTML markup.

    Returns:
        The text with every ``<...>`` span deleted; the text between tags is
        kept unchanged.
    """
    return _HTML_TAG_RE.sub('', data)

def preproc_data(df_input):
    """Return a copy of ``df_input`` with HTML tags stripped from its text columns.

    The 'name' and 'description' columns are passed through ``striphtml``
    value by value. The input frame itself is not modified.
    """
    cleaned = df_input.copy()
    for column in ('name', 'description'):
        cleaned[column] = cleaned[column].map(striphtml)
    return cleaned
        
def prepare_w2v(path, update):
    """Train, or continue training, the saved word2vec model on one CSV file.

    The model stored at WORD2VEC_FILE is loaded, its vocabulary is built or
    extended from the 'name' and 'description' columns of the file, the model
    is trained on those sentences and saved back to WORD2VEC_FILE. Its word
    vectors are additionally saved to KEYED_VECTORS_FILE.

    Args:
        path: Tab-separated file with 'name' and 'description' columns.
            'other.csv' is read and trained on in three slices instead of in
            one piece.
        update: Forwarded to ``build_vocab(update=...)`` for every slice.
    """
    word_pattern = re.compile(r'\w+')

    def tokenize(text):
        # Lower-cased word tokens of ``text`` with stop words removed.
        lowered = (token.lower() for token in word_pattern.findall(text))
        return [token for token in lowered if token not in stops]

    def iterate_rows(df):
        # Load the saved model, train it on the rows of ``df`` and save it back.
        print('loading model')
        model = Word2Vec.load(WORD2VEC_FILE)
        sentences = []
        for i, row in df.iterrows():
            if i % 10000 == 0:
                print("Currently on row: {}; Currently iterrated {}% of rows".format(i, (i + 1)/len(df.index) * 100))

            # Name and description go in as two separate sentences.
            sentences.append(tokenize(row['name']))
            sentences.append(tokenize(row['description']))

        print('building vocab. update = ', update)
        model.build_vocab(sentences, update=update)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        del sentences
        model.save(WORD2VEC_FILE)
        word_vectors = model.wv
        word_vectors.save(KEYED_VECTORS_FILE)
        print('\n')
        print('Vocab:', len(word_vectors.vocab))
        print('='*10)
        del model
        del word_vectors

    # `sep` is passed by keyword: pandas 2.x accepts only the path positionally.
    if path == 'other.csv':
        # Original note here: 594533 (presumably the row count of other.csv).
        # The file is handled in slices of 200000 rows. With an integer
        # `skiprows`, the first line that is not skipped is consumed as the
        # header (header=0); that line is the last data row of the previous
        # slice, so no row is lost or read twice.
        slice_rows = 200000
        slices = (
            {'nrows': slice_rows},
            {'skiprows': slice_rows, 'nrows': slice_rows},
            {'skiprows': 2 * slice_rows},
        )
        for slice_kwargs in slices:
            df_1 = pd.read_csv(path, sep='\t', names=['name', 'description'], header=0, **slice_kwargs)
            iterate_rows(df_1)
            del df_1
    else:
        df_1 = pd.read_csv(path, sep='\t', usecols=['name', 'description'])
        iterate_rows(df_1)
        del df_1

def get_avg_vector(sentence):
    """Return the mean word vector of the words in ``sentence``.

    The sentence is split into word tokens and lower-cased. Stop words and
    words that are not in the ``w2v`` vocabulary are skipped.

    Relies on the module-level ``w2v``, ``stops`` and ``embedding_dim``.

    Args:
        sentence: Text to vectorise.

    Returns:
        A 1-D array with one component per embedding dimension. It is all
        zeros (length ``embedding_dim``) when no word of the sentence has a
        vector.
    """
    tokens = (token.lower() for token in re.findall(r'\w+', sentence))
    vectors = [w2v[token] for token in tokens
               if token not in stops and token in w2v.vocab]
    if not vectors:
        return np.zeros(embedding_dim)
    # Average over the number of words that contributed. The sum used to be
    # divided by the length of the vector itself, which only rescaled it by a
    # constant and did not produce an average.
    return np.mean(vectors, axis=0)

def create_vectorized_df(df_input):
    """Turn a cleaned frame into a purely numeric feature frame.

    'name' and 'description' are each mapped to a sentence vector with
    ``get_avg_vector`` and then spread over ``embedding_dim`` columns named
    'name_x<i>' and 'description_x<i>'. The 'id', 'name' and 'description'
    columns are removed from the result. The input frame is not modified.
    """
    column_templates = (('name', 'name_x%s'), ('description', 'description_x%s'))
    vectorized = df_input.copy()

    # Replace every text cell with its sentence vector.
    for column, _ in column_templates:
        vectorized[column] = vectorized[column].map(get_avg_vector)

    # Transpose the per-row vectors into per-component sequences and store
    # each component as its own column.
    for column, template in column_templates:
        components = list(zip(*vectorized[column]))
        for dim in range(embedding_dim):
            vectorized[template % (dim)] = components[dim]

    return vectorized.drop(['id', 'name', 'description'], axis=1)

## Обучение word2vec и полная предобработка данных

In [None]:
# Load the tab-separated training set. `sep` is passed by keyword because
# pandas 2.x accepts only the path as a positional argument.
df_train = pd.read_csv('train.csv', sep='\t')
df_train.head()

In [None]:
# Strip HTML from the text columns, then print the shape before and after.
df = preproc_data(df_train)
df.head()
for frame in (df_train, df):
    print(frame.shape)

In [None]:
# The first file is processed with update=False, every later file with update=True.
datasets = ['train.csv', 'test.csv', 'other.csv']
for i, path in enumerate(datasets):
    prepare_w2v(path, update=(i != 0))

In [None]:
# get_avg_vector reads the global `w2v`, so the saved word vectors have to be
# loaded before any frame is vectorised.
w2v = KeyedVectors.load(KEYED_VECTORS_FILE)

In [None]:
# Turn the cleaned training frame into numeric features and write them to disk.
df_vectorized = df.pipe(create_vectorized_df)
df_vectorized.info()
df_vectorized.to_csv('vectorized_w2v_allwords.csv', index=False)

In [None]:
# Reload the features from 'vectorized_w2v_allwords.csv'.
df_vectorized = pd.read_csv('vectorized_w2v_allwords.csv')
df_vectorized.head()

In [None]:
# Summary statistics of the feature frame.
df_vectorized.describe()

In [None]:
print(df_vectorized.shape)
# 'target' is the label column; every other column is used as a feature.
y = df_vectorized['target']
X = df_vectorized.drop(["target"], axis=1)
print(X.shape, y.shape)

In [None]:
from collections import Counter
# Number of rows per target value.
Counter(y)

# Машинное обучение

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
# train_test_split is imported from model_selection: the sklearn.cross_validation
# module it used to come from was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score, train_test_split

try:
    from sklearn.externals.joblib import parallel_backend
except ImportError:
    # scikit-learn 0.23+ no longer ships sklearn.externals.joblib; use the
    # standalone joblib package it depends on.
    from joblib import parallel_backend


In [None]:
# Hold out 25% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=23)

## Поиск лучшего леса

In [None]:
# Grid search over class weighting, split criterion and max_features for a
# 100-tree random forest, scored by ROC AUC on 5 stratified shuffled folds.
model = RandomForestClassifier(n_estimators=100, random_state=123)
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
param_grid = {
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'max_features': [None, 'log2', 'sqrt'],
}
grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    verbose=100,
    cv=cv,
    n_jobs=-1,
    scoring='roc_auc',
)

with parallel_backend('threading'):
    grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
model = grid_search.best_estimator_

# ROC AUC of the best estimator on the training and the held-out part.
for label, features, labels in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    predict = model.predict_proba(features)
    score = roc_auc_score(labels, predict[:, 1])
    print(label, score)

In [None]:
# Same settings as listed under "Best model for now" at the top of the notebook.
model = RandomForestClassifier(n_estimators=100, class_weight= None, criterion='entropy', max_features='sqrt', random_state=123)
model.fit(X_train, y_train)

In [None]:
# ROC AUC of the fitted model on the training and the held-out part.
for label, features, labels in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    predict = model.predict_proba(features)
    score = roc_auc_score(labels, predict[:, 1])
    print(label, score)

# Commit

In [None]:
# Load the tab-separated test set. `sep` is passed by keyword because
# pandas 2.x accepts only the path as a positional argument.
df_test = pd.read_csv('test.csv', sep='\t')
df_test.head()

# Same cleaning as for the training data. Note that `df` is rebound here.
df = df_test.pipe(preproc_data)
print(df_test.shape)
print(df.shape)

# Vectorise the test rows and write them to disk. `df_vectorized` is rebound too.
df_vectorized = df.pipe(create_vectorized_df)
df_vectorized.info()
df_vectorized.to_csv('vectorized_test_allwords.csv', index=False)

In [None]:
# Reload the test features from 'vectorized_test_allwords.csv'.
df_vectorized = pd.read_csv('vectorized_test_allwords.csv')
df_vectorized.head()

In [None]:
X = df_vectorized.as_matrix()
X.shape

In [None]:
model = RandomForestClassifier(n_estimators=100, class_weight= None, criterion='entropy', max_features='sqrt', random_state=123)
model.fit(X, y)

In [None]:
subs = pd.read_csv('sampleSubmission.csv', ',')
subs.head()

In [None]:
predict = model.predict_proba(X)[:,1]
subs['target'] = predict
subs.to_csv('subs_allwords.csv', sep =',', index=False)
subs.head()

## P.S
Также был другой вариант предобработки: средние векторы word2vec, умноженные на tf-idf-скоры соответствующих слов. Получилось чуть хуже, обидненько.