In [1]:
import numpy as np 
import pandas as pd
import os
import gc

# regex libraries
import re
import regex
import string

# tokenizer library
import codecs
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# standard directories
data_dir = '../input/kaggle_data'
plain_text_dir = '../input/plain_text'
fasttext_dir = '../fasttext'

### extract text

In [3]:
train = pd.read_csv(f'{data_dir}/train.csv', usecols=['title','description'])
train_input = train['title'].str.cat([
    train['description']
], sep=' ',na_rep='').astype(str).fillna('missing')

del train
gc.collect()

KeyboardInterrupt: 

In [6]:
train_active = pd.read_csv(f'{data_dir}/train_active.csv', usecols=['title','description'])
train_active_input = train_active['title'].str.cat([
    train_active['description']
], sep=' ',na_rep='').astype(str).fillna('missing')

del train_active
gc.collect()

7

In [7]:
test = pd.read_csv(f'{data_dir}/test.csv', usecols=['title','description'])
test_input = test['title'].str.cat([
    test['description']
], sep=' ',na_rep='').astype(str).fillna('missing')

del test
gc.collect()

7

In [8]:
test_active = pd.read_csv(f'{data_dir}/test_active.csv', usecols=['title','description'])
test_active_input = test_active['title'].str.cat([
    test_active['description']
], sep=' ', na_rep='').astype(str).fillna('missing')

del test_active
gc.collect()

7

In [16]:
def clean_text(text):
    text = bytes(text, encoding="utf-8")
    text = text.lower()
    text = re.sub(b'(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', b' ', text)
    text = re.sub(b'\s+(?=\d)|(?<=\d)\s+', b' ', text)
    text = text.replace(b"\b", b" ")
    text = text.replace(b"\r", b" ")
    text = regex.sub(b"\s+", b" ", text)
    text = str(text, 'utf-8')
    text = re.sub(r"\W+", " ", text.lower())
    return text

In [32]:
# create full corpus
text_input = train_input + train_active_input + test_input + test_active_input

In [38]:
text_input = list(pd.concat([train_input, train_active_input, test_input, test_active_input], axis=0).values)
text_input

['Кокоби(кокон для сна) Кокон для сна малыша,пользовались меньше месяца.цвет серый',
 'Стойка для Одежды Стойка для одежды, под вешалки. С бутика.',
 'Philips bluray В хорошем состоянии, домашний кинотеатр с blu ray, USB. Если настроить, то работает смарт тв /\nТорг',
 'Автокресло Продам кресло от0-25кг',
 'ВАЗ 2110, 2003 Все вопросы по телефону.',
 'Авто люлька В хорошем состоянии',
 'Водонагреватель 100 литров нержавейка плоский Электро водонагреватель накопительный на 100 литров Термекс ID 100V, плоский, внутренний бак из нержавейки, 2 кВт, б/у 2 недели, на гарантии.',
 'Бойфренды colins Бойфренды в хорошем состоянии.',
 'Платье 54 раз мер очень удобное',
 'Полу ботиночки замш натур.Бамбини По стельке 15.5см мерить приокский район. Цвет темнее чем на фото',
 '1-к квартира, 25 м², 2/2 эт. Сдается однокомнатная мебелированная квартира квартира. Ежемесячная плата 18 тыс.р. + свет.',
 'Джинсы ',
 'Атласы и Контурныя карты за 8 класс Атлас история нового времени,19 век./\nКонтурные карты

In [39]:
# preprocess corpus
text_output = [clean_text(x) for x in text_input]
train_output = [clean_text(x) for x in train_input] 
test_output = [clean_text(x) for x in test_input] 

In [40]:
def save_text(file_name, text_array):
    f = open(f'{plain_text_dir}/{file_name}.txt', 'w', encoding='utf-8')
    for item in text_array: f.write('%s\n' % item)

save_text('avito_text', text_output)
save_text('train_avito_text', train_output)
save_text('test_avito_text',  test_output)

### Tokenize

In [41]:
token_dir = '../input/tokens'

In [42]:
# load data
def read_comments(file_name):
    file_name = f'{plain_text_dir}/{file_name}.txt'
    text_list = [x for x in open(file_name, encoding='utf-8')]
    pattern = re.compile(u' |\n')
    text = {word for text in text_list for word in re.split(pattern, text.strip())}
    return text

# tokenize words
def tokenize(text_cleaned):
    word_tokenizer = Tokenizer(filters='', lower=False)
    word_tokenizer.fit_on_texts(text_cleaned)
    return list(word_tokenizer.word_index.keys())

# save data
def save_tokens(tokens):
    save_name = f'{token_dir}/avito_tokens.txt'
    file = codecs.open(save_name, 'w', 'utf-8-sig')
    for i in tokens: 
        file.write(i +'\n')
    file.close()  

In [43]:
text = read_comments('avito_text')
tokens = tokenize(text)
save_tokens(tokens)

In [44]:
with open(f'{token_dir}/avito_tokens.txt', encoding='utf-8') as token_text:
    tmp = [next(token_text) for x in range(50)]
print(tmp)

['\ufeffслезак\n', 'пoклeйкa\n', 'вощдушный\n', 'zxcv234\n', '1пинетки\n', 'невыпирает\n', 'водаканал\n', 'электроэпергии\n', 'ljm\n', 'фулатх\n', '7700тыс\n', 'хрущевка\n', 'обнавлениия\n', 'патиной\n', 'hkx\n', 'авподатчиком\n', 'адьютанты\n', '50х35х6\n', 'вoзрaст\n', 'скандинавскай\n', 'knortiba\n', 'углегорского\n', 't90\n', 'закрепленый\n', 'миротворческие\n', '33w\n', 'lidellen\n', '84341\n', 'рво1\n', 'шерек\n', 'окисляеться\n', 'подвязав\n', 'ниагара\n', 'новомосковская\n', 'производительdeceuninck\n', 'рарантируем\n', '800900\n', 'camorder\n', 'в4с\n', 'rj60\n', 'развешивай\n', 'квашеная3л\n', 'задниe\n', 'с2209\n', 'пневмогидравлическим\n', 'зуевскими\n', 'amvei\n', 'рм420\n', 'новомосковсго\n', 'эскадренный\n']


### create FastText embeddings on raw text

```
#!/bin/bash
DATA_DIR='../input'  
EMBED_DIR='../input/embeddings'  
TOKEN_DIR='../input/tokens'  

rm_first_line()
{
  FILE_NAME=$1
  tail -n +2 "${FILE_NAME}" > "tmp.txt"
  mv -f "tmp.txt" "${FILE_NAME}"
}

train_word_embeddings()
{
  INPUT_FILE="${DATA_DIR}/plain_text/avito_text.txt"
  OUTPUT_FILE="${EMBED_DIR}/avito_fasttext_300d"

  # train fasttext vectors
  D:/Dropbox/avito/FastText/fasttext skipgram -input "${INPUT_FILE}" -output "${OUTPUT_FILE}" \
  -minCount 1 -neg 25 -thread 7 -dim 300

  # raname and remove first line of file
  mv "${OUTPUT_FILE}.vec" "${OUTPUT_FILE}.txt"
  rm_first_line "${OUTPUT_FILE}.txt"
}

train_word_embeddings
```

### Impute missing embeddings

In [5]:
# remove header from vector file

In [3]:
import torch
from sklearn.preprocessing import normalize  
# import embedding_utils

In [5]:
embed_dir = '../input/embeddings'

In [23]:
local_list = [
  f'{embed_dir}/avito_fasttext_300d.txt',
]

In [24]:
# with open(f'{embed_dir}/avito_fasttext_300d.txt', encoding='utf-8') as token_text:
#     tmp = [next(token_text) for x in range(1)]
# print(tmp)

In [8]:
external_list = [
  f'{embed_dir}/cc.ru.300.txt', 
  f'{embed_dir}/wiki.ru.txt',
]

In [25]:
# with open(f'{embed_dir}/cc.ru.300.txt', encoding='utf-8') as token_text:
#     tmp = [next(token_text) for x in range(1)]
# print(tmp)

In [10]:
def get_save_name(local, external):
    def get_base_name(file_name):
        base = file_name.split('/')
        base = base[len(base)-1]
        base = re.sub('.txt', '', base)
        return base

    return f'{embed_dir}/imputed/'\
        f'{get_base_name(local)}_impute_'\
        f'{get_base_name(external)}.txt'

In [11]:
# parameter -neg is 5 so 25 # increase negative sampling increases accuracy toxic competition fasttext was fast 

In [30]:
def read_embedding(file_name):
    """
    Reads into memory a numpy array of words and a
    corresponding dictionary of numpy word vectors.
    Args:
      file_name: Name of the file to read.
    Returns:
      A tuple of word array and word vector dictionary.
    """

    f = open(file_name, encoding='utf-8')
    word_vectors = {}
    split_pattern = re.compile(u' |\n')
    
#     i = 0 
    for line in f:
        split_line = re.split(split_pattern, line.strip())
        word_vectors[split_line[0]] = np.asarray(split_line[1:], dtype='float32')
#         if i > 6: break 
#         i += 1

    return word_vectors


def write_embedding(vectors, save_name):
    """
    Saves external word vector to disk.
    Args:
      save_name: Name of the file to save.
    Returns:
      None
    """  
    fwrite = open(save_name, 'w', encoding='utf-8')
    for word, vec in vectors.items():
        fwrite.write(word + ' ' + ' '.join(vec.astype(str)) + '\n')
    fwrite.close()


def impute_missing(local_vectors, external_vectors, vectorized=False, use_gpu=False, chunk_size=500):
    """
    For each missing word in the external dataset, find the most
    similar word present in both the local and external dataset.
    Args:
    vectorized: If True the missing words are updated one at a time
      in a for loop, otherwise the computationis vectorized.
      (default is True).  Ignored if use_gpu=True.
    use_gpu: If True use gpu to do the computation. (default is False)
    chunk_size: number of words to process on the gpu at once 
    (defaults is 500)
    Returns:
    Dictionary of imputed word vectors.
    """
    
    import torch
    from sklearn.preprocessing import normalize

    # find missing words
    print(local_vectors.keys())
    local_words, external_words = list(local_vectors.keys()), list(external_vectors.keys())
    shared_words = np.intersect1d(local_words, external_words)

    missing_words = np.setdiff1d(local_words, external_words)

    # create reference matrix
    reference_matrix = np.array([local_vectors[w] for w in shared_words])
    reference_matrix = normalize(reference_matrix).T # word vectors are columns - comment out

    # create lookup matrix
    lookup_matrix = np.array([local_vectors[w] for w in missing_words])
    #n p.array breaking numbers not as an array
    try:
        lookup_matrix = normalize(lookup_matrix)
    except:
        print(lookup_matrix)
    
    # perform lookup
    if use_gpu:

        # setup
        n_lookups = lookup_matrix.shape[0]
        n_chunks = n_lookups//chunk_size+1

        # convert to numpy array to torch tensors
        dtype = torch.cuda.FloatTensor  
        def np2tc(x): return torch.from_numpy(x).type(dtype)
        reference_matrix_gpu = np2tc(reference_matrix)
    
        # iterate through chunks
        for i in range(n_chunks):
            chunk_indexs = slice(chunk_size*i, min(chunk_size*(i+1), n_lookups))
            similarity = torch.mm(np2tc(lookup_matrix[chunk_indexs]), reference_matrix_gpu)
            _, similar_indexs = torch.max(similarity, 1)
            similar_words = shared_words[np.array(similar_indexs)]
            for m,s in zip(missing_words[chunk_indexs], similar_words):
                external_vectors[m] = external_vectors[s] 

    else: 

        if vectorized:
            for w in missing_words:
                similarity = np.matmul(local_vectors[w], reference_matrix)
                similar_word = shared_words[np.argmax(similarity)]
                external_vectors[w] = external_vectors[similar_word]

        else: 
            similarity = np.matmul(lookup_matrix, reference_matrix)
            similar_words = shared_words[np.argmax(similarity, axis=1)]
            for m,s in zip(missing_words, similar_words):
                external_vectors[m] = external_vectors[s]

    # keep only local words
    imputed_vectors = local_vectors
    for w in imputed_vectors:
        imputed_vectors[w] = external_vectors[w]
  
    return imputed_vectors

In [None]:
for local_file in local_list:
  
    # load local vectors
    local_vectors = read_embedding(local_file)

    for external_file in external_list[::-1]:

        # load external vectors
        external_vectors = read_embedding(external_file)
        print('done')
        # impute missing vectors
        imputed_vectors = impute_missing(local_vectors, external_vectors, use_gpu=True)
    
        # save imputed
        save_name = get_save_name(local_file, external_file)
        write_embedding(imputed_vectors, save_name)