In [None]:
import os.path

# Runtime configuration: the ENV variable selects the production code paths.
is_production = os.environ.get('ENV') == 'production'
# HOME may be unset (e.g. in minimal containers); fall back to the
# platform's notion of the home directory so the path below can be built.
home = os.environ.get('HOME') or os.path.expanduser('~')

print('is production', is_production)

# Directory (with trailing slash) where the fastText vector files are stored.
vectors_path = '/tmp/' if is_production else home + '/ml-data/fasttext/'

In [None]:
if not os.path.isfile(vectors_path + 'cc.ru.300.bin'):
    !wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.ru.300.bin.gz -P {vectors_path}
    !gunzip {vectors_path + 'cc.ru.300.bin.gz'}
        
if not os.path.isfile(vectors_path + 'cc.ru.300.vec'):
    !wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.ru.300.vec.gz -P {vectors_path}
    !gunzip {vectors_path + 'cc.ru.300.vec.gz'}

In [None]:
if is_production:
    import pymongo
    import pandas as pd
    from pymongo import MongoClient

    mongo_uri = os.environ['MONGO_URI']
    client = MongoClient(mongo_uri)
    db = client.cubes
    collection = db.events
    data = pd.DataFrame(list(collection.find({ "$where": "this.tags && this.tags.length > 0" }, { 'title': 1, 'text': 1, 'tags': 1, '_id': 0 })))
    data.to_csv('./data.csv')
else:
    !wget https://raw.githubusercontent.com/happylolonly/eventsfree-ml/master/notebooks/tags/data/data.csv

In [None]:
import pandas as pd

In [None]:
import ast

# Load the event dump produced by the previous cell.
df = pd.read_csv('./data.csv')
# Replace literal backslash-n sequences (two characters) with a space.
df = df.replace(r'\\n', ' ', regex=True)

# The tags column is stored as the text of a Python list. Parse it with
# ast.literal_eval, which accepts literals only, instead of eval, which would
# execute arbitrary code found in the CSV. Each row then holds a set of tags.
df['tags'] = df['tags'].map(lambda raw: set(ast.literal_eval(raw)))

df.head()

Remove rare tags: keep only tags that occur more than twice, then drop rows that are left without any tag

In [None]:
from collections import Counter

# Tally how many rows mention each tag.
counter = Counter()
for tags in df['tags'].values:
    counter.update(tags)
print(counter)

# Tags seen more than twice form the candidate whitelist.
arr = [tag for tag in counter if counter[tag] > 2]

print('total count:', len(counter), '\n >2 tags:', len(arr))

In [None]:
# Restrict every row to the frequent tags collected in `arr`.
tags_white_list = set(arr)
df['tags'] = df['tags'].map(lambda tags: set(tags) & tags_white_list)
# Drop rows left without tags. The explicit copy detaches the result from the
# original frame so that later column assignments do not raise
# SettingWithCopyWarning.
df = df[df['tags'].map(len) > 0].copy()

df.shape

In [None]:
# Text-processing dependencies and shared objects for the cleaning helpers
# defined below in this cell.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; no name from it appears to be used in the
# visible code — confirm before removing.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(2018)
import nltk
from stop_words import get_stop_words
# Download the NLTK 'wordnet' resource.
# NOTE(review): WordNetLemmatizer is imported but not used in the visible
# code — confirm whether this download is still needed.
nltk.download('wordnet')
# Russian Snowball stemmer shared by the text-cleaning function.
stemmer = SnowballStemmer('russian')

import json
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text nodes."""

    def __init__(self):
        # Let the base class set up its own parser state instead of relying
        # on reset() alone; character references are decoded into text.
        super().__init__(convert_charrefs=True)
        # Kept so that existing attribute access on instances keeps working.
        self.strict = False
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text chunk joined into a single string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return ``html`` with all markup removed and entities decoded.

    Args:
        html: String that may contain HTML tags and character references.

    Returns:
        The plain text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # The parser holds back trailing text that ends with an unterminated '&'
    # (it may be a character reference cut in half); close() flushes it, so
    # inputs such as 'R&D' are no longer returned as an empty string.
    s.close()
    return s.get_data()

def standardize_text(df, text_field):
    """Clean and stem one text column of ``df`` in place.

    The column is stripped of HTML, URLs and every character that is not a
    Cyrillic (а-я) or Latin letter, lower-cased, and finally reduced to the
    stems of its words, skipping Russian stop words and words of two
    characters or fewer.

    Args:
        df: DataFrame holding the column; it is modified in place.
        text_field: Name of the column to normalise.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # Set membership is O(1); the stop-word list is consulted for every word.
    stop_words = set(get_stop_words('russian'))

    def stem(text):
        return " ".join(
            stemmer.stem(word)
            for word in text.split(" ")
            if len(word) > 2 and word not in stop_words
        )

    # Missing values become empty strings so the string pipeline cannot fail.
    cleaned = df[text_field].fillna('').apply(strip_tags)

    # regex=True is passed explicitly: recent pandas versions treat the
    # pattern as a literal string by default, which would silently disable
    # every substitution below.
    cleaned = cleaned.str.replace(r"http\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"http", "", regex=True)
    # NOTE(review): the range а-я does not contain ё/Ё, so those letters are
    # replaced by a space here — confirm this matches the inference-side
    # preprocessing before changing it.
    cleaned = cleaned.str.replace(r"[^а-яА-Яa-zA-Z]", " ", regex=True)
    cleaned = cleaned.str.replace(r"\s+", " ", regex=True)

    df[text_field] = cleaned.str.lower().apply(stem)

    return df

In [None]:
# Normalise both free-text columns, body first and then the title.
for column in ('text', 'title'):
    df = standardize_text(df, column)

df.head()

In [None]:
# Build one training example per (event, tag) pair. Every event row is
# repeated once for each of its tags, with the tag stored in a `tag` column.
# orient='records' is the supported spelling; the abbreviated 'record' is
# rejected by recent pandas versions. Tags are sorted because set iteration
# order varies between kernel runs, which would change the row order and
# therefore the seeded train/test split.
rows = [
    {**row, 'tag': tag}
    for row in df.to_dict(orient='records')
    for tag in sorted(row['tags'])
]

df = pd.DataFrame(rows)

# Model input: cleaned body followed by the cleaned title.
df['fulltext'] = df['text'] + ' ' + df['title']

df.head(5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, preserving the per-tag proportions.
train, test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['tag'].values,
)

# Show the label distribution of each split.
for split in (train, test):
    print(np.unique(split['tag'].values, return_counts=True))

In [None]:
# %%bash

# pip install -U -q git+https://github.com/facebookresearch/fastText.git

# The Python bindings have been shipped under two module names: `fastText`
# (older source builds) and `fasttext` (current releases). Try the name this
# notebook was written against first, then fall back to the newer one while
# keeping the `fastText` alias used by the training cells.
try:
    import fastText
except ImportError:
    import fasttext as fastText

# Column holding the text that is fed to the model.
FIELD = 'fulltext'
train.head()

In [None]:
def save(df, path, field=None):
    """Write ``df`` to ``path`` in fastText supervised-training format.

    Each row becomes one line of the form ``__label__<tag> <text>``.

    Args:
        df: DataFrame with a ``tag`` column and the text column.
        path: Destination file; it is overwritten.
        field: Name of the text column. Defaults to the module-level FIELD.
    """
    if field is None:
        field = FIELD
    # The data is Cyrillic text: fix the encoding instead of depending on the
    # platform default.
    with open(path, 'w', encoding='utf-8') as f:
        for tag, text in zip(df['tag'], df[field]):
            f.write('__label__{} {}\n'.format(tag, text))

In [None]:
# Write the training split to the file that the training cells read.
# Alternative kept for reference: dump the whole frame instead of the split.
# save(df, '/tmp/train.data')
save(train, '/tmp/train.data')
# Print the training frame for inspection.
print(train)

In [None]:
%%time

if not is_production:
    print('Start train not production')
    # Development run: start from the pretrained Russian vectors.
    dev_params = dict(
        pretrainedVectors=vectors_path + 'cc.ru.300.vec',
        dim=300,
        thread=15,
        epoch=50,
        verbose=8,
    )
    model = fastText.train_supervised('/tmp/train.data', **dev_params)

    # Quantize the model, then store it where the server loads it from.
    model.quantize(qnorm=True, cutoff=100000)
    model.save_model('../../server/ml/tags/model/tags_model_new')
    print('Model updated')
    

In [None]:
# labels, probs = model.predict(test[FIELD].tolist(), k=5)
# labels = [ll[0].replace('__label__', '') for ll in labels]

In [None]:
# from sklearn.metrics import classification_report
# print(classification_report(test['tag'].values, labels))

## Production model

In [None]:
%%time

if is_production:
    print('Start train production')
    # Production run: train on every row, not only on the training split.
    save(df, '/tmp/train.data')
    # Unlike the development run, no pretrainedVectors are passed here.
    production_params = dict(
        dim=300,
        thread=15,
        epoch=50,
        verbose=8,
    )
    model = fastText.train_supervised('/tmp/train.data', **production_params)

    # Quantize the model, then store it where the server loads it from.
    model.quantize(qnorm=True, cutoff=100000)
    model.save_model('../../server/ml/tags/model/tags_model_new')
    print('Production model updated')

In [None]:
# The exact-match evaluation below compares the top-N predicted labels of a
# row with its N true tags, so the model must return at least as many labels
# as the largest tag set. With k=1 no multi-tag row could ever match.
max_tags = max(map(len, test['tags']), default=1)
labels, probs = model.predict(test[FIELD].tolist(), k=max_tags)

In [None]:
# Exact-match accuracy: a row is correct when its top-N predicted labels
# (N = number of true tags) equal its set of true tags.
mistakes = 0
for predicted, expected in zip(labels, test['tags']):
    top_labels = {label.replace('__label__', '') for label in predicted[:len(expected)]}

    if expected != top_labels:
        print(expected, top_labels)
        mistakes += 1

print('Accuracy:', (test.shape[0] - mistakes) / test.shape[0])

In [None]:
# Top-1 accuracy: a row is correct when the highest-ranked predicted label
# is one of its true tags.
mistakes = 0
for predicted, expected in zip(labels, test['tags']):
    best_guess = [label.replace('__label__', '') for label in predicted[0:1]]

    if any(guess not in expected for guess in best_guess):
        mistakes += 1

print('Accuracy:', (test.shape[0] - mistakes) / test.shape[0])