In [329]:
import pandas as pd
import numpy as np
import sklearn as sk
import itertools 
import functools
import nltk
import sklearn.feature_extraction.text as fex
import scipy.sparse
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score, precision_score,recall_score, median_absolute_error, r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import regression
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
import pymorphy2
import re

def get_pos_columns(df):
    return df[['noun', 'adjf', 'advb', 'verb', 'PROCESSED', 'PRODUCT', 'NAME', 'CATEGORY_ID', 
              'BRAND_ID', 'MYCATEGORY', 'CATEGORY_NAME', 'good_bad_ugly', 'BENEFITS', 'DRAWBACKS', 'RECOMMENDED',
              'LIKES_COUNT', 'DISLIKES_COUNT', 'RATING']]


def get_vocabulary_for_category(comment_by_category, min_df, max_df):
    words_by_cat = {k: df_comment_to_normal_words(v, [prepate_nots]) for k, v in comment_by_category.items()}
    vectorizers = {k: fex.TfidfVectorizer(min_df=min_df,
                                    max_df=max_df,
                                    sublinear_tf=True,
                                    use_idf=True) for k, v in words_by_cat.items()} 

    ready_vectorizers = {k: vectorizers[k].fit_transform(words_by_cat[k]) for k in words_by_cat.keys()}

    stop_words_ext = stop_words.union({'пылесос', 'это', 'очень', 'моет', 'посуду', 'это', 'очень', 'холодильник', 'очень', 'работает', 'это'
                     'холодильник', 'это', 'очень', 'работает', 'телефон', 'машинка', 'машинки', 'телевизор', 'стирает',
                                      'машинку', 'качество', 'ноутбук'})

    return {k: [i for i in sorted(v.vocabulary_.keys()) if i not in stop_words_ext] for k,v in vectorizers.items()}

def split_df_by_category(df, column_name):
    return {v: df[df[column_name]==v] for v in df[column_name].value_counts().keys()}

def classification(train, min_df, max_df, inp, out):
    vectorizer = fex.TfidfVectorizer(min_df=min_df, max_df=max_df, sublinear_tf=True, use_idf=True)
    words = [str(i) for i in train[inp]]
    rating = np.array(train[out])
    
    div_ratio = 0.9
    train_words, test_words = np.split(words, [int(div_ratio*len(words))])
    train_rating, test_rating = np.split(rating, [int(div_ratio*len(rating))])
    train_vectors = vectorizer.fit_transform(train_words).todense()
    test_vectors = vectorizer.transform(test_words).todense()

    classifier = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1,
                                       min_samples_leaf=int(len(train_rating)/2000))
    classifier.fit(train_vectors, train_rating)
    
    return classifier, vectorizer, train_vectors, test_vectors, train_rating, test_rating

def nan_or_importance(word, key, importance):
    imp = {k: v for k,v in importance[key]}
    keys = imp.keys()
    return np.nan if word not in keys else imp[word]


In [330]:
stop_words_df = pd.read_csv('/home/misha/stop_words.csv')
stop_words = set([str(w) for w in stop_words_df.word])

df_1 = pd.read_csv('/home/misha/Downloads/POS1.csv')
df_2 = pd.read_csv('/home/misha/Downloads/POS2.csv')
df_3 = pd.read_csv('/home/misha/Downloads/POS3.csv')
# Собирается датасет из частей (части считал отдельно в документах тк не паралелится)
df = pd.concat([get_pos_columns(df) for df in [df_1, df_2, df_3]])
# Словарь название товара: ключ


In [331]:
# разбиваем части речи по категориям
noun_by_category = {k: v.noun for k, v in split_df_by_category(df, 'good_bad_ugly').items()}
adjf_by_category = {k: v.adjf for k, v in split_df_by_category(df, 'good_bad_ugly').items()}
advb_by_category = {k: v.advb for k, v in split_df_by_category(df, 'good_bad_ugly').items()}
verb_by_category = {k: v.verb for k, v in split_df_by_category(df, 'good_bad_ugly').items()}

# создаем словари категория: набор слов
voc_nouns = get_vocabulary_for_category(noun_by_category, 0.002, 0.75)
voc_adjfs = get_vocabulary_for_category(adjf_by_category, 0.002, 0.75)
voc_advbs = get_vocabulary_for_category(advb_by_category, 0.002, 0.75)
voc_verbs = get_vocabulary_for_category(verb_by_category, 0.002, 0.75)

category_keys = [i for i in df.MYCATEGORY.value_counts().keys()]
dict_itemnum = {v: k for k, v in enumerate(df.NAME.value_counts().keys())}

# категории 0-1 в плохой-хороший
df['mark'] = df.good_bad_ugly.apply(lambda x: 0 if 'bad' in x else 1)

# создаем датафрейм для обучения лесов, чтобы затем найти самые важные фичи. 
# Отдельный лес обучается на каждой группе товаров
df_by_category = {key: df[df.MYCATEGORY==key][['noun', 
                                               'mark',
                                               'RATING',
                                               'good_bad_ugly',
                                               'NAME',
                                               'CATEGORY_ID']]
                  for key in category_keys}

In [323]:
category_features = {}
for key in category_keys:
    c, v, tr_v, te_v, tr_e, te_e = classification(df_by_category[key], 0.01, 0.4, 'noun', 'mark')
    category_features[key] = (c, v,  tr_v, te_v, tr_e, te_e)
    
prediction_fines = {}
for key in category_keys:
    c, v, tr_v, te_v, tr_e, te_e = category_features[key]
    p = c.predict(te_v)
    prediction_fines[key] = len([p[i] for i in range(len(te_e)) if p[i]==te_e[i]])/len(te_e)
    
feature_importances_by_category = {}
for key in category_keys:
    c = category_features[key][0]
    v = category_features[key][1]
    feat_imp = c.feature_importances_
    vocab = v.vocabulary_
    importance = {k: feat_imp[vocab[k]] for k in vocab.keys() }
    importance_pairs = sorted([(k,v)for k,v in importance.items()], key=lambda x: x[1])
    feature_importances_by_category[key] = importance_pairs
    

In [324]:
# собрать все слова для всех товаров в множество
word_set = set([item[0] for sublist in feature_importances_by_category.values() for item in sublist])

# проставить в колонки для разных товаров nan (если слово не встречается) или важность слова
df_ = pd.DataFrame()
df_['feature'] = pd.Series([w for w in word_set])
for key in category_keys:
    df_[key] = pd.Series([nan_or_importance(word, key, feature_importances_by_category) for word in word_set])
    
# Отметить фичи, которые встречаются во всех категориях, нормализовать на 0-100
df_['allin'] = df_.notnull().all(axis=1)
for key in category_keys:
    df_[key] = (df_[key] - df_[key].min()) / (df_[key].max() - df_[key].min()) * 100

#выбросиь такие фичи
#df_clean = df[df.allin != True]
df_ = df_.fillna(0)

df_.head()

Unnamed: 0,feature,tv,phone,laptop,cleaner,fridge,washmachine,dishwasher,allin
0,соня,7.563169,0.0,0.0,0.0,0.0,0.0,0.0,False
1,симка,0.0,5.345924,0.0,0.0,0.0,0.0,0.0,False
2,тормоз,4.711287,0.0,1.303652,0.0,0.0,0.0,0.0,False
3,жена,3.390638,1.257638,0.0,3.067374,0.662916,1.234252,3.447004,False
4,параметр,0.0,0.0,5.137151,0.0,0.0,0.0,0.0,False


In [228]:
import lime
import sklearn.pipeline
from lime.lime_text import LimeTextExplainer

In [285]:
c, v, tr_v, te_v, tr_e, te_e = classification(df_by_category['laptop'], 0.01, 0.4, 'adjf', 'mark')


In [332]:
import lime
import sklearn.pipeline
from lime.lime_text import LimeTextExplainer

cv = sklearn.pipeline.make_pipeline(v, c)
explainer = LimeTextExplainer(class_names=class_names)

def word_weight(cv, explainer, words):
    """word - текст (строка)"""
    exp = explainer.explain_instance(words, cv.predict_proba, num_features=100)
    return {k:v for k,v in exp.as_list}
    

In [312]:
class_names=[0,1]
explainer = LimeTextExplainer(class_names=class_names)

cv = sklearn.pipeline.make_pipeline(v, c)

words = df_by_category['laptop'].adjf
words = [str(w) for w in words if w != np.nan and len(str(w)) > 0 ]

exp_list = []
for idx in range(4, 10):
    exp_list.append(word_weight(cv, explainer, words[idx]))    

  return _compile(pattern, flags).split(string, maxsplit)


ValueError: low >= high

In [325]:
import pickle

with open('noun_classifiers.pkl', 'wb') as fid:
    pickle.dump({k: (v[0],v[1]) for k,v in category_features.items()}, fid)


In [None]:
# в файл, если надо
for key in category_keys:
    df = pd.DataFrame.from_records(feature_importances_by_category[key], columns=['feature', 'importance'])
    df.to_csv('/home/misha/Downloads/{}.csv'.format(key))