In [1]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [2]:
import numpy as np
import os
import nltk
import re
import pickle
from time import time
from gensim import corpora, models, similarities
from itertools import chain
from _datetime import datetime
from lxml import etree
import json
from nltk.corpus import stopwords as sp
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from math import log
from pprint import pprint as pp
from string import punctuation
from collections import defaultdict, deque, OrderedDict
import pandas as pd
import numpy
import logging
from sklearn.externals import joblib



**Парсинг дат**

In [3]:
import datetime

def parse_date(date_string, product_list=False):
    # '2009-01-01 08:25:07'
    # 2012-01-17T17:57:00-05:00
    # Friday November 10, 2006 10:49 pm PST
    # November 2, 2012 - product list
    if not product_list:
        if 'T' not in date_string:
            date = datetime.datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S').date()
        elif 'pst' in date_string.lower() or 'pdt' in date_string.lower():
            date = datetime.datetime.strptime(date_string[:-4], '%A %B %d, %Y %H:%M %p').date()
        else:
            date = datetime.datetime.strptime(date_string[:-6], '%Y-%m-%dT%H:%M:%S').date() 
    else:
        date = datetime.datetime.strptime(date_string, '%B %d, %Y').date()
    return date

def parse_dates(dates):
    return [parse_date(date) for date in dates]

**Препроцессинг**

In [200]:
def read_data(filename):
    data = pd.read_excel(filename)
    return data

with open('.\english_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [word.strip('\n') for word in f.readlines()]
# stopwords = list(set(sp.words('english') + stopwords))
# lemmatizer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
numbers = re.compile('[0-9]')
punctuation += '\n—–- «»\'\"'
lemmatization_stoplist = ['3gs', '4s', '5s', '6s', '4', '5', '3', '2', '7']
product_lemm_reg = re.compile("(iphone|ipad|ipod|macbook|[ie]mac|air|5c|classic|nano|tv|ibook|shuffle|touch|mini|pro|powerbook)(s|es)")

def lemmatize(text, allow_stopwords=False):
    if allow_stopwords:
        current_stopwords = []
    else:
        current_stopwords = stopwords[:]
    lemmatized = [lemmatizer.lemmatize(word).lower() 
                  if not word in lemmatization_stoplist else word.lower() 
                  for word in nltk.word_tokenize(text) 
                    if word.lower() not in current_stopwords + list(punctuation) 
#                   and (numbers.search(word) is None or word in lemmatization_stoplist)
                    and word not in ["''", ' ', '``', '', "'s"]
                 ]
    product_lemmatized = [product_lemm_reg.sub('\\1', word) for word in lemmatized]
    return product_lemmatized

def preprocess_text(text, split=None, allow_stopwords=False):
    if split == 'sentence':
        sentences = nltk.sent_tokenize(text)
        return [lemmatize(sent, allow_stopwords) for sent in sentences]
    elif split == 'paragraph_sentence':
        try:
            paragraphs = [nltk.sent_tokenize(par) for par in re.split('[\r\n]{2,}', text) if not par.strip() == '']
        except:
            print(text)
        return [[lemmatize(sent, allow_stopwords) for sent in sents if not sent.strip() == ''] for sents in paragraphs]
    elif split == 'paragraph':
        paragraphs = [lemmatize(par, allow_stopwords) for par in re.split('[\r\n]{2,}', text)]
        return paragraphs
    else:
        return lemmatize(text)

def pickle_serialize(obj, filename):
    with open(filename, 'wb') as f:
        f.write(pickle.dumps(obj))    
    
def make_corpus(filename, serialize=None):
    data = read_data(filename)
    dates = parse_dates(data['datetime'])
    preprocessed_docs =  [[id, date, preprocess_text(body, split='paragraph_sentence')] for (id, body), date in zip(enumerate(list(data['body'])), dates) if not pd.isnull(body) and not body.strip() == '']
    if serialize:
        pickle_serialize(preprocessed_docs, serialize)
    return preprocessed_docs

def load_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.loads(f.read())

Загрузим и предобработаем все корпуса. 
Затем склеим их в один большой корпус, чтобы построить по нему тематическую модель.

**Подготавливаем данные для построения тематической модели**

In [5]:
from itertools import chain

def make_batches(corpus ,test=False):
    if test:
        first_batch_len = 100
    else:
        first_batch_len = 2000
    batch_id = 0
    batches = []
    prev_date = corpus[0][1]
    current_batch = []
    for id, date, paragraphs in corpus:
        if batch_id != 0:
            current_batch.extend([list(chain(*sents)) for sents in paragraphs])
            if len(current_batch) >= 30:
                batches.append([date, current_batch])
                current_batch = []
        else:
            if len(current_batch) < first_batch_len:
                current_batch.extend([list(chain(*sents)) for sents in paragraphs])
            else:
                prev_date = date
                batch_id += 1
    return batches

def make_dictionary(texts, test=False):
    dictionary = corpora.Dictionary(texts)
    if test:
        dictionary.filter_extremes(no_below=1, no_above=1000, keep_n=None)
    else:
        dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=None)
    print(dictionary)
    return  dictionary

def make_bow_texts(batch, dictionary):
    bow_texts = [dictionary.doc2bow(text) for text in batch]
    return bow_texts

**Подготовим данные для построения "классификатора утечек"**

Для начала загрузим модель, построенную на всей коллекции с трех сайтов

In [6]:
# mcr_model = models.LdaModel.load('./topic_models/models/400_topics.model')
# mcr_dict = corpora.Dictionary.load('./topic_models/models/400_topics.model.id2word')
# mcr_model.id2word = mcr_dict
# all_bows = make_bow_texts(all_docs, mcr_dict)
# cc_bows = make_bow_texts(cc, mcr_dict)

Сделаем функцию, которая отфильтровывает документы,  в которых встретилсь топики с заданными номерами и записывает их в файл

In [7]:
def find_topics(filename, model, doc_bow, docs, topic_id_list):
    filtered_docs = []
    topic_id_list = set(topic_id_list)
    wiritten = 0
    i = 0
    for bow in log_progress(doc_bow):
#         if i and i % 1000 == 0: 
#             print(i)
        cur_topics = model.get_document_topics(bow, minimum_probability=0.1)
        cur_topic_ids = set([id for id, prob in cur_topics])
        intersection = topic_id_list.intersection(cur_topic_ids)
        if intersection:
            best_topic = list(max([(id, prob) for id , prob in cur_topics if id in intersection], key=lambda x: -x[1]))
            filtered_docs.append([i, ' '.join(docs[i])] + best_topic) 
        i += 1
    pd.DataFrame(sorted(filtered_docs, key=lambda x: -x[-1])).to_excel(filename, index=False, header=['id', 'text', 'best_topic_id', 'probability'])

In [8]:
# leak_data = pd.read_excel('./topic_models/mcr_leaks_data_full_docs.xlsx')
# leak_data= leak_data.loc[leak_data['best_topic_id'].isin([334, 350, 126, 52])]
# leak_data['class'] = 1

In [9]:
# leak_data_filtered = leak_data.loc[(leak_data['probability']<0.6), ['text', 'class']]
# leak_data_filtered.shape

In [10]:
# leaks_ids = leak_data.id
# # non_leak_data = pd.DataFrame([[' '.join(par), 0] for i, par in enumerate(cc) if not i in leaks_ids], columns=['text', 'class'])
# non_leak_data = pd.DataFrame([[' '.join(par), 0] for i, par in enumerate(all_docs) if not i in leaks_ids], columns=['text', 'class'])
# non_leak_data.shape

In [11]:
# df = pd.concat([leak_data_filtered, non_leak_data.sample(frac=0.06)])
# df = df.sample(frac=1).reset_index(drop=True)
# df.head()
# del(leak_data, leak_data_filtered, non_leak_data)

In [12]:
# X = [t if not pd.isnull(t) else '' for t in list(df['text'])]
# y = df['class']
# # assert X.shape == y.shape
# # X.shape, y.shape

In [13]:
# del(df)

**Построим классификтор утечек**

In [14]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.cross_validation import cross_val_predict
# from sklearn.metrics import classification_report
# from sklearn.linear_model.logistic import LogisticRegression
# from sklearn.naive_bayes import GaussianNB, BaseNB, BernoulliNB
# from sklearn.svm import SVC

In [15]:
# leaks_vectorizer = TfidfVectorizer()
# X_tfidf = leaks_vectorizer.fit_transform(X)
# del(X)

In [16]:
# clf = LogisticRegression(verbose=1,solver='liblinear')
# y_pred = cross_val_predict(clf, X_tfidf, y, verbose=1, cv=4)
# print(classification_report(y, y_pred))

In [17]:
# leaks_classifier = LogisticRegression(solver='liblinear')
# leaks_classifier.fit(X_tfidf, y)

In [18]:
# joblib.dump(leaks_classifier, 'leaks_classifier.pkl')
# joblib.dump(leaks_vectorizer, 'leaks_vectorizer.pkl')

In [19]:
# del(X, y, X_tfidf, y_pred)

In [20]:
leaks_classifier = joblib.load('leaks_classifier.pkl')
leaks_vectorizer = joblib.load('leaks_vectorizer.pkl')

def is_leak(text, preprocess=False):
    if preprocess:
        text = preprocess_text(text)
    tf_idf_text = leaks_vectorizer.transform([' '.join(text)])
    return leaks_classifier.predict(tf_idf_text)[0]

is_leak('report says that', preprocess=True)

1

**Теперь функции для поиска упоминаний продуктов в текстах**

In [317]:
import re
def make_product_tuple(product_full_name):
    name_cleaned = ''.join([char for char in product_full_name.lower() if not char in '()\'"&']).split()
    parts_to_remove = ['gb', '+', 'with', 'wcdma', 'cdma', 'gsm', '16', '32', '64', '128']
    name_cleaned = tuple([re.sub('([0-9])(rd|st|nd|th)', '\\1', word) for word in name_cleaned if not word in parts_to_remove])
    return name_cleaned

def make_product_regex(product_full_name, min_req):
    product_tuple = make_product_tuple(product_full_name)
    product_regex = ''
    full_product_regex = ''
    for i in range(len(product_tuple)):
        if i < min_req:
            product_regex += '\ ({})'.format(product_tuple[i])
        else:
            product_regex += '(\ *{})?'.format(product_tuple[i])
        full_product_regex += '\ ({})'.format(product_tuple[i])
    modifiers = """(?P<new_products>  # a group for the words reffering to new and upcoming products
            (?:new(?:est)?|(?:up)?coming|to\ *?be\ *?ann?ounced|updated?|
            (next\ *?-?\ *?(?:gen(?:eration)?|year)?))\ *?
       )?
       (?P<launched_products> # a group for the words reffering to the old or already launched products
           (?:(?: (current|last|previous)\ *?-?\ *?(?:gen(?:eration)?|year)?)
           |(?:this\ *?-?\ *?(?:gen(?:eration)?|year)?))\ *?
       )?"""
    full_product_regex = "(?P<full_name> {})|".format(full_product_regex.strip('\ '))
    product_regex = full_product_regex.strip('\\ ') + modifiers + product_regex.strip('\ ')
    return re.compile(product_regex.strip(), re.X)

def read_product_list(product_list_file):
    models = defaultdict(lambda: defaultdict(list))
    model2family = {}
    product_list = read_data(product_list_file)
    for i, (year, launched, date_launched, model_full, keywords, inter_req, model_group, family) in product_list.iterrows():
        product = make_product_regex(keywords, inter_req)
        date = parse_date(date_launched, True)
        model_group = ' '.join(model_group.split())
        model2family[model_group] = family
        if not product in models[date][model_group]:
            models[date][model_group].append((product, inter_req))
    return models, model2family

In [322]:
def find_mention(tokenized_text, regex):
    text_string = ' '.join(tokenized_text)
    matches = regex.finditer(text_string)
    return matches

def delete_intersections(product_mentions):
    pm = sorted(product_mentions, key=lambda x: (x[2].start(), -x[2].end()))
    first_mention = pm[0]
    filtered = [first_mention]
    p_product, p_launch_date, (p_start, p_end) = first_mention[0], first_mention[1], first_mention[2].span()
    for mention in pm[1:]:
        product, launch_date, (start, end) = mention[0], mention[1], mention[2].span()
        last_added_span = tuple(filtered[-1][-1].span())
        if product != p_product and (end > p_end or start > p_end or (start, end) == last_added_span):
            filtered.append(mention)
    return filtered

def sort_by_date(product_mentions, article_date):
    launched_products =  []
    upcoming_products =  []
    for mention in product_mentions:
        product, launch_date, match = mention
        has_new_word = match.group('new_products')
        has_launched_word = match.group('launched_products')
        is_full_name = match.group('full_name')
#         print(is_full_name)
        if launch_date >= article_date:
            if not has_launched_word and has_new_word:
                upcoming_products.append(mention)
            elif is_full_name:
                upcoming_products.append(mention)
        elif launch_date < article_date:
            if not has_new_word and has_launched_word:
                launched_products.append(mention)
            elif (article_date - launch_date).days < 50 and is_full_name:
                upcoming_products.append(mention)
            else:
                if (article_date - launch_date).days < 50 and (has_new_word or is_full_name):
                    upcoming_products.append(mention)
    return launched_products, upcoming_products

def get_unique_mentions(mentions):
    return list(set([m[0] for m in mentions]))

def get_earliest(mentions, m2f):
    family2model = defaultdict(list)
    for mention in mentions:
        if mention:
            family = m2f[mention[0]]
            family2model[family].append(mention)
    earliest = [min(mentions, key=lambda x: x[1]) for family, mentions in family2model.items()]
    return earliest
        
def search_products(tokenized_sents, product_list, article_date):
    marked_text = ''
    launched_products =  []
    upcoming_products =  []
    for sent in tokenized_sents:
        mentions = []
        for launch_date in product_list:
            if -365 < (article_date - launch_date).days < 365:
                for product, regexs in product_list[launch_date].items():
                    for regex, _ in regexs:
#                         if '5' in regex.pattern: print(regex.pattern)
                        search_results = find_mention(sent, regex)  
                        mentions.extend([[product, launch_date] + [sr] for sr in search_results])
        if mentions:
            mentions = delete_intersections(mentions)
            cur_launched_products, cur_upcoming_products = sort_by_date(mentions, article_date)
            launched_products.extend(cur_launched_products)
            upcoming_products.extend(cur_upcoming_products)
    return launched_products, upcoming_products

# import datetime
apple_product_list, apple_m2f = read_product_list('.\\DATA\\Apple_produt_list_withour_dicsontinuation.xlsx')
text = "new iPhone 5"
launched, upcoming = search_products(preprocess_text(text, split='sentence'), apple_product_list, datetime.date(2012, 9, 1))
print(preprocess_text(text, split='sentence'))
print(upcoming)
print(launched)
# get_earliest(upcoming, apple_m2f)

[['new', 'iphone', '5']]
[['iPhone 5', datetime.date(2012, 9, 21), <_sre.SRE_Match object; span=(0, 12), match='new iphone 5'>]]
[]


In [316]:
text = preprocess_text("new iPhone 5", split='sentence')
print(" ".join(list(chain(*text))))
regex = """(?P<new_products>  # a group for the words reffering to new and upcoming products
            (?:new(?:est)?|(?:up)?coming|to\ *?be\ *?ann?ounced|updated?|
            (next\ *?-?\ *?(?:gen(?:eration)?|year)?))\ *?
       )?
       (?P<launched_products> # a group for the words reffering to the old or already launched products
           (?: (current|last|previous)\ *?-?\ *?(?:gen(?:eration)?|year)?)
           |(?:this\ *?-?\ *?(?:gen(?:eration)?|year)?)\ *?
       )?(iphone)(\ *5)?|(?P<full_name> (iphone)\ (5))
            """
test = re.compile(regex, re.X)
for match in test.finditer(" ".join(list(chain(*text)))):
    print(match)
    print(match.group('full_name'))

new iphone 5
<_sre.SRE_Match object; span=(0, 12), match='new iphone 5'>
None


Наш список продуктов

In [323]:
apple_product_list, apple_m2f = read_product_list('.\\DATA\\Apple_produt_list_withour_dicsontinuation.xlsx')

def find_apple_products(tokenized_sents, article_date):
    return [get_earliest(i, apple_m2f) for i in search_products(tokenized_sents, apple_product_list, article_date)]
apple_product_list

defaultdict(<function __main__.read_product_list.<locals>.<lambda>>,
            {datetime.date(2000, 2, 16): defaultdict(list,
                         {'PowerBook ("Pismo")': [(re.compile(r'(?P<full_name> (powerbook)\ (pismo))|(?P<new_products>  # a group for the words reffering to new and upcoming products\n            (?:new(?:est)?|(?:up)?coming|to\ *?be\ *?ann?ounced|updated?|\n            (next\ *?-?\ *?(?:gen(?:eration)?|year)?))\ *?\n       )?\n       (?P<launched_products> # a group for the words reffering to the old or already launched products\n           (?:(?: (current|last|previous)\ *?-?\ *?(?:gen(?:eration)?|year)?)\n           |(?:this\ *?-?\ *?(?:gen(?:eration)?|year)?))\ *?\n       )?(powerbook)\ (pismo)',
                            re.UNICODE|re.VERBOSE),
                            3)]}),
             datetime.date(2000, 7, 19): defaultdict(list,
                         {'Power Mac G4 Cube': [(re.compile(r'(?P<full_name> (power)\ (mac)\ (g4)\ (cube))|(?P<new_pro

**Анализ тональности**

In [24]:
from scipy.sparse import hstack
from pprint import pprint as pp
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.cross_validation import  cross_val_score, cross_val_predict
from sklearn.metrics import classification_report
from nltk.corpus import stopwords as sp
from nltk.stem import SnowballStemmer
from string import punctuation

In [25]:
stemmer = SnowballStemmer('english')

def stem(text):
    tokens = [word.strip(punctuation) for word in text.lower().split() if not word in stopwords]
    stems = [stemmer.stem(token) for token in tokens]
    return stems

In [26]:
# print('Reading data...')
# df_iphones = pd.read_csv('./DATA/Amazon.csv')
# df_laptops = pd.read_csv('./DATA/AmazonAppleLaptops.csv')
# df_ipads = pd.read_csv('./DATA/AmazonAppleiPads.csv')
# df_macs = pd.read_csv('./DATA/AmazonAppleMacs.csv')
# df = pd.concat([df_iphones, df_laptops, df_ipads, df_macs])

# print('Preprocessing...')
# texts = [stem(text) if not pd.isnull(text) else [''] for text in list(df.body)]

# print('Making tf-idf...')
# SA_vectorizer = TfidfVectorizer(lowercase=False, max_df=0.4, min_df = 20, ngram_range=(1,4), use_idf=False)
# X = SA_vectorizer.fit_transform([' '.join(text) for text in texts])
# y = [1 if int(star[0]) >= 4 else 0 for star in df.stars]

In [27]:
# SA_clf = LogisticRegression()
# SA_clf.fit(X, y)
# del(df_ipads, df_iphones, df_laptops, df_macs, df, X, y)

In [28]:
# joblib.dump(SA_clf, './sentiment_analysis/SA_classifier.pkl')
# joblib.dump(SA_vectorizer, './sentiment_analysis/SA_vectorizer.pkl')

In [29]:
SA_clf = joblib.load('./sentiment_analysis/SA_classifier.pkl')
SA_vectorizer = joblib.load('./sentiment_analysis/SA_vectorizer.pkl')

def get_sentiment(text, preprocess=False):
    if preprocess:
        text = stem(text)
    return SA_clf.predict(SA_vectorizer.transform([' '.join(text)]))[0]

# get_sentiment('This iphone is bad', preprocess=)

**Выделение аттрибутов устройств**

In [30]:
with open('product_attributes_taxonomy.txt', 'r', encoding='utf-8') as f:
    taxonomy_text = f.read()

In [31]:
import json
import pickle
from pprint import pprint as pp
from collections import defaultdict as dd, OrderedDict as od
import re

taxonomy = od()
prev_tab_level = 0
parent = ''
prev_line = ''
path = []
for line in taxonomy_text.split('\n'):
    line = line.strip(' ')
    tab_level = line.count('\t')
    if tab_level > prev_tab_level:
        path.append(parent)    
        parent = prev_line.split()[0]
    elif tab_level < prev_tab_level:
        parent = path[tab_level - prev_tab_level]
        path = path[:tab_level - prev_tab_level]
    for word in line.strip('\t').split(' '):
        if not tab_level == 0:
            taxonomy[tuple(word.lower().split('-'))] = path[1:] + [parent]
        else:
            taxonomy[tuple(word.lower().split('-'))] = [line.split()[0]]
    prev_line = line
    prev_tab_level = tab_level

pp((taxonomy))

with open('product_attributes_taxonomy.pickle', 'wb') as f:
    f.write(pickle.dumps(taxonomy))


OrderedDict([(('screen',), ['SCREEN']),
             (('display',), ['SCREEN']),
             (('type',), ['SCREEN', 'DISPLAY']),
             (('led',), ['SCREEN', 'DISPLAY', 'TYPE']),
             (('oled',), ['SCREEN', 'DISPLAY', 'TYPE', 'LED']),
             (('lcd',), ['SCREEN', 'DISPLAY', 'TYPE']),
             (('tft',), ['SCREEN', 'DISPLAY', 'TYPE', 'LCD']),
             (('ips',), ['SCREEN', 'DISPLAY', 'TYPE', 'LCD']),
             (('resolution',), ['SCREEN', 'DISPLAY']),
             (('960x640',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('1136x640',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('1335x750',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('1920x1080',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('1048x768',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('2048x1535',), ['SCREEN', 'DISPLAY', 'RESOLUTION']),
             (('inch', 'size'), ['SCREEN', 'DISPLAY']),
             (('3,5',), ['SCREEN', 'DISPLAY', 'INCH-SI

In [32]:
def make_regex(taxonomy_dict):
    regex = ''
    for entry in sorted(taxonomy_dict.keys(), key=lambda x: (x[0], -len(x))):
        regex += '[ -]+'.join(entry) + '|'
    return re.compile(regex.strip('|'))

product_feature_regex = make_regex(taxonomy)
product_feature_regex.findall('The battery life and the picture quality of the camera sensor as well as the display ppi')

['battery life', 'picture quality', 'camera', 'sensor', 'display', 'ppi']

In [33]:
def get_taxonomy_paths(text, regex, taxonomy):
    taxonomy_paths = []
    search_results = regex.findall(text)
    for sr in search_results:
        taxonomy_paths.append(taxonomy[tuple(re.split('[- ]+', sr))])
    return taxonomy_paths

In [34]:
def get_apple_features(text):
    if type(text) == list:
        text = ' '.join(text)
    return get_taxonomy_paths(text, product_feature_regex, taxonomy)

**Собираем все вместе**

In [194]:
mcr = pd.read_csv('./DATA/MacRummors_with_comments.csv')
nfm = read_data('./DATA/NineToFiveMac.xlsx')
app = read_data('./DATA/AppleInsider.xlsx')
all_data = pd.concat([mcr, nfm ,app])
all_data.datetime = parse_dates(list(all_data.datetime))
all_data = all_data.sort_values('datetime')
# del(mcr, nfm, app)
sample = all_data.fillna('').loc[(datetime.date(2012, 5, 1) < all_data.datetime) & (all_data.datetime < datetime.date(2012, 12, 31))]
sample

Unnamed: 0,author,body,comments,datetime,title
3284,Eric Slivka,The Next Web points to a recent thread in the ...,And this is good for users how?|||This sucks. ...,2012-05-02,iOS Apps with Dropbox Integration Running Afou...
3285,Jordan Golson,Where's that one guy that creates an App that ...,I wonder why everyone is so eager to copy Drop...,2012-05-02,Amazon Releases Mac App to Access Cloud Drive ...
3286,Eric Slivka,Network World shares a rather unique internal ...,"I'm glad I grew up in the 90's, the 80's sound...",2012-05-02,Apple's 1984 Internal Inspirational Video with...
5273,Jordan Kahn,Following Apple’s CEO Tim Cook selling off app...,,2012-05-02,Apple’s iOS chief Scott Forstall cashes in sha...
5272,Élyse Betters,Amazon just launched its Cloud Drive app for M...,,2012-05-02,Amazon releases Cloud Drive desktop app for Ma...
3287,Jordan Golson,question for those who run with their phones:\...,Cool.|||I dont like e-ink for reading. Gray o...,2012-05-02,RunKeeper Is Pebble's First Third-Party Partner
3288,Eric Slivka,With many hoping that Apple will undertake a s...,That's a shame. The full interview is interest...,2012-05-02,Apple Unlikely to Use Liquidmetal Alloys as Ma...
3289,Jordan Golson,Just when you think EA couldn't sink any lower.,Just when you think EA couldn't sink any lower...,2012-05-02,"EA Killing Rock Band for iPhone, Game Will Be ..."
3283,Eric Slivka,"\r\nStreaming music service Spotify, which ga...",Great work by @tobiasahlin and the team.|||Nic...,2012-05-02,Spotify's Long-Awaited iPad App Debuts in App ...
8531,AppleInsider,A new analysis claims the iPhone made up 30.7 ...,,2012-05-02,Apple&#039;s iPhone climbs to 31% share of US ...


In [195]:
sample_leaks = sample[sample['body'].map(lambda x: bool(is_leak(x, preprocess=True)))]

In [196]:
sample_leaks.shape

(585, 5)

In [197]:
titles_vectorizer = TfidfVectorizer(max_df=0.6)
titles_tf_df = titles_vectorizer.fit_transform(sample_leaks.title)
# titles_tf_df = titles_vectorizer.fit(all_data.fillna(''), title)

from sklearn.metrics.pairwise import cosine_similarity
class Event:
    def __init__(self, title, comments, start_date):
        self.start_date =  start_date
        self.end_date = self.start_date
        self.titles = [title]
        self.stemmed_titles = [' '.join(stem(title))]
        self.features = defaultdict(lambda: defaultdict(int))
        self.overall_sentiment = defaultdict(int)
        self.analyze_comments(comments.split('|||'))     
        
    def is_similar(self, current_title, date):
        if abs((date - self.start_date).days) <= 10:
            current_title = ' '.join(current_title.split())
            current_stemmed = [' '.join(stem(current_title))]
            X = titles_vectorizer.transform(current_stemmed)
            y = titles_vectorizer.transform(self.stemmed_titles)
            similarity = cosine_similarity(X, y)
            if np.mean(similarity) >= 0.20:
                return np.mean(similarity)
        
        
    def add_if_similar(self, current_title, comments, date):
        if abs((date - self.start_date).days) <= 10:
            current_title = ' '.join(current_title.split())
            current_stemmed = [' '.join(stem(current_title))]
            X = titles_vectorizer.transform(current_stemmed)
            y = titles_vectorizer.transform(self.stemmed_titles)
            similarity = cosine_similarity(X, y)
            if np.mean(similarity) >= 0.20:
                self.stemmed_titles.append(current_stemmed[0])
                self.titles.append(current_title)
                self.end_date = date
                self.analyze_comments(comments.split('|||'))
                return True
            return False
        return False
    
    def get_features_sentiment(self, preprocessed_text):
        for sentence in list(chain(*preprocessed_text)):
            sentiment = get_sentiment(' '.join(sentence), preprocess=True)
            features = get_apple_features(' '.join(sentence))
            for feature in features:
                self.features[feature[0]][sentiment] += 1
                self.overall_sentiment[sentiment] += 1
                
    def analyze_comments(self, comments):
        for comment in comments[:20]:
            comment_preprocessed = preprocess_text(comment, split='paragraph_sentence')
            self.get_features_sentiment(comment_preprocessed)
                
# ev = Event('Iphone launch', 'iPhone camera is great|||It is dull|||I hate thew battery', datetime.date(2009, 1, 1))
# ev.add_if_similar('Iphone launched new', 'iPhone camera is dull|||It is good|||I love thew battery', datetime.date(2010, 1, 1))
# ev.features

In [324]:
import datetime

product_data = defaultdict(lambda: defaultdict(lambda: defaultdict()))
for i, row in log_progress(sample_leaks.iterrows(), every=1):
#     sents = preprocess_text(row.body, split='sentence')
    sents = preprocess_text(row.title, split='sentence')
    print(row.title, row.datetime)
    lpm, upm = find_apple_products(sents, row.datetime)
    print(upm)
    for product in get_unique_mentions(upm):
        if not '[Updated]' in row.title:
            is_added = False
            similarity_list = {}
            for title, event in product_data[product].items():
                if title != row.title:
                    similarity = event.is_similar(row.title, row.datetime)
                    if similarity:
                        similarity_list[title] = similarity
            if similarity_list:
                best_match = max(similarity_list.items(), key=lambda x: x[1])[0]
                is_added = product_data[product][best_match].add_if_similar(row.title, row.comments, row.datetime)
            if not is_added:
                product_data[product][row.title] = Event(row.title, row.comments, row.datetime)

Apple Unlikely to Use Liquidmetal Alloys as Major Design Material for Several Years 2012-05-02
[]
Purported next-gen iPhone SIM tray appears similar to current design 2012-05-02
[['iPhone 5', datetime.date(2012, 9, 21), <_sre.SRE_Match object; span=(10, 25), match='next-gen iphone'>]]
Energizer Introduces New Wrap-Around iPhone Chargers 2012-05-03
[]
More Claims of Taller, Thinner Next-Generation iPhone with 4-Inch Screen and New Dock Connector 2012-05-03
[['iPhone 5', datetime.date(2012, 9, 21), <_sre.SRE_Match object; span=(22, 44), match='next-generation iphone'>]]
Rumor: Apple&#039;s next iPhone will be 2mm thinner with 4&quot; screen, metal back 2012-05-03
[['iPhone 5', datetime.date(2012, 9, 21), <_sre.SRE_Match object; span=(16, 27), match='next iphone'>]]
Touch panel shipments for Apple&#039;s iPhone expected to drop 15-20% in Q2 2012 2012-05-04
[]
Apple files claim to obtain iphone5.com domain 2012-05-06
[]
Apple Seeks to Gain Control of iPhone5.com Domain 2012-05-06
[]
Apple 

In [203]:
for product, events in product_data.items():
    if 'iphone' in product.lower():
        print('\n================================================\n')
        print(product)
        for title, event in events.items():
            if len(event.titles) >= 2:
                print(event.start_date, event.end_date)
                pp(event.titles)
                pp(event.features)
                print('----------------------------------------------------\n')



iPhone 5S


iPhone 5
2012-08-06 2012-08-12
['Claimed SIM Tray for Next-Generation iPhone Points to Nano-SIM Standard',
 'New Photos of Claimed Next-Generation iPhone Parts Include Display Shield',
 'New photos of rumored next-gen iPhone display shielding surface online',
 'Alleged next-gen iPhone logic board revealed in new photos',
 'Photos of alleged next-generation iPhone motherboard surface, point to new '
 'antennas, battery']
defaultdict(<function Event.__init__.<locals>.<lambda> at 0x00000000196221E0>,
            {'CAMERA': defaultdict(<class 'int'>, {0: 1, 1: 2}),
             'DESIGN': defaultdict(<class 'int'>, {0: 1}),
             'SCREEN': defaultdict(<class 'int'>, {0: 2, 1: 4}),
             'SOFTWARE': defaultdict(<class 'int'>, {0: 3, 1: 10}),
             'SPECS': defaultdict(<class 'int'>, {0: 3, 1: 12})})
----------------------------------------------------

2012-07-16 2012-07-16
['Purported next-gen iPhone front panel has centered FaceTime camera',
 'Claimed Fro

In [363]:
colors = ["lightgreen", "Blue", "BlueViolet", "Brown", "BurlyWood", "CadetBlue", "Chartreuse", "Chocolate", "Coral", "CornflowerBlue", "Cornsilk", "Crimson", "Cyan", "DarkBlue", "DarkCyan", "DarkGoldenRod", "DarkGray", "DarkGrey", "DarkGreen", "DarkKhaki", "DarkMagenta", "DarkOliveGreen", "DarkOrange", "DarkOrchid", "DarkRed", "DarkSalmon", "DarkSeaGreen", "DarkSlateBlue", "DarkSlateGray", "DarkSlateGrey", "DarkTurquoise", "DarkViolet", "DeepPink", "DeepSkyBlue", "DimGray", "DimGrey", "DodgerBlue", "FireBrick", "FloralWhite", "ForestGreen", "Fuchsia", "Gainsboro", "GhostWhite", "Gold", "GoldenRod", "Gray"]

In [42]:
import os
print(os.environ.get('QT_API'))

None


In [399]:
import matplotlib.pyplot as plt
# ['cairo', 'WebAgg', 'nbAgg', 'GTKCairo', 'WX', 'GTKAgg', 'Qt4Agg', 'Qt5Agg', 'emf', 
# 'ps', 'svg', 'CocoaAgg', 'pdf', 'TkAgg', 'GTK', 'gdk', 'WXAgg', 'pgf', 'GTK3Agg', 'GTK3Cairo', 'agg', 'MacOSX', 'template']
plt.switch_backend('Qt5Agg')
# %matplotlib qt
from scipy.interpolate import spline
from matplotlib.lines import Line2D
from itertools import product
import matplotlib.font_manager as font_manager
import matplotlib.dates
from matplotlib.dates import MONTHLY, WEEKLY, DAILY, DateFormatter, rrulewrapper, RRuleLocator
from pprint import pformat as pf
    
def visualize_events(product_data):
    fig = plt.figure(figsize=(22, 11))
    ax = fig.add_subplot(111)
    pos = []
    leg_colors = []
    legend = []
    topics = []
    id = 1
    perv_n = 0
    sc = {0: 'red', 1: 'blue'}
    shift = 0
    first = 'Negative sentiments percent'
    
#     product_data = sorted(product_data.items(), key=lambda x: (x[0], x[1].start_date, -x[1].end_date))
    for i, product in enumerate(product_data):
        if 'iphone' in product.lower():
            if product_data[product]:
                for topic, event in sorted(product_data[product].items(), key=lambda x: (x[1].start_date, x[1].end_date)):
                   
                    if len(event.titles) >= 2 and event.features and re.search("launch|event|keynote", topic.lower()) is None:
                        print(re.search("launch|event|keynote", topic.lower()), topic)
#                         (datetime.date(2012, 9, 1) <= event.start_date <= datetime.date(2012, 12, 1)) and   
                        if product not in legend:
                            leg = product
                            legend.append(product)

                        n_features = len(event.features)                    
#                         y_position = id + shift + n_features * 0.2
                        y_position = id + 2
                        shift += n_features * 0.4
                        base_width = (event.end_date - event.start_date).days + 1
                        if event.features:
                            percent_bad = event.overall_sentiment[0] / (event.overall_sentiment[0] +  event.overall_sentiment[1])
#                             print(percent_bad)
                            ax.barh(bottom= y_position, 
                                    width=base_width * percent_bad, 
                                    left=event.start_date, 
#                                     height=n_features * 0.4, 
                                    height= 0.7, 
                                    align='center', 
    #                                 edgecolor = 'green', # colors[i].lower(), 
                                    color  = 'red', 
#                                     linewidth = 3,
                                    label = first,
                                    alpha = 1)
                            first =  None    
                
                
                        ax.barh(bottom= y_position, 
                                width = base_width, 
                                left=event.start_date, 
#                                 height=n_features * 0.4,
                                height= 0.7,
                                
                                align='center', 
                                color= colors[i].lower(), 
#                                 color  = 'lightgreen', 
#                                 linewidth = 3,
                                alpha = 0.5, 
                                label = leg)
                        ax.text(event.start_date, y_position + 0.15, topic.lower() + '  ', fontsize=10, clip_on=True, ha='right')
                        
                            

                        f_id = 0
#                         for feature, sentiments in event.features.items():
#                             y_pos = y_position - (n_features * 0.2) + 0.1 + f_id * 0.4
#                             ax.text(event.start_date, y_pos, feature.lower(), fontsize=10, clip_on=True)  #TEXT
#                             f_id += 1
#                             for o, (sentiment, count) in enumerate(sentiments.items()): 
#                                 summed = sum(list(sentiments.values()))
#                                 ax.barh(bottom= y_pos - 0.1 + o * 0.1, #FEATURES
#                                         width=((event.end_date - event.start_date).days + 1)/2 * count / summed, 
#                                         left= event.start_date, 
#                                         height= 0.05, 
#                                         align='center', 
#                                         color= sc[sentiment], 
#                                         alpha = 0.3)


        #                 ax.annotate('local max', xy=(event.start_date, (id*0.5)+1.0), xytext=(3, 1.5),arrowprops=dict(facecolor='black', shrink=0.05), clip_on=True)
                        pos.append(y_position)
                        topics.append(re.sub("&#(\d+);", lambda m: chr(int(m.group(1))), topic))
                        id+=1
                        leg = None
                        perv_n = n_features
                
    locsy, labelsy = plt.yticks(pos, topics)
    plt.setp(labelsy, fontsize = 14)

    # Format the x-axis

    ax.axis('tight')
    ax.grid(color = 'g', linestyle = ':')

    ax.xaxis_date() #Tell matplotlib that these are dates...

    rule = rrulewrapper(WEEKLY
                        , interval=1)
#     rule = rrulewrapper(MONTHLY, interval=1)
    loc = RRuleLocator(rule)
    formatter = DateFormatter("%d-%m-%y")
#     formatter = DateFormatter("%b-%y")

    ax.xaxis.set_major_locator(loc)
    ax.xaxis.set_major_formatter(formatter)
    
    labelsx = ax.get_xticklabels()
    plt.setp(labelsx, rotation=30, fontsize=16)

    # Format the legend
                
    ax.invert_yaxis()
    ax.legend(loc='upper right',  prop={'size':12})
    fig.autofmt_xdate()
#     plt.savefig('gantt.png')
    plt.tight_layout()
    plt.show()
#    
    
visualize_events(product_data)

None More Claims of Taller, Thinner Next-Generation iPhone with 4-Inch Screen and New Dock Connector
None 'iPhone 5' Headphone Jack and Earpiece Component Surfaces
None Apple Ordering Screens of 'At Least 4 Inches' for Next-Generation iPhone
None Steve Jobs worked closely on next iPhone with larger screen, says Bloomberg
None Parts show alleged next-gen iPhone cameras, 4.1&quot; iPod touch front panel
None Claimed Rear Shell with Sides for Next-Generation iPhone Surfaces [Updated x2]
None New Photos of Next-Generation iPhone 'Engineering Sample' Massing
None Purported next-gen iPhone front panel has centered FaceTime camera
None WSJ reaffirms Apple&#039;s next iPhone will feature thinner in-cell touchscreen
None Next-Generation iPhone with LTE, NFC, and 1 GB RAM Reportedly Still in Engineering Testing
None Report: Apple to sell 30-pin adapter for new iPhone’s smaller 19-pin dock connector
None Photos of alleged fully-assembled next-gen iPhone surface
None Sharp President Confirms Shipm