In [1]:
import numpy as np
import nltk
import re
from time import time
from gensim import corpora, models, similarities
from itertools import chain
from _datetime import datetime
from lxml import etree
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from math import log
from pprint import pprint as pp
from string import punctuation
from collections import defaultdict, deque
import pandas as pd
import numpy



In [2]:
numbers = re.compile('[0-9]')
punctuation += '\n—–- «»\'\"'

In [3]:
def read_data(filename):
    """Reads the given file and creates a generator object returning one line at a time split by tabulation"""
    data = pd.read_excel(filename)
    return data

In [4]:
with open('.\expectations_meter\english_stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [word.strip() for word in f.readlines()]

lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    return [lemmatizer.lemmatize(word).lower() for word in nltk.word_tokenize(text) 
            if word not in stopwords + list(punctuation) and numbers.search(word) is None
            and word not in ["''", ' ', '``', '', "'s"]
            ]

def preprocess(text):
    # sentences = nltk.sent_tokenize(text)
    tokens = lemmatize(text.replace('\n', ' '))
    # [tokens.extend(lemmatize(sent)) for sent in sentences]
    return tokens

In [5]:
def get_corpus(filename):
    # data = read_data(filename).iloc[:1000]
    data = read_data(filename)
    dates = data['datetime']
    # bodies = [preprocess(text) for text in list(data['body']) if not pd.isnull(text)]
    # titles = [preprocess(text) for text in list(data['title']) if not pd.isnull(text)]
    bodies = []
    for index, (title, body, author, datetime) in data.iterrows():
        if not pd.isnull(body):
            bodies.append(preprocess(body))
    return bodies, dates

In [29]:
def get_dates(filename):
    data = read_data(filename)
    dates = data['datetime']
    return dates

In [56]:
macrumors_dates, macrumors_titles, macrumors_bodies = get_corpus('.\expectations_meter\DATA\MacRummors.xlsx')
# appleinsider_data = get_corpus('.\expectations_meter\DATA\AppleInsider.xlsx')
# nineto5mac_data = get_corpus('.\expectations_meter\DATA\\NineToFiveMac.xlsx')

In [6]:
def perplexity(model, corpus):
    """Calculates perpelexity of the given model on a given corpus of bow texts"""
    corpus_length = 0
    log_likelihood = 0
    topic_profiles = model.state.get_lambda() / np.sum(model.state.get_lambda(), axis=1)[:, np.newaxis]
    for document in corpus:
        gamma, _ = model.inference([document])
        document_profile = gamma / np.sum(gamma)
        for term_id, term_count in document:
            corpus_length += term_count
            term_probability = np.dot(document_profile, topic_profiles[:, term_id])
            log_likelihood += term_count * log(term_probability)
    perplexity = np.exp(-log_likelihood / corpus_length)
    return perplexity

In [7]:
def make_dictionary_bow(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=None)
    print(dictionary)
    bow_texts = [dictionary.doc2bow(text) for text in texts]
    return  dictionary, bow_texts
    
def make_model(dictionary, bow_texts):
    """
    Reads the file with the given filename and creates training data, than trains the lda model and returns
    the model itself, array with dates and array with bow texts 
    """
   
    start = datetime.now()
    model = models.ldamodel.LdaModel(bow_texts, 
                                     id2word=dictionary, 
                                     num_topics=400, 
                                     chunksize=500, 
                                     update_every=5, 
                                     passes=10)
    print('Evaluation time: {}'.format((datetime.now() - start) / 60))
    print('Perplexity: {}'.format(perplexity(model, bow_texts)))
    return model

In [8]:
def count_topics_by_date(model, dates, bow_texts, threshold):
    """
    Counts the number of times each topic from a given model occurred in each of the bow_texts with corresponding 
    dates from the dates array.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for date, document in zip(dates[1:], bow_texts[1:]):
        if not len(document) == 0:
            topics = model.get_document_topics(document, minimum_probability=threshold)
            for topic, prob in topics:
                counts[date][topic] += 1
    return counts

In [9]:
def count_topics(counts_by_date):
    """Counts the overall distribution of topics"""
    counts = defaultdict(int)
    for date, topics in counts_by_date.items():
        for topic, count in topics.items():
            counts[topic] += count
    return sorted(counts.items(), key=lambda x: -x[1])

In [12]:
# model, dates, titles, bow_texts = get_data_n_model(macrumors_dates, macrumors_titles, macrumors_bodies)
# date_counts1 = count_topics_by_date(model, dates, bow_texts, 0.04)
# sorted_counts1 = count_topics(date_counts)

# model2, dates2, titles2, bow_texts2 = get_data_n_model(macrumors_dates, macrumors_titles, macrumors_bodies)
# date_counts2 = count_topics_by_date(model2, dates2, bow_texts2, 0.04)
# sorted_counts2 = count_topics(date_counts2)
app_ins_bodies, app_ins_dates = get_corpus('.\expectations_meter\DATA\AppleInsider.xlsx')
app_ins_dict, app_ins_bow = make_dictionary_bow(app_ins_bodies)
app_ins_model = make_model(app_ins_dict, app_ins_bow)
app_ins_date_counts = count_topics_by_date(app_ins_model, app_ins_dates, app_ins_bow, 0.04)
app_ins_sorted_counts= count_topics(app_ins_date_counts)

nfm_bodies, nfm_dates = get_corpus('.\expectations_meter\DATA\\NineToFiveMac.xlsx')
nfm_dict, nfm_bow = make_dictionary_bow(nfm_bodies)
nfm_model = make_model(nfm_dict, nfm_bow)
nfm_date_counts = count_topics_by_date(nfm_model, nfm_dates, nfm_bow, 0.04)
nfm_sorted_counts= count_topics(nfm_date_counts)


# model2, dates2, bow_texts2 = get_data_n_model('/home/dmitri/SAS/myfolders/task/second/2_prodengi.csv')

Dictionary(11007 unique tokens: ['term', 'incurring', 'joy', 'alter', 'fight']...)


Evaluation time: 0:01:50.063612


Perplexity: 870.1238689470717


Dictionary(6526 unique tokens: ['term', 'package', 'index', 'joy', 'alter']...)


Evaluation time: 0:00:51.515913


Perplexity: 783.0925089464627


In [109]:
# model2.save('400_topics.model')
import json
with open('bow_texts_macrummors.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(bow_texts2, ensure_ascii=False))

In [14]:
def write_to_file(filename, topics_terms):
    with open(filename, 'w', encoding='utf-8') as f:
        for topic, count, terms in topics_terms:
            f.write("<{}> --> {}: [{}]\n".format(topic, count, terms))

In [24]:
def load_bow(filename):
    with open(filename, 'r', encoding='utf-8') as f:
         return json.loads(f.read())

In [31]:
app_ins_model = models.LdaModel.load('.\\expectations_meter\\topic_models\\models\\app_ins_400_model.model')
app_ins_dict = corpora.Dictionary.load('.\\expectations_meter\\topic_models\\dicts\\app_ins_dict.dict')
app_ins_bow = load_bow('.\\expectations_meter\\topic_models\\bows\\app_ins_bow.json')

In [32]:
app_ins_dates = get_dates('.\expectations_meter\DATA\AppleInsider.xlsx')

In [35]:
app_ins_date_counts = count_topics_by_date(app_ins_model, app_ins_dates, app_ins_bow, 0.04)
# app_ins_sorted_counts= count_topics(app_ins_date_counts)

In [28]:
def topics_words(model, sorted_counts, topics_number, words_number):
    terms = []
    for topic, count in sorted_counts[:topics_number]:
        most_frequent_words = model.show_topic(topic, words_number)
        terms.append([topic, count, ', '.join([word for word, prob in most_frequent_words])])
    return terms

In [18]:
terms = topics_words(app_ins_model, app_ins_sorted_counts, 150, 10)
# write_to_file('app_ins_model_400_topics.txt', terms)
pp(terms, width=500)
# pp(best_topics1, width=500)
# pandas.DataFrame(best_topics1)

[[360, 4723, 'report, would, source, could, reportedly, been, claim, rumor, according, claimed'],
 [270, 3423, "n't, do, can, doe, it, 're, like, one, make, work"],
 [208, 3041, 'app, store, ios, user, free, developer, download, can, apps, available'],
 [85, 2994, "cook, i, tim, ceo, we, about, do, n't, executive, people"],
 [378, 2503, 'ios, device, feature, user, system, ipad, mobile, release, running, operating'],
 [18, 2147, 'could, would, year, likely, current, next, belief, may, model, factor'],
 [254, 2071, 'launch, day, week, friday, first, sale, availability, store, preorders, sold'],
 [229, 2048, 'support, version, update, updated, feature, addition, improvement, latest, added, add'],
 [212, 2043, 'next-generation, expected, rumored, part, device, component, handset, rumor, could, current'],
 [249, 1881, 'production, supply, report, supplier, chain, order, expected, launch, source, digitimes'],
 [219, 1868, 'sale, million, quarter, percent, unit, year, market, shipment, sold,