In [None]:
import datetime
import gensim
from gensim.matutils import argsort
from gensim.models.coherencemodel import CoherenceModel
from gensim.topic_coherence import segmentation
from gensim.topic_coherence import probability_estimation
from gensim.topic_coherence import direct_confirmation_measure
from gensim.topic_coherence.aggregation import arithmetic_mean

import numpy as np

from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from yappyChuck import Client

# Client used further down to send the sonification messages (see `client.send`).
client = Client()

Read the data in from the CSV file.

In [None]:
# Load the dataset; its `text` column is what gets vectorised later on.
df = pd.read_csv('../enmi.csv')

Set up the stop list and the vectoriser that will be applied to the data.

In [None]:
# Words to ignore when counting: English stop words, the token 'enmi18',
# Twitter-specific shorthand, and French stop words (in that order).
stoplist = (
    stopwords.words('english')
    + ['enmi18']
    + ['rt', 'dm', 'ht']  # remove Twitter specific language
    + stopwords.words('french')
)

# Only tokens made of three or more word characters are counted.
vect = CountVectorizer(
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
    stop_words=stoplist,
)

Vectorise the text and create the gensim corpus from the resulting document-term matrix.

In [None]:
# Fit the vectoriser on the `text` column and get the sparse count matrix.
X = vect.fit_transform(df.text)
# Wrap the sparse matrix as a gensim corpus; documents_columns=False means
# the documents are the rows of X rather than its columns.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

Now to create the LDA model

In [None]:
number_of_topics = 5
# LDA training is stochastic: without a fixed seed every "Restart & Run All"
# yields different topics, and therefore different coherence values and
# different sonification messages further down.
random_seed = 42

# CountVectorizer stores word -> column id; gensim needs the inverse mapping.
id_to_word = {term_id: word for word, term_id in vect.vocabulary_.items()}

model = gensim.models.LdaModel(corpus,
                               id2word=id_to_word,
                               num_topics=number_of_topics,
                               random_state=random_seed)

### Create the coherence

Now to set up the coherence parts

In [None]:
# Getting the topics out is the key to this: for every topic keep the ids of
# its 5 highest-weighted terms (argsort in descending order).
# Author's note: topn is 20 by default (presumably gensim's coherence default
# -- confirm); the choice of topn appears to be key in getting closer to 0.
topic_term_weights = model.get_topics()
topics = [argsort(weights, topn=5, reverse=True) for weights in topic_term_weights]
topics

In [None]:
# Segment the topics' term ids with gensim's `s_one_pre` scheme.
segmented_topics = segmentation.s_one_pre(topics)

Set up the probability estimations against the corpus.

In [None]:
# Boolean-document probability estimation over the corpus for the segmented
# topics; `result` is the accumulator queried by the cells that follow.
result = probability_estimation.p_boolean_document(corpus, segmented_topics)

Let's sonify the probabilities here. 

In [None]:
#each of these is an index in t above. 
import time
import math

def cent_to_note(num_cent, base_freq):
    '''
       Return the frequency that lies `num_cent` cents away from `base_freq`.

       1200 cents make one octave (a doubling of frequency), so the result is
       base_freq * 2 ** (num_cent / 1200). Negative cents give a lower
       frequency.
    '''
    octave_fraction = num_cent / 1200
    frequency_ratio = math.pow(2, octave_fraction)
    return base_freq * frequency_ratio

def freq_to_cent(current_freq, next_freq):
    '''
      Return the interval, in cents, from `current_freq` to `next_freq`.

      Computed as 1200 * log2(next_freq / current_freq): positive when
      `next_freq` is higher, negative when it is lower, 0 when they are equal.

      Raises ZeroDivisionError if `current_freq` is 0 and ValueError if the
      frequency ratio is not strictly positive (log2 is undefined there).
    '''
    # Plain scalar multiplication: np.dot is a matrix/vector product and, on
    # scalars, only served to wrap the result in a numpy float.
    return 1200 * math.log2(next_freq / current_freq)

# Iterate over the entries of the accumulator's index and send one
# "spectral" message per entry, one second apart.
# Presumably each key is a term id and each value the collection of documents
# containing that term -- confirm against gensim's accumulator.
for term_id, occurrences in result.index_to_dict().items():
    # The key is sent as a float, followed by the size of its value.
    # TODO: need to handle values > 1000 - the frequency is bad.
    frequency = float(term_id)
    occurrence_count = len(occurrences)
    payload = "spectral:{0}:{1}".format(frequency, occurrence_count)
    client.send(payload)
    time.sleep(1)

In [None]:
# Apply the log-conditional-probability confirmation measure to the segmented
# topics, using the probability estimates accumulated in `result`.
avg = direct_confirmation_measure.log_conditional_probability(segmented_topics, result)
avg

In [None]:
# Aggregate the confirmation values into a single score via their arithmetic mean.
arithmetic_mean(avg)