In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from scipy.stats import entropy

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Name of the free-text excerpt column (not referenced again in the visible cells).
text_col = 'Excerpt Copy'

# Read the tab-separated coder export and keep only the id, excerpt, rank,
# identity and Q3/Q4 columns, in this order.
df = pd.read_csv('data/coder1_all.tsv', sep='\t').loc[:, [
    'uni', 'Participant', 'Excerpt Copy', 'rank', 'identity',
    'Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer',
    'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-f', 'Q4-m',
]]

# Row count of the loaded frame.
print(len(df))

df.head()

In [None]:
# Remove the "Question: Q<id>; Answer:" and "Question: Q<id>-other; Answer:"
# prefixes from every string cell. The patterns are raw strings so that `\d`
# and `\w` reach the regex engine unchanged instead of being parsed as
# (invalid) Python string escapes, which emits a warning on recent Pythons.
df = df.replace({r'Question: Q\d*\w?; Answer:': ''}, regex=True)
df = df.replace({r'Question: Q\d*-other; Answer:': ''}, regex=True)

def unlist(x):
    """Return the item stored at index/key ``0`` of ``x``.

    Raises whatever ``x[0]`` raises (e.g. ``IndexError`` for an empty list).
    """
    first = x[0]
    return first

# Build one document per (uni, Participant) pair by joining all of that
# participant's excerpts with single spaces. The result is a Series named
# 'Excerpt Copy' whose index is the (uni, Participant) MultiIndex.
# Missing excerpts are dropped and remaining values coerced to str so that a
# NaN cell cannot make ' '.join raise TypeError.
text = (
    df.groupby(['uni', 'Participant'])['Excerpt Copy']
      .agg(lambda excerpts: ' '.join(excerpts.dropna().astype(str)))
)
# Number of participant-level documents.
print(text.shape[0])
text.head()

In [None]:
# Topic count constant. NOTE(review): no visible cell reads this global;
# worker() has its own `n_topics` parameter that also defaults to 10.
n_topics = 10
# Number of top-weighted words printed per topic (passed to display_topics()).
n_snow = 10

# NumPy array of the values held in `text`.
documents = text.values

In [None]:
# Show the first entry of `text` as a sanity check.
text.iloc[0]

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row is one topic's weight vector and must
        support ``argsort()``.
    feature_names : indexable
        Maps a column position of ``components_`` to its printable name.
    no_top_words : int
        How many names to print per topic, ordered by descending weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_ids = weights.argsort()[::-1][:no_top_words]
        words = " ".join(feature_names[term_id] for term_id in top_ids)
        print("Topic", topic_idx, ":  ", words)

def JSD(P, Q):
    """Jensen-Shannon divergence between two weight vectors.

    Both inputs are flattened to 1-D float arrays, scaled by their L1 norm so
    they sum to one, and compared against their midpoint ``M``::

        JSD = 0.5 * KL(P || M) + 0.5 * KL(Q || M)

    ``scipy.stats.entropy`` is called with its default (natural) logarithm.

    Parameters
    ----------
    P, Q : array-like
        Weight vectors of equal length. Lists, tuples, pandas Series and
        single-row 2-D arrays are accepted.

    Returns
    -------
    float
        The divergence, or ``nan`` when either input has zero total mass
        (it cannot be normalised into a distribution).

    Raises
    ------
    ValueError
        If the flattened inputs differ in length.
    """
    p = np.asarray(P, dtype=float).ravel()
    q = np.asarray(Q, dtype=float).ravel()
    if p.shape != q.shape:
        raise ValueError("P and Q must have the same number of elements")

    # ravel() above guarantees the vector 1-norm (sum of absolute values) is
    # used, not the matrix 1-norm that a (1, n) array would trigger.
    p_mass = np.linalg.norm(p, ord=1)
    q_mass = np.linalg.norm(q, ord=1)
    if p_mass == 0 or q_mass == 0:
        return np.nan

    p = p / p_mass
    q = q / q_mass
    m = 0.5 * (p + q)
    return 0.5 * (entropy(p, m) + entropy(q, m))

def list_sims(df):
    """Compute the JSD for every unordered pair of rows.

    Parameters
    ----------
    df : pandas.DataFrame, numpy.ndarray or scipy sparse matrix
        One vector per row. Objects exposing ``toarray()`` (scipy sparse
        matrices) are densified first; anything else goes through
        ``np.asarray``.

    Returns
    -------
    pandas.DataFrame
        Columns ``i``, ``j`` (row positions with ``i < j``) and ``jsd``.
        The columns are present even when there are fewer than two rows.
    """
    # Work on a dense array so rows are addressed by position; label-based
    # `.loc[i]` breaks on non-default indexes and does not exist on sparse
    # matrices.
    if hasattr(df, 'toarray'):
        rows = df.toarray()
    else:
        rows = np.asarray(df, dtype=float)

    n = rows.shape[0]
    result = [
        {'i': i, 'j': j, 'jsd': JSD(rows[i], rows[j])}
        for i in range(n)
        for j in range(i + 1, n)
    ]

    return pd.DataFrame(result, columns=['i', 'j', 'jsd'])
    
def worker(documents, method='NMF', n_topics=10, calc_edges=True,
           n_top_words=None, random_state=None):
    """Fit a topic model on ``documents``, print its topics, optionally score pairs.

    Parameters
    ----------
    documents : iterable of str
        Raw documents handed to the vectorizer.
    method : {'NMF', 'LDA'}
        'NMF' uses TF-IDF features with ``NMF``; 'LDA' uses raw counts with
        ``LatentDirichletAllocation``.
    n_topics : int
        Number of components of the topic model.
    calc_edges : bool
        When true, also compute the pairwise JSD table via ``list_sims``.
    n_top_words : int, optional
        Words printed per topic. Defaults to the notebook-level ``n_snow``.
    random_state : int, optional
        Forwarded to the topic model so runs can be made repeatable.

    Returns
    -------
    pandas.DataFrame or None
        The ``list_sims`` result computed on the vectorized document rows,
        or ``None`` when ``calc_edges`` is false.

    Raises
    ------
    ValueError
        If ``method`` is neither 'NMF' nor 'LDA'.
    """
    if method == 'NMF':
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                     max_features=1000,
                                     stop_words='english')
        # NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
        # alpha_W/alpha_H and later removed; left as-is because the new
        # parameters may not be numerically equivalent - confirm target version.
        mod = NMF(n_components=n_topics,
                  alpha=.1,
                  l1_ratio=.5,
                  init='nndsvd',
                  random_state=random_state)

    elif method == 'LDA':
        vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                     max_features=1000,
                                     stop_words='english')
        mod = LatentDirichletAllocation(n_components=n_topics,
                                        max_iter=20,
                                        learning_method='online',
                                        n_jobs=-1,
                                        random_state=random_state)
    else:
        # Previously an unknown method fell through and failed later with an
        # unbound-variable error on `vectorizer`.
        raise ValueError("method must be 'NMF' or 'LDA', got %r" % (method,))

    transformed = vectorizer.fit_transform(documents)
    # get_feature_names() is gone from newer scikit-learn releases; use its
    # replacement when the installed version provides it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()
    model = mod.fit(transformed)

    if n_top_words is None:
        n_top_words = n_snow
    display_topics(model, feat_names, n_top_words)

    edges = None
    if calc_edges:
        # fit_transform returns a scipy sparse matrix, which has no `.loc`;
        # hand list_sims a dense, default-indexed DataFrame instead.
        # NOTE(review): this scores the vectorized document rows, as the
        # original call did. If similarity of topic mixtures was intended,
        # pass model.transform(transformed) instead - confirm.
        edges = list_sims(pd.DataFrame(transformed.toarray()))

    return edges

In [None]:
# NumPy array of the values in `text`; used as the document collection for worker().
person = text.values

In [None]:
# Results of worker() keyed by "<method>_person". The NMF run is evaluated
# first, then the LDA run; each call also prints its topics.
edges = {
    'nmf_person': worker(person, 'NMF'),
    'lda_person': worker(person, 'LDA'),
}

In [None]:
# Histogram (20 bins) of the `jsd` column of the NMF edge table.
edges['nmf_person']['jsd'].hist(bins=20)

In [None]:
# Histogram (20 bins) of the `jsd` column of the LDA edge table.
edges['lda_person']['jsd'].hist(bins=20)

In [None]:
# Print the NMF topics for several topic counts; pairwise JSD is skipped
# (calc_edges=False), so worker() only prints and returns None.
for i in [3, 5, 8, 10, 15]:
    print("\n\nNMF", i)
    worker(person, 'NMF', n_topics=i, calc_edges=False)

In [None]:
# Print the LDA topics for the same topic counts; pairwise JSD is skipped
# (calc_edges=False), so worker() only prints and returns None.
for i in [3, 5, 8, 10, 15]:
    print("\n\nLDA:", i)
    worker(person, 'LDA', n_topics=i, calc_edges=False)

In [None]:
# NOTE(review): `tmp` is not defined in any visible cell, so this line fails
# on a fresh kernel unless it is created elsewhere - confirm where the table
# written to cosine_people.tsv comes from.
tmp.to_csv('data/cosine_people.tsv', sep='\t')
# `text` is a Series indexed by (uni, Participant), so the ids live in its
# index; selecting them as columns on the Series raises KeyError. They are
# written from a separate frame so that `text` keeps holding the documents
# that later cells vectorise.
person_ids = text.reset_index()[['uni', 'Participant']]
person_ids.to_csv('data/cosine_people_ids.tsv', sep='\t')

In [None]:
# Preview the first rows of `text`.
text.head()

In [None]:
# IPython help lookup for TfidfVectorizer (interactive aid only; it produces
# no value used by later cells). NOTE(review): consider removing before sharing.
TfidfVectorizer?

In [None]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import regexp_tokenize 

# English Snowball stemmer shared by every call to my_tokenizer().
stemmer = SnowballStemmer("english")

def my_tokenizer(text):
    """Split ``text`` into ``\\w+`` tokens and return their Snowball stems.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        One stemmed token per regex match, in document order.
    """
    # Raw string so `\w` is a regex class rather than an invalid string escape.
    return [stemmer.stem(w) for w in regexp_tokenize(text, r'\w+')]

# TF-IDF vectorizer that tokenizes with my_tokenizer (regex split + stemming),
# fitted on the values of `text`.
vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.95,
)
vectorizer.fit(text.values)

In [None]:
# 15-component NMF on the stemmed TF-IDF features.
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# alpha_W/alpha_H and later removed; left as-is because the new parameters may
# not be numerically equivalent - confirm the target version.
mod = NMF(n_components=15,
              alpha=.1,
              l1_ratio=.5,
              init='nndsvd')

# fit_transform re-fits the vectorizer on the same values before transforming.
transformed = vectorizer.fit_transform(text.values)
# get_feature_names() is gone from newer scikit-learn releases; use its
# replacement when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feat_names = vectorizer.get_feature_names_out()
else:
    feat_names = vectorizer.get_feature_names()
model = mod.fit(transformed)

# Print the top `n_snow` stems of each topic.
display_topics(model, feat_names, n_snow)

In [None]:
# 15-topic NMF through worker(), which builds its own TfidfVectorizer without
# the custom tokenizer; prints topics only (calc_edges=False).
worker(person, 'NMF', n_topics=15, calc_edges=False)