In [2]:
import os.path # Directories, gotta have em.
import pickle # Better than .csv + a cute name.
import pandas as pd # DataFrames. Looping is easy. Obscure functions are fast.
import numpy as np # do math on vectors with more obscure functions.
import re # one-way encryption for your codebase
import pathlib
from scipy import stats

# Interactive Computing
from timeit import default_timer as tm
from tqdm import tqdm

# Spacy
import spacy
from spacy.tokens import Doc, DocBin
nlp = spacy.load('en_core_web_trf')
if Doc.has_extension('name'):
    pass
else:
    Doc.set_extension('name', default=None)
    
# LDA
from gensim.models import Phrases
from gensim.models import LdaModel # just for loading saved models
from gensim.models import LdaMulticore # for computing topic models
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

# Visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
persuade_corpus = pd.read_pickle('persuade_corpus_full.pkl')
display(persuade_corpus)

Unnamed: 0,essay_id_comp,full_text,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,hierarchical_id,hierarchical_text,hierarchical_label
0,423A1CA112E2,Phones\n\nModern humans today are always on th...,1.622628e+12,0.0,7.0,Phones\n\n,Unannotated,Unannotated 1,,,
1,423A1CA112E2,Phones\n\nModern humans today are always on th...,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,,,
2,423A1CA112E2,Phones\n\nModern humans today are always on th...,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,,,
3,423A1CA112E2,Phones\n\nModern humans today are always on th...,1.622628e+12,313.0,400.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,1.622628e+12,They are some really bad consequences when stu...,Position
4,423A1CA112E2,Phones\n\nModern humans today are always on th...,1.622628e+12,402.0,757.0,"When people have phones, they know about certa...",Evidence,Evidence 2,1.622628e+12,They are some really bad consequences when stu...,Position
...,...,...,...,...,...,...,...,...,...,...,...
285378,DF920E0A7337,Have you ever asked more than one person for h...,1.617757e+12,1624.0,2398.0,"One person can change your option, but it may ...",Evidence,Evidence 2,1.617757e+12,"it can change your perspective of a topic,",Claim
285379,DF920E0A7337,Have you ever asked more than one person for h...,1.617757e+12,2399.0,2454.0,"\nFinally, it informs you about what other peo...",Unannotated,Unannotated 5,,,
285380,DF920E0A7337,Have you ever asked more than one person for h...,1.617757e+12,2455.0,3266.0,Having more than one person's opinion might le...,Evidence,Evidence 3,1.617757e+12,it informs you about what other people enjoy.,Claim
285381,DF920E0A7337,Have you ever asked more than one person for h...,1.617757e+12,3267.0,3281.0,"\nIn conclusion,",Unannotated,Unannotated 6,,,


In [4]:
display(persuade_corpus.discourse_type.value_counts().index)

Index(['Claim', 'Evidence', 'Unannotated', 'Position', 'Concluding Statement',
       'Lead', 'Counterclaim', 'Rebuttal'],
      dtype='object')

In [5]:
# persuade_corpus['full_text'].str.replace('\n','')

# want to make sure I keep the essay id
# need to make a dict of possible labels with BOOL values. {"claim": False, "evidence":True, etc.}
# result should be a tuple of (name, text, label_dict)
# output should be a collection of spacy docs with ._.name and .cats


# add textcategorizer to pipeline
# add labels

In [6]:
df = persuade_corpus[['essay_id_comp', 'discourse_text', 'discourse_type']].copy()
df['discourse_text'] = df['discourse_text'].str.replace('\n','')
text_id_tuples = list(df[['discourse_text','essay_id_comp']].itertuples(index=False, name=None))

In [7]:
spacy_file = 'persuade.spacy'

def proc_texts(text_tuples):
    doc_bin = DocBin(attrs=["ORTH", "TAG", "HEAD", "DEP", "LEMMA", "MORPH", "POS"], store_user_data=True)
    for doc, name in nlp.pipe(text_tuples, as_tuples=True):
        doc._.trf_data = None
        doc._.name = name
#         doc.cats = {} # I can use this to store categories for spacy text categorization later on.
        doc_bin.add(doc)
    return doc_bin

start = tm()
if os.path.isfile(spacy_file):
    doc_bin = DocBin().from_disk(spacy_file)
else:
    doc_bin = proc_texts(text_id_tuples)
    doc_bin.to_disk(spacy_file)
docs = list(doc_bin.get_docs(nlp.vocab))
print(round(tm()-start,2))

84.19


In [10]:
nlp.Defaults.stop_words = {'the','an', 'a'}
nlp.Defaults.stop_words |= {'car', 'driverless', 'drive', 'mars', 'driver', 'electoral', 'vote', 'president', 'state', 'venus', 'planet', 'earth', 'elector', 'election', 'phone', 'cell', 'technology', 'emotion', 'student', 'project', 'design', 'school', 'community', 'activity'}
print(nlp.Defaults.stop_words)

{'driver', 'emotion', 'driverless', 'the', 'electoral', 'cell', 'election', 'state', 'an', 'vote', 'earth', 'community', 'a', 'venus', 'activity', 'drive', 'technology', 'mars', 'school', 'elector', 'planet', 'phone', 'car', 'president', 'project', 'design', 'student'}
{'driver', 'emotion', 'driverless', 'the', 'electoral', 'cell', 'election', 'state', 'an', 'vote', 'earth', 'community', 'a', 'venus', 'activity', 'drive', 'technology', 'mars', 'school', 'elector', 'planet', 'phone', 'car', 'president', 'project', 'design', 'student'}


In [11]:
def reduce_tokens(docs):
    '''
    Extract alpha tokens
    Lemmatizes and makes lowercase
    '''
    processed_docs = []
    for doc in docs:
        processed_doc = []
        for token in doc:
#             if not token.is_punct and not token.is_stop and not token.is_digit and not token.is_space:
            if token.is_alpha and not token.is_stop:
                processed_doc.append(token.lemma_.lower())
        processed_docs.append(processed_doc)
    return processed_docs

def compute_bigrams(processed_docs):
    '''
    For any bigrams that occur at least 20 times across all docs,
    if that bigram occurs in a doc, 
    add the bigram to the list of tokens in the doc.
    '''
    bigram = Phrases(processed_docs, min_count=20)
    for idx in range(len(processed_docs)):
        for token in bigram[processed_docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                processed_docs[idx].append(token)
    return processed_docs

def dictionary_corpus_processing(procced_docs):
    # Create a dictionary representation of the documents.
    diction = Dictionary(procced_docs)
    print('Number of unique tokens: %d' % len(diction))

    # Filter out words that occur in less than 20 documents, or more than 50% of the documents.
    diction.filter_extremes(no_below=20, no_above=0.5)
    print('Number of unique tokens: %d' % len(diction))

    # Make a bag of words.
    corp = [diction.doc2bow(doc) for doc in procced_docs]
    print('Number of documents: %d' % len(corp))
    
    return corp, diction

start = tm()
processed_docs = compute_bigrams(reduce_tokens(docs))
corpus, dictionary = dictionary_corpus_processing(processed_docs)
display("--- %s seconds ---" % round((tm() - start),2))

Number of unique tokens: 60801
Number of unique tokens: 8394
Number of documents: 285383


'--- 19.39 seconds ---'

In [12]:
def latent_dirichlet_allocation(input_corpus, input_dictionary):
    # Set training parameters.
    chunksize = 500000
    passes = 10
    iterations = 5
    eval_every = None  # Don't evaluate model perplexity, takes too much time.
    
    # Make an index to word dictionary.
    temp = input_dictionary[0]  # This is only to "load" the dictionary.
    id2word = input_dictionary.id2token
    
    # Train LDA model.
    start_time = tm()
    
    model = LdaMulticore(
        corpus=input_corpus,
        id2word=id2word,
        chunksize=chunksize,
#         alpha='auto',
#         eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every
    )

    print("--- %s seconds ---" % (tm() - start_time))
    
    return(model)

num_topics = 7

file_path = os.path.join('gensim_models', '{n}_topics'.format(n=num_topics))
pathlib.Path(file_path).mkdir(parents=True, exist_ok=True) 
model_filename = os.path.join(file_path, 'LDA_persuade_{n}_topics_stops_1.gensim'.format(n = num_topics))

if os.path.isfile(model_filename):
    print('Retrieving Topics from File')
    model = LdaModel.load(model_filename)
    print('Done.')
else:
    print('No model with {n} topics found. Extracting {n} topics.'.format(n=num_topics))
    model = latent_dirichlet_allocation(corpus, dictionary)
    print('Topics Extracted')
    model.save(model_filename)
    p = gensimvis.prepare(model, corpus, dictionary)
    pyLDAvis.save_html(p, 'pyLDAvis_{n}.html'.format(n = num_topics))
    print('Vizualization saved.')

# Get the documentxtopics as tuples
document_topics = model.get_document_topics(corpus, minimum_probability=0, minimum_phi_value=None, per_word_topics=False)

No model with 7 topics found. Extracting 7 topics.
--- 202.0795381180942 seconds ---
Topics Extracted
Vizualization saved.


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [13]:
# Create a label list for topics
topic_cols = [f'topic_{i}' for i in range(1,num_topics+1)]

# Get the topics for each document using list comprehension
topics_by_sample = pd.DataFrame([[y[1] for y in  x] for x in tqdm(document_topics)], columns=topic_cols)

100%|██████████| 285383/285383 [00:26<00:00, 10676.34it/s]


In [14]:
zscore_topics = pd.DataFrame(stats.zscore(topics_by_sample), columns=topic_cols)

# join document data to topic data
full_df = df.join(zscore_topics, how = 'inner')

In [15]:
full_df.groupby(['discourse_type']).agg(['mean']).style.background_gradient()

Unnamed: 0_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean
discourse_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Claim,0.047106,0.00405,-0.025404,0.002809,-0.039279,-0.018076,0.011515
Concluding Statement,0.043098,0.018832,0.002723,0.033921,-0.105064,-0.028295,0.005879
Counterclaim,-0.007909,-0.168776,-0.001271,-0.142952,-0.150606,0.169615,0.240718
Evidence,0.049717,0.026138,-0.018208,-0.011862,0.060057,-0.053128,-0.04657
Lead,0.054028,0.109016,-0.034086,-0.098589,-0.016323,-0.101141,0.060325
Position,-0.071244,-0.017708,0.021916,0.151671,-0.101964,-0.026531,0.036329
Rebuttal,-0.05668,-0.144896,0.022194,-0.102912,-0.137442,0.138211,0.23233
Unannotated,-0.149991,-0.02671,0.067868,-0.008232,0.133662,0.120916,-0.071228


So if we wanted to only include words in the topic model that have good prompt distribution... well, we would first need to have prompt labels for the data.
Barring that, I could manually remove prompt words:
['car', 'driverless', 'drive', 'mars', 'driver', 'electoral', 'vote', 'president',
'state', 'venus', 'planet', 'earth', 'elector', 'election', 'phone', 'cell', 'technology',
'emotion', 'student', 'project', 'design', 'school', 'community', 'activity']

That's a good next step:
 - Should I also remove stop words? I think they are kind of interesting, but maybe removing 'of', 'and', 'the', 'a' would be good.
 - I will also try to find a number of topics that results in a good coherence score. I chose 8 for my first pass because that is the number of discourse_labels (including "unannotated")
 
 - get word2vec + doc2vec
 - classify discourse units based on vector representations
 - compare unseen vector to existing vectors
     - compare unseen vector to every single vector in corpus
     - create average vector for each discourse element
         - compare unseen vector to average