In [3]:
import pandas as pd
#import modules and test files

test_set = pd.read_csv("test.txt", sep=";")
train_set = pd.read_csv("train.txt", sep=";")
val_set = pd.read_csv("val.txt", sep=";")
val_set.columns = ["text","sentiment"]
val_set.head()

Unnamed: 0,text,sentiment
0,i feel like i am still looking at a blank canv...,sadness
1,i feel like a faithful servant,love
2,i am just feeling cranky and blue,anger
3,i can have for a treat or if i am feeling festive,joy
4,i start to feel more appreciative of what god ...,joy


In [5]:
#txt requirements
# pandas
# numpy
# spacy>=2.2.4
# nltk>=3.4.5
# gensim>=3.8.3
# plotnine>=0.6.0
# tomotopy>=0.7.1
# wordcloud>=1.7.0

import numpy as np 
import spacy
import nltk as nltk
import gensim
import plotnine
import tomotopy
# import wordcloud
from nltk.corpus import stopwords

nltk.download('stopwords', quiet=True)
st_words = stopwords.words('english')
extra_stops=['from','subject','re', 'edu','use']
st_words.extend(extra_stops)

# stop word removal
for sentence in val_set["text"]:
    for word in sentence.split():
        if word not in st_words:
            print("Kept ==> " +word)
        else:
            print("Removed: " +word)

Removed: i
Kept ==> feel
Kept ==> like
Removed: i
Removed: am
Kept ==> still
Kept ==> looking
Removed: at
Removed: a
Kept ==> blank
Kept ==> canvas
Kept ==> blank
Kept ==> pieces
Removed: of
Kept ==> paper
Removed: i
Kept ==> feel
Kept ==> like
Removed: a
Kept ==> faithful
Kept ==> servant
Removed: i
Removed: am
Removed: just
Kept ==> feeling
Kept ==> cranky
Removed: and
Kept ==> blue
Removed: i
Removed: can
Removed: have
Removed: for
Removed: a
Kept ==> treat
Removed: or
Removed: if
Removed: i
Removed: am
Kept ==> feeling
Kept ==> festive
Removed: i
Kept ==> start
Removed: to
Kept ==> feel
Removed: more
Kept ==> appreciative
Removed: of
Removed: what
Kept ==> god
Removed: has
Kept ==> done
Removed: for
Removed: me
Removed: i
Removed: am
Kept ==> feeling
Removed: more
Kept ==> confident
Removed: that
Removed: we
Removed: will
Removed: be
Kept ==> able
Removed: to
Kept ==> take
Kept ==> care
Removed: of
Removed: this
Kept ==> baby
Removed: i
Kept ==> feel
Kept ==> incredibly
Kept ==> lu

In [7]:
spC = spacy.load('en_core_web_sm')
for sentence in val_set["text"]:
    lemma_sentence = spC(sentence)
    for token in lemma_sentence:
        text = token.text
        lemma_token = token.lemma_
        pos = token.pos_
        print(token.text + " ==> " +token.lemma_ + ", " + token.pos_)
        
    
        

i ==> I, PRON
feel ==> feel, VERB
like ==> like, SCONJ
i ==> I, PRON
am ==> be, AUX
still ==> still, ADV
looking ==> look, VERB
at ==> at, ADP
a ==> a, DET
blank ==> blank, ADJ
canvas ==> canvas, NOUN
blank ==> blank, ADJ
pieces ==> piece, NOUN
of ==> of, ADP
paper ==> paper, NOUN
i ==> I, PRON
feel ==> feel, VERB
like ==> like, ADP
a ==> a, DET
faithful ==> faithful, ADJ
servant ==> servant, NOUN
i ==> I, PRON
am ==> be, AUX
just ==> just, ADV
feeling ==> feel, VERB
cranky ==> cranky, ADJ
and ==> and, CCONJ
blue ==> blue, ADJ
i ==> I, PRON
can ==> can, AUX
have ==> have, VERB
for ==> for, ADP
a ==> a, DET
treat ==> treat, NOUN
or ==> or, CCONJ
if ==> if, SCONJ
i ==> I, PRON
am ==> be, AUX
feeling ==> feel, VERB
festive ==> festive, ADJ
i ==> I, PRON
start ==> start, VERB
to ==> to, PART
feel ==> feel, VERB
more ==> more, ADV
appreciative ==> appreciative, ADJ
of ==> of, ADP
what ==> what, PRON
god ==> god, PROPN
has ==> have, AUX
done ==> do, VERB
for ==> for, ADP
me ==> I, PRON
i ==>

In [None]:
#TODO: preprocess to bigrams then you can start making hdp model

In [None]:
def run_preprocess(news, min_token_len=3, rm_accent=True, bigram_min_cnt=5, bigram_thresh=100,
                   extra_stops=['from','subject','re', 'edu','use'],
                   postags=['NOUN','VERB','ADV','ADJ']):

    '''Function wrapper to preprocess the 20Newsgroup dataset and generate ready to model results
    
    *** Inputs**
    news:obj -> 20Newsgroup object from sklearn (i.e. 20fetch...)
    min_token_len: int -> tokens less than this number are excluded during tokenization
    rm_accent : bool -> flag whether to remove deaccents
    bigram_min_cnt: int -> ignore all words and bigrams with total collected count lower than this value
    bigram_thresh: int -> threshold for building phrases, higher means fewer phrases
    extra_stops: list -> extra stopwords to ignore asidr from NLTK default
    postags:list -> words/bigrams to include based on POS (part-of-speech)
    
    ** Returns**
    df: Master df with 20newgroup data and labels
    word_list_lemmatized: list -> list of lists w/ lemmatized bigrams 
    '''
    
    ### Setting up stopwords and Spacy
    nltk.download('stopwords', quiet=True)
    st_words = stopwords.words('english')
    st_words.extend(extra_stops)
    
    # Build master dataframe
    df = pd.DataFrame([news.target, news.data]).T
    df = df.set_index(0)

    df = pd.concat([df, pd.Series(news.target_names)],axis=1, join="inner")
    df.reset_index(inplace=True)
    df.columns = ["topic_id", "content", "topic_name"]

    # Convert values to list
    doc_list = df.content.values.tolist()

    # Remove email signs, newlines, single quotes
    doc_list = [re.sub(r'\S*@\S*\s?', '', txt) for txt in doc_list]
    doc_list = [re.sub(r'\s+', ' ', txt) for txt in doc_list]
    doc_list = [re.sub(r"\'", "", txt) for txt in doc_list]

    # Tokenize based on min_token_len and deaccent flags
    print("Tokenizing...\n")
    word_list = [simple_preprocess(txt, deacc=rm_accent, min_len=min_token_len) for txt in doc_list]
     
    # Create bigram models
    bigram = Phrases(word_list, min_count=bigram_min_cnt, threshold=bigram_thresh) # use original wordlist to build model
    bigram_model = Phraser(bigram)
    
    # Remove stopwords
    print("Removing Stopwords...\n")
    word_list_nostops = [[word for word in txt if word not in st_words] for txt in word_list]
    
    # Implement bigram models
    print("Create bigrams...\n")
    word_bigrams = [bigram_model[w_vec] for w_vec in word_list_nostops] # implement it in the list w/ no stopwords
    
    # Lemmatize POS-tags to keep
    print("Lemmatizing, keeping " + ",".join(postags)+ " POS tags...\n")
    word_list_lemmatized = lemmatize(word_bigrams, ptags=postags)

    print("Done preprocessing " + str(df.shape[0]) + " documents")
    return df, word_list_lemmatized
    

# Helper function    
def lemmatize(word_list, ptags):
    '''Lemmatizes words based on allowed postags, input format is list of sublists 
       with strings'''
    spC = spacy.load('en_core_web_sm')
    lem_lists =[]
    for vec in word_list:
        sentence = spC(" ".join(vec))
        lem_lists.append([token.lemma_ for token in sentence if token.pos_ in ptags])
    
    return lem_lists

In [None]:
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import sys

def train_HDPmodel(hdp, word_list, mcmc_iter, burn_in=100, quiet=False):
    '''Wrapper function to train tomotopy HDP Model object
    
    *** Inputs**
    hdp: obj -> initialized HDPModel model
    word_list: list -> lemmatized word list of lists
    mcmc_iter : int -> number of iterations to train the model
    burn_in: int -> MC burn in iterations
    quiet: bool -> flag whether to print iteration LL and Topics, if True nothing prints out
    
    ** Returns**
    hdp: trained HDP Model 
    '''
    
    # Add docs to train
    for vec in word_list:
        hdp.add_doc(vec)

    # Initiate MCMC burn-in 
    hdp.burn_in = 100
    hdp.train(0)
    print('Num docs:', len(hdp.docs), ', Vocab size:', hdp.num_vocabs, ', Num words:', hdp.num_words)
    print('Removed top words:', hdp.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)

    # Train model
    step=round(mcmc_iter*0.10)
    for i in range(0, mcmc_iter, step):
        hdp.train(step, workers=3)
        if not quiet:
            print('Iteration: {}\tLog-likelihood: {}\tNum. of topics: {}'.format(i, hdp.ll_per_word, hdp.live_k))
        
    print("Done\n")  
    
    return hdp
    
        
def get_hdp_topics(hdp, top_n=10):
    '''Wrapper function to extract topics from trained tomotopy HDP model 
    
    ** Inputs **
    hdp:obj -> HDPModel trained model
    top_n: int -> top n words in topic based on frequencies
    
    ** Returns **
    topics: dict -> per topic, an arrays with top words and associated frequencies 
    '''
    
    # Get most important topics by # of times they were assigned (i.e. counts)
    sorted_topics = [k for k, v in sorted(enumerate(hdp.get_count_by_topics()), key=lambda x:x[1], reverse=True)]

    topics=dict()
    
    # For topics found, extract only those that are still assigned
    for k in sorted_topics:
        if not hdp.is_live_topic(k): continue # remove un-assigned topics at the end (i.e. not alive)
        topic_wp =[]
        for word, prob in hdp.get_topic_words(k, top_n=top_n):
            topic_wp.append((word, prob))

        topics[k] = topic_wp # store topic word/frequency array
        
    return topics