# Gensim Processing

In [4]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, words
from nltk.stem import WordNetLemmatizer
import gensim
from collections import defaultdict
import contractions
import re
import string

In [5]:
eng_dict = set(words.words())
lemma = WordNetLemmatizer()
stops = set(stopwords.words('english'))

In [8]:
comments2 = pd.read_csv('../data/comments.csv', sep='|')
comments2.dropna(inplace=True)
text = list(comments2['text'])
strip_punctuation = re.compile('[%s]' % re.escape(string.punctuation))

In [9]:
# def expand_contractions(corpus):
#     expanded = [contractions.fix(comment.lower()) for comment in corpus]
#     return expanded

In [10]:
# def strip_punctuation(corpus):
#     strip_punc = re.compile('[%s]' % re.escape(string.punctuation))
#     no_punc = [strip_punc.sub('', comment) for comment in corpus]
#     return no_punc

In [65]:
class ProcessCorpus():
    '''
    Class for NLP processign to create a corpus that has:
      - contractions expanded
      - punctuation stripped
      - lemmatizaion
    There is an optional frequency filter included to remove words that appear less than some number of times.

    This class has stored attributes for a dictionary and corpus object to be supplied to gensim.
    '''

    def __init__(self, stopwords=None, eng_dict=None):
        self.stopwords = stopwords
        self.eng_dict = eng_dict
    
    def __expand_contractions(self):
        '''
        Expands contraction words such as don't --> do not prior to removing punctuation so meaning isn't lost and words will still appear in supplied dictionary for dictionary filtering.
        '''
        expanded = [contractions.fix(comment.lower()) for comment in self.corpus]
        return expanded

    def __strip_punctuation(self):
        '''
        Removes all punctuation from corpus.
        '''
        strip_punc = re.compile('[%s]' % re.escape(string.punctuation))
        no_punc = [strip_punc.sub('', comment) for comment in self.corpus]
        return no_punc

    def __lemmatize(self):
        '''
        Lemmatizes words using NLTK package's WordNetLemmatizer.
        '''
        split = []
        lemma = WordNetLemmatizer()
        for comment in self.corpus:
            split.append([lemma.lemmatize(word) for word in comment.split() if word not in self.stopwords])
        return split

    def __frequency_dict(self):
        '''
        Creates a dictionary of word frequencies in the corpus to be used for frequency filtering. Good for removing typos, and words that are not frequent enough to be considered slang due to low adoption by subculture.
        '''
        self.frequency_dict = defaultdict(int)
        for text in self.corpus:
            for token in text:
                self.frequency_dict[token] += 1
        
    def __filter_english(self):
        self.no_eng = [
            [word for word in comment
            if word not in self.eng_dict]
            for comment in self.corpus
        ]


    def fit(self, corpus):
        '''
        Call to process corpus.

        Creates attributes:
        corpus - processed
        gensim_dictionary
        '''
        self.corpus = corpus
        self.corpus = self.__expand_contractions()
        self.corpus = self.__strip_punctuation()
        self.corpus = self.__lemmatize()
        self.__frequency_dict()
        self.gensim_dictionary = gensim.corpora.Dictionary(self.corpus)
        temp = self.gensim_dictionary[0]
        self.gensim_corpus = [self.gensim_dictionary.doc2bow(text) for text in self.corpus]
        
        self.__filter_english()
        self.gensim_dictionary_no_english = gensim.corpora.Dictionary(self.no_eng)
        temp = self.gensim_dictionary_no_english[0]
        self.gensim_corpus_no_english = [self.gensim_dictionary_no_english.doc2bow(text) for text in self.no_eng]

    def return_frequency_filter(self, frequency = 1):
        freq_filter = [
            [token for token in text if self.frequency_dict[token] > frequency]
            for text in self.corpus
        ]
        return freq_filter
    
    def apply_frequency_filter(self, frequency = 1):
        self.corpus_freq = self.return_frequency_filter(frequency)
        self.gensim_dictionary_freq = gensim.corpora.Dictionary(self.corpus)
        temp = self.gensim_dictionary_freq[0]
        self.gensim_corpus_freq = [self.gensim_dictionary.doc2bow(text) for text in self.corpus]



In [23]:
dictionary = gensim.corpora.Dictionary(corpus.corpus)
c = [dictionary.doc2bow(text) for text in corpus.corpus]

In [66]:
corpus = ProcessCorpus(stopwords=stops, eng_dict=eng_dict)

In [67]:
corpus.fit(text)

In [48]:
corpus.apply_frequency_filter(5)

In [49]:
model = gensim.models.LdaModel(
    corpus=corpus.gensim_corpus,
    id2word=corpus.gensim_dictionary.id2token,
    chunksize=2000,
    alpha='auto',
    eta='auto',
    iterations=400,
    num_topics=10,
    passes=20,
    eval_every=1
)

In [88]:
model.get_topics().shape

(10, 120600)

In [50]:
model.save('mdl/model2/full_corpus_model')

In [81]:
toptopics = model.top_topics(corpus.gensim_corpus)

In [84]:
toptopics[0]

([(0.027945692, 'I'),
  (0.020974234, 'would'),
  (0.016574621, 'like'),
  (0.01620022, 'year'),
  (0.015440419, 'people'),
  (0.013655773, 'get'),
  (0.012080382, 'time'),
  (0.011261409, 'think'),
  (0.010394038, 'one'),
  (0.00971735, 'make'),
  (0.009337797, 'work'),
  (0.008740346, 'much'),
  (0.008324937, 'going'),
  (0.008266438, 'good'),
  (0.008090082, 'go'),
  (0.0080230115, 'want'),
  (0.007572819, 'know'),
  (0.0073177647, 'even'),
  (0.007150487, 'really'),
  (0.0070460634, 'need')],
 -2.1333582068623778)