In [1]:
import re
import os
import string
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
folder_path = "./twitterdata/"

user_timeline = pd.read_csv(folder_path + 'Hwoodmoviegeek_timeline.csv', encoding='utf-8')

filtered_timeline = user_timeline[(user_timeline.isRT == False) & (user_timeline.lang == 'en')]

filtered_timeline

Unnamed: 0,id,isRT,time,lang,text
0,1196130948628152321,False,17.11.2019 18:20:15,en,@bleedingcool Man of steel?
1,1195991070393257984,False,17.11.2019 09:04:26,en,@IMDb Doctor strange
4,1195478171111444481,False,15.11.2019 23:06:21,en,@TwitterMovies What are both of yours next pro...
5,1195252904447238145,False,15.11.2019 08:11:13,en,@TheEllenShow @SteveSpangler @andylassner He s...
6,1195252811535015936,False,15.11.2019 08:10:51,en,@andylassner @TheEllenShow @SteveSpangler You ...
...,...,...,...,...,...
82,1209241280263229440,False,23.12.2019 22:36:02,en,@Fandango Escape room
83,1209108460660412416,False,23.12.2019 13:48:15,en,@TysMae @randymoncesart @johncampea That's nyc...
84,1209076525347131392,False,23.12.2019 11:41:21,en,@randymoncesart @johncampea So is it fun for p...
85,1208821162903400448,False,22.12.2019 18:46:38,en,@johncampea So can it be new got???


In [3]:
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')
 
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: x.lower())
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: re.sub(r'(^|[^@\w])@(\w{1,15})\b', '', x)) #user tags
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: re.sub(r'(^|[^@\w])#(\w{1,15})\b', '', x)) #user tags
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: re.sub(r'http\S+',  '', x))   #urls 
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: x.replace('\n','')) 
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
filtered_timeline.loc[:,'text'] = filtered_timeline['text'].apply(deEmojify) #emojis
filtered_timeline = filtered_timeline.drop(filtered_timeline[filtered_timeline.text == ''].index) #delete empty strings
filtered_timeline['text'].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


array([' man of steel ', ' doctor strange',
       ' what are both of yours next projects ',
       ' he should get a raise man', ' you should get a raise dude',
       ' i think   nailed it    it was the best episode',
       ' did anyone masturbate at this   ',
       ' i totally love animated movies too',
       ' the updated opening weekend was 96 million dollars',
       ' once my mom wanted to have dinner with me  i said i have a date tonight  she said   yeah right ',
       ' inception hands down',
       ' man of steel  infinity war  the dark knight rises',
       ' breaking bad finale',
       ' yeah but i think it is the highest without chinese release',
       ' it was fine', ' stop my parents from having me ', ' prisoners',
       ' the avengers', ' inception',
       ' harry potter and the deathly hallows part 2',
       ' obviously inception', ' inception', ' inception period',
       'ohh  would love if that would happen wouldn t he   ',
       ' chewbaccha', ' escape ro

In [4]:
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from pywsd.utils import lemmatize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
stop_words.add('didnt')
stop_words.add('dont')
stop_words.add('youre')
stop_words.add('im')

def tokenization_w(texts):
    tokenized_texts = []
    for text in texts:
        w_token = word_tokenize(text)
        filtered_sentence = [w for w in w_token]
        tokenized_texts.append(filtered_sentence)
    return tokenized_texts

def lemmatization(stem_array):
    lemmatized = []
    for stems in stem_array:
        lemmas = [lemmatize(x) for x in stems if not x in stop_words]
        lemmatized.append(lemmas)
    return lemmatized

tokens = tokenization_w(filtered_timeline['text'])
lemmatized_data = lemmatization(tokens)
data = [' '.join(list) for list in lemmatized_data]
data

Warming up PyWSD (takes ~10 secs)... took 5.551051616668701 secs.


['man steel',
 'doctor strange',
 'next project',
 'get raise man',
 'get raise dude',
 'think nail best episode',
 'anyone masturbate',
 'totally love animate movie',
 'update opening weekend 96 million dollar',
 'mom want dinner say date tonight say yeah right',
 'inception hand',
 'man steel infinity war dark knight rise',
 'break bad finale',
 'yeah think high without chinese release',
 'fine',
 'stop parent',
 'prisoner',
 'avenger',
 'inception',
 'harry potter deathly hallows part 2',
 'obviously inception',
 'inception',
 'inception period',
 'ohh would love would happen',
 'chewbaccha',
 'escape room lvoe genre',
 'dude sleep wait movie everyday day die watch',
 'michael scott rachel joey phil dunphy',
 'think second well 1st',
 'inception',
 'deserve',
 'unusually crazy',
 'hostel',
 'playmobil',
 'inception',
 'music fantaaaaaaastic',
 'music fantaaaaaaasticc',
 'best movie time',
 'escape room',
 'reaction',
 'money heist',
 'lion king win whether like visual effect beyond 

In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(bigram_mod[data_words[0]])

['man', 'steel']


In [7]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

data_lemmatized

[[],
 ['strange'],
 ['next', 'project'],
 ['raise', 'man'],
 ['raise', 'dude'],
 ['think', 'good', 'episode'],
 [],
 ['totally', 'love', 'animate', 'movie'],
 ['update', 'opening', 'weekend', 'dollar'],
 ['mom', 'want', 'dinner', 'say', 'date', 'tonight', 'say'],
 [],
 [],
 ['break', 'bad', 'finale'],
 ['think', 'high', 'chinese', 'release'],
 ['fine'],
 ['stop', 'parent'],
 ['prisoner'],
 [],
 ['inception'],
 ['hallow', 'part'],
 ['obviously', 'inception'],
 ['inception'],
 ['inception', 'period'],
 ['would', 'love', 'would', 'happen'],
 [],
 ['room'],
 ['wait', 'movie', 'everyday', 'day', 'die', 'watch'],
 [],
 ['think', 'second', 'well'],
 ['inception'],
 ['deserve'],
 ['unusually', 'crazy'],
 [],
 [],
 ['inception'],
 ['music'],
 ['music'],
 ['good', 'movie', 'time'],
 ['room'],
 ['reaction'],
 ['money'],
 ['win', 'visual', 'effect', 'stun'],
 ['dude'],
 [],
 ['love', 'war', 'movie', 'even', 'original'],
 ['film', 'single', 'year', 'beat'],
 ['pure', 'class'],
 ['brilliant'],
 ['in

In [8]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [9]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.056*"say" + 0.039*"even" + 0.021*"dinner" + 0.021*"present" + '
  '0.021*"marvel" + 0.021*"mom" + 0.021*"otherwise" + 0.021*"video" + '
  '0.021*"hope" + 0.021*"date"'),
 (1,
  '0.154*"inception" + 0.023*"movie" + 0.023*"raise" + 0.023*"film" + '
  '0.023*"think" + 0.023*"everyday" + 0.023*"watch" + 0.023*"die" + '
  '0.023*"wait" + 0.023*"year"'),
 (2,
  '0.068*"room" + 0.047*"people" + 0.026*"bad" + 0.026*"love" + 0.026*"want" + '
  '0.026*"opinion" + 0.026*"rank" + 0.026*"respect" + 0.026*"really" + '
  '0.026*"classic"'),
 (3,
  '0.053*"good" + 0.053*"think" + 0.053*"movie" + 0.029*"episode" + '
  '0.029*"break" + 0.029*"love" + 0.029*"well" + 0.029*"finale" + '
  '0.029*"experience" + 0.029*"second"'),
 (4,
  '0.045*"would" + 0.045*"dude" + 0.045*"music" + 0.025*"love" + 0.025*"want" '
  '+ 0.025*"dollar" + 0.025*"see" + 0.025*"people" + 0.025*"look" + '
  '0.025*"work"')]


In [10]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'didnt',
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'dont',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'im',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 '

In [11]:
x=lda_model.show_topics(num_topics=8, num_words=10,formatted=False)
topics_words = [[wd[0] for wd in tp[1]] for tp in x]

topics = []
for words in topics_words:
    topics.append(words)
    
topics

[['say',
  'even',
  'dinner',
  'present',
  'marvel',
  'mom',
  'otherwise',
  'video',
  'hope',
  'date'],
 ['inception',
  'movie',
  'raise',
  'film',
  'think',
  'everyday',
  'watch',
  'die',
  'wait',
  'year'],
 ['room',
  'people',
  'bad',
  'love',
  'want',
  'opinion',
  'rank',
  'respect',
  'really',
  'classic'],
 ['good',
  'think',
  'movie',
  'episode',
  'break',
  'love',
  'well',
  'finale',
  'experience',
  'second'],
 ['would',
  'dude',
  'music',
  'love',
  'want',
  'dollar',
  'see',
  'people',
  'look',
  'work']]