# Imports

In [125]:
import pymongo
from pymongo import MongoClient

import pandas as pd
import numpy as np
import re
import os
import json
import string
import math
from pandas.io.json import json_normalize

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer # or LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.corpus import stopwords
import spacy
#from textblob import TextBlob as tb

import gensim
from gensim.models import TfidfModel
from gensim import corpora
from gensim.models.phrases import Phrases, Phraser
import pyLDAvis
from pyLDAvis import gensim as gensimvis #topic modeling

#import codecs
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#import mpld3

from pprint import pprint
import pickle

In [110]:
#show full cell contents
pd.set_option('display.max_colwidth',-1) 

# Make better use of Jupyter Notebook cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Mongo Data to Dataframe

In [28]:
config = {
    'host': 'ec2-18-191-25-42.us-east-2.compute.amazonaws.com', 
    'username':  'admin',
    'password': 'secure_password',
    'authSource': 'articles'
}

In [29]:
client = MongoClient(**config)

In [30]:
db = client.articles

In [43]:
client = MongoClient("mongodb://admin:secure_password@18.191.25.42:27017" + "/?authSource=admin")
db = client.articles

In [33]:
db.authenticate('admin', 'secure_password')

True

In [48]:
blogs = db.blogs

In [52]:
blogs.find_one({});

In [53]:
#save as dataframe
blogs_df = pd.DataFrame(list(blogs.find()))

In [58]:
blogs_df.to_pickle("./blogs_df.pkl")

# Data Preprocessing

## EDA

1. Remove punctuation 
2. Lowercase
3. Tokenize 
4. Stop words
5. Replace numbers
6. Lemmatization or Stemming

In [59]:
blogs_df = pd.read_pickle('blogs_df.pkl')

In [60]:
text = pd.DataFrame(blogs_df['text'])

In [69]:
text.head(1);

**Check for Null Values**

In [66]:
assert text['text'].isnull().count() == len(text['text'])

**Statistical Summary**

In [73]:
num_words = text['text'].apply(lambda x: len(x.split()))
num_words_mean, num_words_std = np.mean(num_words), np.std(num_words)

num_sentences = text['text'].apply(lambda x: len(re.split( '~ ...' ,'~'.join(x.split('.')))))
num_sentences_mean = np.mean(num_sentences)

In [105]:
#all_words = list(text['text'].str.lower().str.split(' ', expand=True).stack().unique())
all_words = list(set(text['text'].apply(lambda x: len(x.lower().split()))))

## Cleaning

1. Remove punctuation 
2. Lowercase
3. Tokenize 
4. Stop words
5. Replace numbers
6. Lemmatization or Stemming

In [112]:
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [117]:
text_arr = text.values

In [119]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        

clean_sents = list(sent_to_words(text_arr))

In [121]:
clean_sents;

## Bigram & Trigram Models

In [127]:
phrases = Phrases(clean_sents, min_count=1, threshold=1)

#phrases_trigram = Phrases(clean_sents, min_count=10)

In [128]:
bigram = Phraser(phrases)

In [132]:
#apple to text
for sent in bigram[clean_sents]:
    pass

In [136]:
file = open('clean_sents.pkl', 'wb')
pickle.dump(clean_sents, file)
file.close()

### Cleaning Function

In [26]:
#Helper function to clean text

default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english') # or any other list of your choice

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

def clean_text(text, ):

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s)]

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        #tokens = tokenize_text(text)
        text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols (puncuation) by space in text
        text = BAD_SYMBOLS_RE.sub('', text)
        return text
        #pattern = re.compile('[{}]'.format(re.escape(characters)))
        #return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer= default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords):
        tokens = [w for w in tokenize_text(text) if w not in stop_words]
        return ' '.join(tokens)

    text = text.strip(' ') # strip whitespaces
    text = text.lower() # lowercase
    text = stem_text(text) # stemming
    text = remove_special_characters(text) # remove punctuation and symbols
    text = remove_stopwords(text) # remove stopwords
    text = re.sub(r'\d+', 'num', text) #substitute numbers with 'num'
    #text.strip(' ') # strip whitespaces again??

    return text

In [27]:
text = text['text'].apply(lambda x: clean_text(x))

In [28]:
# helper function for tokenization
def tokenize(text):

    tokens = word_tokenize(text)
    
#     bigrams  = [' '.join(bigram) for bigram in nltk.bigrams(tokens)]
#     trigrams = [' '.join(trigram) for trigram in nltk.trigrams(tokens)]
    
    return tokens

In [29]:
tokens = text.apply(lambda x: tokenize(x))

In [30]:
token_arr = tokens.values

In [33]:
#token_arr[document index][work index]
#token_arr[2]

## Bag of Words

In [53]:
dictionary = gensim.corpora.Dictionary(token_arr)

count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count += 1
    #if count > 10:
       # break

0 abl
1 abroad
2 account
3 actor
4 adam
5 ago
6 alic
7 alist
8 allegi
9 alon
10 along
11 america
12 american
13 ami
14 amid
15 among
16 amount
17 ani
18 anim
19 anyon
20 anyth
21 appar
22 arguabl
23 assassin
24 averag
25 awaken
26 away
27 back
28 bad
29 becam
30 becaus
31 becom
32 befor
33 ben
34 benhur
35 bet
36 better
37 beyond
38 bfg
39 big
40 biggest
41 billion
42 billiondollar
43 book
44 box
45 brad
46 budget
47 built
48 butt
49 buy
50 came
51 camera
52 captain
53 cent
54 certainli
55 chang
56 chart
57 china
58 chri
59 chunk
60 civil
61 clear
62 colleg
63 compel
64 confus
65 cost
66 could
67 countri
68 cours
69 credit
70 creed
71 current
72 date
73 day
74 decemb
75 declar
76 denzel
77 depp
78 despit
79 determin
80 dirti
81 disney
82 diverg
83 doe
84 dog
85 domest
86 done
87 dori
88 dragon
89 draw
90 driven
91 dude
92 earli
93 earlier
94 earn
95 easier
96 eight
97 either
98 elba
99 emma
100 empir
101 enough
102 entertain
103 episod
104 estim
105 even
106 ever
107 everybodi
108 ever

In [54]:
#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
dictionary.filter_extremes(no_below=2)

In [55]:
len(dictionary)

95

In [57]:
#Create dictionary for each document with words and frequency
bow_corpus = [dictionary.doc2bow(doc) for doc in token_arr]

In [59]:
bow_corpus;

## TF-IDF

In [70]:
tfidf = TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.04256282653793743),
 (1, 0.04256282653793743),
 (2, 0.04256282653793743),
 (3, 0.04256282653793743),
 (4, 0.04256282653793743),
 (5, 0.04256282653793743),
 (6, 0.04256282653793743),
 (7, 0.04256282653793743),
 (8, 0.08512565307587486),
 (9, 0.12768847961381227),
 (10, 0.04256282653793743),
 (11, 0.04256282653793743),
 (12, 0.04256282653793743),
 (13, 0.17025130615174972),
 (14, 0.04256282653793743),
 (15, 0.04256282653793743),
 (16, 0.04256282653793743),
 (17, 0.08512565307587486),
 (18, 0.04256282653793743),
 (19, 0.04256282653793743),
 (20, 0.04256282653793743),
 (21, 0.04256282653793743),
 (22, 0.04256282653793743),
 (23, 0.04256282653793743),
 (24, 0.04256282653793743),
 (25, 0.12768847961381227),
 (26, 0.04256282653793743),
 (27, 0.04256282653793743),
 (28, 0.04256282653793743),
 (29, 0.04256282653793743),
 (30, 0.04256282653793743),
 (31, 0.04256282653793743),
 (32, 0.12768847961381227),
 (33, 0.21281413268968716),
 (34, 0.04256282653793743),
 (35, 0.04256282653793743),
 (

In [None]:
# from gensim import corpora
# # Creating term dictionary of corpus, where each unique term is assigned an index. 
# dictionary = corpora.Dictionary(doc_clean)
 
# # Filter terms which occurs in less than 4 articles & more than 40% of the articles 
# #dictionary.filter_extremes(no_below=4, no_above=0.4)
 
# # List of few words which are removed from dictionary as they are content neutral
# stoplist = set('also use make people know many call include part find become like mean often different \
#                usually take wikt come give well get since type list say change see refer actually iii \
#                aisne kinds pas ask would way something need things want every str'.split())
# stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
# dictionary.filter_tokens(stop_ids)

# LDA (Bag of Words)

In [71]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [72]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.059*"million" + 0.024*"one" + 0.022*"still" + 0.019*"made" + 0.019*"market" + 0.018*"weekend" + 0.017*"current" + 0.017*"offic" + 0.016*"tast" + 0.015*"top"
Topic: 1 
Words: 0.025*"state" + 0.014*"work" + 0.012*"includ" + 0.012*"civil" + 0.012*"nigeria" + 0.012*"two" + 0.012*"way" + 0.012*"releas" + 0.011*"even" + 0.011*"ani"
Topic: 2 
Words: 0.116*"million" + 0.043*"still" + 0.037*"made" + 0.035*"one" + 0.029*"current" + 0.029*"top" + 0.028*"weekend" + 0.022*"like" + 0.022*"offic" + 0.021*"box"
Topic: 3 
Words: 0.011*"reach" + 0.011*"mark" + 0.011*"new" + 0.011*"saw" + 0.011*"score" + 0.011*"state" + 0.011*"million" + 0.011*"number" + 0.011*"come" + 0.011*"cent"
Topic: 4 
Words: 0.137*"state" + 0.044*"work" + 0.031*"nigeria" + 0.026*"two" + 0.026*"includ" + 0.020*"offic" + 0.020*"made" + 0.020*"civil" + 0.020*"even" + 0.020*"three"
Topic: 5 
Words: 0.088*"cent" + 0.087*"saw" + 0.085*"number" + 0.084*"new" + 0.031*"spend" + 0.031*"weekend" + 0.031*"run" + 0.031*"peri

# LDA (TF-IDF)

In [73]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.074*"mark" + 0.074*"reach" + 0.009*"score" + 0.009*"million" + 0.009*"state" + 0.009*"number" + 0.009*"saw" + 0.009*"new" + 0.009*"come" + 0.009*"cent"
Topic: 1 Word: 0.040*"cent" + 0.040*"new" + 0.040*"number" + 0.040*"saw" + 0.018*"fewer" + 0.018*"monday" + 0.018*"run" + 0.018*"reduc" + 0.018*"london" + 0.018*"compar"
Topic: 2 Word: 0.011*"mark" + 0.011*"reach" + 0.011*"score" + 0.011*"million" + 0.011*"state" + 0.011*"number" + 0.011*"saw" + 0.011*"new" + 0.011*"qualiti" + 0.011*"cent"
Topic: 3 Word: 0.011*"mark" + 0.011*"reach" + 0.011*"score" + 0.011*"number" + 0.011*"state" + 0.011*"saw" + 0.011*"million" + 0.011*"new" + 0.011*"cent" + 0.011*"veri"
Topic: 4 Word: 0.047*"score" + 0.044*"million" + 0.021*"top" + 0.021*"well" + 0.021*"qualiti" + 0.019*"still" + 0.019*"guy" + 0.018*"come" + 0.017*"one" + 0.017*"made"
Topic: 5 Word: 0.011*"mark" + 0.011*"reach" + 0.011*"score" + 0.011*"state" + 0.011*"saw" + 0.011*"million" + 0.011*"number" + 0.011*"new" + 0.011*"qual

In [76]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text)

In [77]:
# Show feature matrix
bag_of_words.toarray();

In [78]:
# Get feature names
feature_names = count.get_feature_names()

In [79]:
# Create data frame
words = pd.DataFrame(bag_of_words.toarray(), index = posts_df['title'], columns=feature_names)

In [80]:
words

Unnamed: 0_level_0,aa,abdulaziz,abl,abroad,abubakar,abuja,academ,account,acrimoni,across,...,would,wrestl,yari,ye,year,yet,young,zamfara,zooland,zootopia
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The 8 Biggest Box Office Winners (and Losers) of 2016,0,0,1,1,0,0,0,2,0,0,...,1,0,0,1,14,1,0,0,1,2
"Mark Ingram reaches 1,000 rushing yards; first for Saints since 2006",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
&apos;Drastic drop&apos; in New Year&apos;s Day sales shoppers across UK as number of customers plummets by half,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,5,0,0,0,0,0
Iwobi bound to score more goals - Wenger,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
Rolling into 2017 Nigeria ‘stylee’,1,1,1,0,1,1,1,0,1,0,...,0,1,1,0,7,0,0,1,0,0


In [84]:
#functions for TD-IDF / BM25
def tf(word, doc):
    return doc.count(word) / len(doc)

def n_containing(word, doclist):
    return sum(1 for doc in doclist if word in doc)

def idf(word, doclist):
    return math.log(len(doclist) / (0.01 + n_containing(word, doclist)))

def tfidf(word, doc, doclist):
    return (tf(word, doc) * idf(word, doclist))

In [229]:
# Create dictonary of words

plottest = plot_data[0][0:1000]

worddic = {}

for doc in plottest:
    for word in wordsunique:
        if word in doc:
            word = str(word)
            index = plottest.index(doc)
            positions = list(np.where(np.array(plottest[index]) == word)[0])
            idfs = tfidf(word,doc,plottest)
            try:
                worddic[word].append([index,positions,idfs])
            except:
                worddic[word] = []
                worddic[word].append([index,positions,idfs])

In [85]:
# the index creates a dic with each word as a KEY and a list of doc indexs, word positions, and td-idf score as VALUES
#worddic['china']

In [None]:
# pickel (save) the dictonary to avoid re-calculating
np.save('worddic.npy', worddic)