In [1]:
import re
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy



In [2]:
# NLTK Stop words
import nltk

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [3]:
npr = pd.read_csv('npr1.csv')

In [4]:
npr.head()

Unnamed: 0,Article
0,Older women who look on the bright side of lif...
1,"In Bangladesh, a new report finds, impoverishe..."
2,"When he first moved to Miami, Waltter Teruel s..."
3,"When ATT, a leading Internet provider, propose..."
4,Donald Trump is on a tour of battleground st...


In [5]:
# Convert to list
data = npr.values.tolist()
len(data)


509

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [7]:
len(data_words)

509

In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
nlp = spacy.load("en_core_web_sm",disable=['parser']) 

##Adding additional stopwords
new_stopwords= ['dear', 'thanks','regards', 'hello','hi', 'bye','goodbye', 'say']
for word in new_stopwords:
    nlp.Defaults.stop_words.add(word)
    nlp.vocab[word].is_stop = True
    
def remove_stopwords_spacy(texts):
    return [[word.text for word in nlp(str(text)) if not word.is_stop] for text in texts]
    #return [[word.orth_ for word in nlp(str(text)) if word not in nlp.Defaults.stop_words] for text in texts]

def remove_stopwords_gensim(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words

data_words_nostops = remove_stopwords_spacy(data_words)
#data_words_nostops = remove_stopwords_gensim(data_words_nostops)

#remove words that only occur once to make process faster
all_tokens = sum(data_words_nostops, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
text_no_single_words = [[term for term in words if term not in tokens_once] for words in data_words_nostops]

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)


# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus 
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]



In [12]:
#Getting the number of topics for LDA using HDP
#hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=id2word)
#hdp_topics= hdp_model.print_topics()
#for topic in hdp_topics:
    #print(topic)


In [25]:
# Build LDA model
coherence_score=[]
perplexity_score=[]
for i in range(1,21):
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=i, 
                                               random_state=110,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=False,
                                               minimum_probability=0.0)

    perplexity_score.append(lda_model.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_score.append(coherence_model_lda.get_coherence())

print('Coherence Score', coherence_score)

print('Perplexity Score',perplexity_score)

#from pprint import pprint
#pprint(lda_model.print_topics())


Coherence Score [0.2544804347153319, 0.28176149636514136, 0.3014145047721189, 0.3502119920451648, 0.3874874672026672, 0.3627511320034637, 0.38055339667098886, 0.37687023913139384, 0.4103855876351199, 0.42085411453099136, 0.3962621878827782, 0.4272256673251504, 0.49276949372165835, 0.4391421070664864, 0.4507258092221508, 0.45156901431903307, 0.4521610378758477, 0.4584206342440234, 0.4294496576449215, 0.4324650168580141]
Perplexity Score [-8.477639622212628, -8.419906712834612, -8.405121608313172, -8.391348118555813, -8.38818905923543, -8.39845366588185, -8.403582022998092, -8.45347183295547, -8.59350276489136, -8.787612238639518, -9.15746067527672, -9.4857831873928, -9.895573895662746, -10.214646969235881, -10.385783259104166, -10.54412929596745, -10.71462136692859, -10.868532369018276, -10.998548166159186, -11.158216355984814]


In [29]:
f = lambda i: coherence_score[i]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=max(range(len(coherence_score)), key=f)+1, 
                                               random_state=110,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=False,
                                               minimum_probability=0.0)


max(range(len(coherence_score)), key=f)+1
      
      

13

In [30]:
#putting into dataframe the assinged topic of each document/email
from itertools import chain
lda_corpus = lda_model[corpus]

all_topics=lda_model.get_document_topics(corpus)
print(all_topics[0])

scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))
threshold = sum(scores)/len(scores)


all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
categorized_docs=[]

for document in all_topics_numpy:
    if max(document) > threshold:
        categorized_docs.append(document.argmax())
    else:
        categorized_docs.append('Unmarked')

npr = npr.assign(Topic = categorized_docs)
 
#npr['Topic'] = all_topics_numpy.argmax(axis=1)

[(0, 0.13778025), (1, 0.0001236082), (2, 0.18513234), (3, 0.09968137), (4, 0.0015244875), (5, 0.0005962362), (6, 0.0005161454), (7, 3.2855874e-05), (8, 0.0002037737), (9, 0.00019994486), (10, 0.00023227147), (11, 0.0002154149), (12, 0.57376134)]


In [31]:
print(npr.head())
npr['Topic'].value_counts(dropna=False)

                                             Article  Topic  Category
0  Older women who look on the bright side of lif...     12    Health
1  In Bangladesh, a new report finds, impoverishe...      3      Work
2  When he first moved to Miami, Waltter Teruel s...     12      Work
3  When ATT, a leading Internet provider, propose...      3      Work
4    Donald Trump is on a tour of battleground st...      3  Politics


3     252
12    122
0      36
5      34
4      32
11     10
2       8
8       7
6       6
10      2
Name: Topic, dtype: int64

In [32]:
##After getting making the LDA model now we will take top twenty words from each topic.
##We will further also make a list containing top twenty words for each topic (its a list of lists)

topics_with_words=[] #list that contains words belonging to each topic
topic_num_list=[] #topic numbers
for i in range(topic_num):
    tt = lda_model.get_topic_terms(i,20)
    topics_with_words.append([id2word[pair[0]] for pair in tt]) #filling list with each topics word
    topic_num_list.append(i) #filling up the topic number list   
    
#removing stopwords again and removing words that are not in the vocab     
for topic in topics_with_words:
    topic= [word for word in topic if not nlp.vocab.has_vector(word)]
    #cleanText = " ".join([token.text for token in tokens if token.has_vector]) 
    topic= [word for word in topic if not nlp.vocab[word].is_stop]       
topic_dict = dict(zip(topic_num_list, topics_with_words))#buliding a dictionary with list of topic numbers and list of words in those topics
topic_dict

{0: ['say',
  'police',
  'officer',
  'case',
  'report',
  'china',
  'year',
  'law',
  'kill',
  'death',
  'number',
  'official',
  'enforcement',
  'authority',
  'attack',
  'court',
  'government',
  'russia',
  'incident',
  'chinese'],
 1: ['courage',
  'grief',
  'pulse',
  'countless',
  'wit',
  'stump',
  'brandon',
  'glenn',
  'jet',
  'equality',
  'dance',
  'atlantic',
  'golden',
  'pearl',
  'dancer',
  'production',
  'float',
  'feb',
  'invitation',
  'legacy'],
 2: ['health',
  'child',
  'abortion',
  'zika',
  'woman',
  'service',
  'percent',
  'mojica',
  'patient',
  'program',
  'virus',
  'rico',
  'department',
  'rate',
  'care',
  'medical',
  'hospital',
  'plan',
  'disability',
  'special'],
 3: ['say',
  'time',
  'year',
  'new',
  'tell',
  'know',
  'day',
  'want',
  'go',
  'think',
  'people',
  'work',
  'come',
  'trump',
  'way',
  'write',
  'president',
  'word',
  'thing',
  'address'],
 4: ['say',
  'year',
  'people',
  'panda',
  

In [33]:
#function that takes in topic words and a category and returns similary between the two
#topic is a list and category should be passsed in as a spacy token
def calculate_similarity(topic_words, category):
    total_sim=0
    string=''
    #print("This is the string when we enter the function: {}".format(string))
    string = ' '.join([str(elem) for elem in topic_words])## this is the string that contains every word in that topic
    #print("This is the string when we fill it up: {}".format(string))
    doc= spacy_model(string) ##topic's string is used created a spacy doc
    for token in doc:
        total_sim+= token.similarity(category)
    return total_sim    
    

In [34]:
nlp = spacy.load('en_core_web_sm')
nlp.vocab.has_vector("epa")

False

In [35]:
##Here we firstly have a list of categories one of which will be assigned a category
##Here we calculate similarity of each of the top twenty words of a topic with each category, 
##sum up similarity index for each category for that topic and assign it with the highest one


spacy_model =spacy.load('en_core_web_md')
from spacy.tokenizer import Tokenizer


tokenizer = Tokenizer(spacy_model.vocab)
#list_of_categories= ['Personal', 'Professional', 'Social','Sports','Study','Health', 'Politics', 'Business', 'Science','Music']
list_of_categories =['Personal','Work' ,'Studies', 'Meeting', 'School', 'Bussiness', 'News', 'Subscriptions', 'Marketing', 'Clients',
                     'Sports', 'Extracurricular', 'Health', 'Travel', 'Schedule', 'Photography',
                    'Politics', 'Cooking', 'Fashion', 'Social', 'Fitness', 'Research', 'Science', 'Technology', 'Sales',
                    'Shopping', 'Economy', 'Finance', 'Music', 'Family', 'College', 'Bills', 'Games', 'Design', 'Advertisements',
                    'Reviews']
string_cat=' '.join([str(elem) for elem in list_of_categories])
categories = tokenizer(string_cat)

df = pd.DataFrame(0,index=topic_num_list,columns=list_of_categories)

for category in categories:
    for key, topic_words in topic_dict.items():
        df.loc[key,category.text] = calculate_similarity(topic_words, category)
        #print(key, category.text)


for value in topic_dict.values():
    
    print(value)
    print('\n')

df

######################################### Things that are left to do:  ################### 

#Run model iteratively until corellation of each topic with a category is above 8 (or has run 10 times)
#Run model iteratively until all topics have a seperate category (or has run 10 times)
#Run model iteratively until we get the best coherence 



#There need to be rules on how much a topic's correlation with a category needs to be for it to be put into it 

#We can do something like after clustering if even one of the topics has similarity with every category less thatn 8 we repeat the 
#clustering and we repeat it till all topics have similarity with one of the categories

#What happens when two topics highly corelated with a single category, that will probably not happen if the list of categories is large

#Try doing that thing where they keep repeating LDA until it has a certain coherence score (you have the saved in OneTab I think)



 

  total_sim+= token.similarity(category)


['say', 'police', 'officer', 'case', 'report', 'china', 'year', 'law', 'kill', 'death', 'number', 'official', 'enforcement', 'authority', 'attack', 'court', 'government', 'russia', 'incident', 'chinese']


['courage', 'grief', 'pulse', 'countless', 'wit', 'stump', 'brandon', 'glenn', 'jet', 'equality', 'dance', 'atlantic', 'golden', 'pearl', 'dancer', 'production', 'float', 'feb', 'invitation', 'legacy']


['health', 'child', 'abortion', 'zika', 'woman', 'service', 'percent', 'mojica', 'patient', 'program', 'virus', 'rico', 'department', 'rate', 'care', 'medical', 'hospital', 'plan', 'disability', 'special']


['say', 'time', 'year', 'new', 'tell', 'know', 'day', 'want', 'go', 'think', 'people', 'work', 'come', 'trump', 'way', 'write', 'president', 'word', 'thing', 'address']


['say', 'year', 'people', 'panda', 'community', 'resident', 'pan', 'home', 'family', 'city', 'local', 'muslim', 'story', 'day', 'church', 'work', 'car', 'son', 'know', 'conservation']


['trump', 'election', 'vo

Unnamed: 0,Personal,Work,Studies,Meeting,School,Bussiness,News,Subscriptions,Marketing,Clients,...,Economy,Finance,Music,Family,College,Bills,Games,Design,Advertisements,Reviews
0,5.922018,5.867961,5.000954,5.490351,6.148301,5.441613,5.91479,1.879558,3.822869,4.235974,...,5.02385,5.183203,3.77937,5.361845,5.124218,5.183096,3.625114,3.316906,2.809899,2.550122
1,3.832694,3.845042,2.61265,2.576775,3.720851,3.29185,2.880136,1.03495,2.567267,2.384122,...,3.213712,2.733543,4.429894,3.70704,3.608046,2.913475,2.674548,3.21988,1.654688,1.225528
2,6.069281,5.746724,5.241623,4.503539,6.150415,5.351885,4.313432,2.876343,4.05696,4.942735,...,4.210724,4.839167,3.281057,5.602248,5.394762,5.273412,2.706662,3.450992,2.888514,2.491245
3,7.604465,9.902568,5.151183,6.706818,7.189224,7.277782,6.2569,2.989461,5.743333,6.160741,...,6.047608,5.063769,6.343282,6.995584,6.545068,5.778267,5.364867,5.176726,3.736625,3.483445
4,6.349193,7.703877,4.773705,5.888851,7.190732,6.201897,5.458743,2.115626,4.403845,5.072792,...,5.075056,4.688004,5.436282,7.88778,6.09914,4.711411,4.281257,4.531069,2.765506,2.996892
5,4.788271,4.277224,3.017914,4.918998,5.275575,4.521624,4.5958,1.543686,3.181172,2.908593,...,5.570172,5.027438,3.377301,4.59936,5.143477,5.397762,2.934463,2.670879,2.712019,1.840912
6,3.728169,4.428463,3.020306,3.091806,4.601205,3.741062,3.754669,1.743823,2.999665,2.155294,...,3.619874,3.033892,4.170179,4.387602,4.281016,3.418801,3.57943,3.362475,1.980857,1.903859
7,2.485831,2.491964,3.123583,1.220706,2.476729,1.459165,1.287098,0.635384,1.591328,1.235657,...,2.119257,1.296225,2.666702,2.148987,2.304873,1.767338,2.090425,1.938429,1.406396,0.53359


In [36]:
import math
topic_category_dict=df.idxmax(axis=1).to_dict() ##dictionary with all topics with their corresponding categories

topic_category_correlation_dict=df.max(axis=1).to_dict() ##dictionary with all topics with their correlation to their categories

thresh=sum(topic_category_correlation_dict.values())/len(topic_category_correlation_dict)   

print(thresh)


for key, value in topic_category_correlation_dict.items():
    if topic_category_correlation_dict.get(key)<math.floor(thresh):
        topic_category_dict[key]='Unmarked'

print(topic_category_dict)

npr['Category']= npr['Topic'].map(topic_category_dict)
print(npr)

npr['Category'].value_counts(dropna=False)

npr_remaining=npr[npr['Category']=='Unmarked']

npr_remaining.to_csv('npr_remaining.csv')




6.554836190654896
{0: 'School', 1: 'Unmarked', 2: 'Health', 3: 'Work', 4: 'Family', 5: 'Politics', 6: 'Unmarked', 7: 'Unmarked'}
                                               Article  Topic  Category
0    Older women who look on the bright side of lif...     12       NaN
1    In Bangladesh, a new report finds, impoverishe...      3      Work
2    When he first moved to Miami, Waltter Teruel s...     12       NaN
3    When ATT, a leading Internet provider, propose...      3      Work
4      Donald Trump is on a tour of battleground st...      3      Work
..                                                 ...    ...       ...
504  The number of law enforcement officers shot an...      0    School
505    Trump is busy these days with victory tours,...      3      Work
506  It’s always interesting for the Goats and Soda...      4    Family
507  The election of Donald Trump was a surprise to...      5  Politics
508  Voters in the English city of Sunderland did s...      4    Family

[509 r

In [None]:


#list_categories=list(topic_category_dict.values())

##Firstly lets check for any duplicates within the topic's categories and reassign these to next best option
#run loop on dictionary with names of category 
#find similar ones, compare their correlations
#let the higher correlated one keep its category 

#all_distinct_categories= False

#rev_multidict = {}
#for key, value in topic_category_dict.items():
#    rev_multidict.setdefault(value, set()).add(key) 

#rev_multidict.update((k,list(v)) for k,v in rev_multidict.items()) 

#rev_multidict2= {}
#for key,value in rev_multidict.items(): rev_multidict2[key] = [topic_category_correlation_dict.get(item) for item in value]

#print(rev_multidict2)
#{k: [item: topic_category_correlation_dict.get(item) for item in v ] for k, v in rev_multidict2.items()}    
    
#while !all_distinct_categories:
#if :
#    all_distinct_categories=True    
#else:
 #   all_distinct_categories=False
  #  rev_multidict = {}
   # for key, value in topic_category_dict.items():
    #    rev_multidict.setdefault(value, set()).add(key) 

    #repeated_values=[values for key, values in rev_multidict.items() if len(values) > 1]
    #for set1 in repeated_values:
     #   temp_list= []
      #  for value in set1:
       #     temp_list.append(topic_category_correlation_dict.get(value))

#for key, value in rev_multidict:
 #   for topic in value:
  #      if topic_category_correlation_dict.get(topic)
        
   #     dfnlargest(2).values[-1]
    #    if df.get_value(topic, key)  
            
        
        
    

#print(rev_multidict)



##Next we can check whether the correlation is above a certain number,
#if it is we keep the cateogry otherwise those emails will be unmarked



#print(df.idxmax(axis=1))
#print(this_dictionary)
#print(df.max(axis=1))

In [23]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.453472005712328

Coherence Score:  0.37687023913139384


In [None]:
#from itertools import chain
#tokenized_list = list(chain(*data_lemmatized))

#from sklearn.feature_extraction.text import CountVectorizer
#cv =CountVectorizer(max_df=0.9, min_df=2, stop_words='english')
#dtm = cv.fit_transform(tokenized_list)

#corpus = gensim.matutils.Sparse2Corpus(dtm, documents_columns=False)
#id_map = dict((v, k) for k, v in cv.vocabulary_.items())

#lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           #id2word=id_map,
                                           #num_topics=5, 
                                           #random_state=110,
                                           #update_every=1,
                                           #chunksize=100,
                                           #passes=10,
                                           #alpha='auto',
                                           #per_word_topics=True)
#lda_corpus = lda_model[corpus]
#from pprint import pprint
#pprint(lda_model.print_topics())



In [None]:
#what I should still be doing is comparing the highest probability that a document belongs to a topic
#with some kind of a threshold. If the document's probability is below this threshold it will be categorized with no category


#from itertools import chain
#lda_corpus = lda_model[corpus]
#scores = list(chain(*[[score for topic_id,score in topic] \
                      #for topic in [doc for doc in lda_corpus]]))
#scores = []
#for doc in lda_corpus:
 #   for topic in doc:
  #      for topic_id, score in topic:
   #         scores.append(score)
#threshold = sum(scores)/len(scores)
#threshold = sum(scores)/len(scores)
