# Program Setup

### Imports

In [1]:
from __future__ import division

import os
import time
import re
import ast
import json
import itertools
from datetime import datetime
from time import sleep
from collections import Counter
import pickle as cPickle

import pandas as pd
import numpy as np

import spacy
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

### Set Logger

In [2]:
import logging

logger = logging.getLogger('./log/KeywordsFinder.log')
logger.setLevel(20)
handler = logging.FileHandler(filename='./log/Taxonomy_Generation.log', mode='w', encoding='utf-8')
# handler.setLevel(20)
formatter = logging.Formatter('%(asctime)s : %(filename)s : %(lineno)d : %(levelname)s : %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

### Read Config 

In [3]:
class _Config:
    def __init__(self):
        self.config = dict()
        with open('./config/config.txt') as f:
            self.config = json.load(f)
        
    def __getattr__(self, name):
        return self.config[name]

config = _Config()
global corpus_file_path,base_dir_path, records, stopwords_file, spacy_nlp_model_name, min_df, non_spacy_words_excel_file_path, best_fit_analysis_excel_file_path, lower_factor_vocab,higher_factor_vocab, max_iter_min, max_iter_max, max_iter_incr, final_taxonomy_file_path, timestamp, directory_path, best_fit_tryouts_folder_path, filter_ner

corpus_file_path = config.__getattr__('corpus_file_path')
base_dir_path = config.__getattr__('base_dir_path')
records = config.__getattr__('records')
stopwords_file_path = config.__getattr__('stopwords_file_path')
spacy_nlp_model_name = config.__getattr__('spacy_nlp_model_name')
min_df = config.__getattr__('min_df')
filter_ner = config.__getattr__('filter_ner').split(',')

### Set Global Variables

In [4]:
nlp = spacy.load(spacy_nlp_model_name)
stop_words = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()

roman_characters = re.compile(u'^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$')
alpha_characters = re.compile(u'[^a-z]+')

### Load Stopwords list

In [5]:
# Read stopwords from file
additional_stopwords = None
with open(stopwords_file_path) as f:
    additional_stopwords = f.readlines()
additional_stopwords = [x.strip() for x in additional_stopwords]
stop_words.extend(additional_stopwords)
stop_words = list(set(stop_words))

### Load News data

In [6]:
_dataset = pd.read_excel(corpus_file_path, nrows = records)
#_dataset = pd.read_csv(corpus_file_path, nrows = records)

logger.info("Number of Records - {}".format(len(_dataset)))
_dataset = _dataset.drop(['Unnamed: 0'], axis=1)
_dataset['TEXT'] = _dataset[u'article_news_title']+ '. '+_dataset[u'article_news_abstract'] + '. '+_dataset[u'article_news_body']
_dataset = _dataset[['id','TEXT']]
_dataset['TEXT'] = _dataset.TEXT.apply(str)

_dataset['sentences'] = _dataset['TEXT'].apply(sent_tokenize)

news = _dataset.sentences.tolist()
sentences = list(itertools.chain.from_iterable(news))
words = []
for sentence in sentences:
    words.extend(sentence.split())

print("No. of news - {}".format(len(news)))
print("No. of sentences - {}".format(len(sentences)))
print("No. of words - {}".format(len(words)))

No. of news - 5
No. of sentences - 64
No. of words - 2132


### Extract Noun Chunks

In [7]:
_dataset['noun_chunks'] = _dataset['sentences'].apply(lambda x: list(itertools.chain.from_iterable(map(lambda y: list(nlp(y).noun_chunks),x))))
chnks = list(itertools.chain.from_iterable(_dataset['noun_chunks']))

print("No. of Noun Chunks extracted - {}".format(len(chnks)))
_dataset.to_excel("Extracted_Noun_Chunks.xlsx")
print("Extracted_Noun_Chunks.xlsx")

No. of Noun Chunks extracted - 599
Extracted_Noun_Chunks.xlsx


# Data Cleaning

#### 1. Bring words to root form
#### 2. Remove stop words
#### 3. Remove Roman numbers
#### 4. Remove words which are <= 2 characters

In [8]:
def is_roman(word):
    if roman_characters.search(word.upper()):
        return True
    return False

def cleaning_text(msg):
    msg = str(msg).lower()
    msg = alpha_characters.sub(' ', msg)
    msg = [lemmatized_word for lemmatized_word in
               [wordnet_lemmatizer.lemmatize(word, 'n') for word in msg.split() 
                if (word not in stop_words) and (not is_roman(word))
               ] 
           if (lemmatized_word not in stop_words)
           and (not is_roman(lemmatized_word))
           and (len(lemmatized_word) > 2) 
          ]
    msg=' '.join(list(filter(bool,msg)))
    return msg

def cleaning_list_of_text(list_of_msg):
    return list(cleaning_text(x) for x in list_of_msg if len(cleaning_text(x)) > 0)

_dataset['noun_chunks_cleaned'] = _dataset['noun_chunks'].apply(cleaning_list_of_text)
_dataset.to_excel("Cleaned_Noun_Chunks.xlsx")

cleaned_chunks = _dataset.noun_chunks_cleaned.tolist()
cleaned_chunks = list((itertools.chain.from_iterable(cleaned_chunks)))

print("No. of Noun chunks after cleaning - {}".format(len(cleaned_chunks)))
print("Cleaned_Noun_Chunks.xlsx")
logger.info("No. of cleaned Noun Phrases - {}".format(len(cleaned_chunks)))

No. of Noun chunks after cleaning - 503
Cleaned_Noun_Chunks.xlsx


# Data Filtering

## N-gram based filtering

In [9]:
filtered_cleaned_chunks = filter(lambda x: len(x.split()) in [2,3,4] or (len(x.split()) == 1 and len(x) > 8), cleaned_chunks)
term_2_count_map = Counter(filtered_cleaned_chunks)

print("No. of Noun chunks after N-gram filter - {}".format(sum(term_2_count_map.values())))
print("No. of unique Noun chunks after N-gram filter - {}".format(len(term_2_count_map.keys())))

tf = pd.DataFrame({'term_phrase': list(term_2_count_map.keys()), 'count':list(term_2_count_map.values())})
logger.info("Initial Noun Phrase corpus size: {}".format(tf.shape[0]))

No. of Noun chunks after N-gram filter - 263
No. of unique Noun chunks after N-gram filter - 202


## Term frequency based filtering

In [10]:
tf = tf[tf['count'] >= min_df]

print("No. of Noun chunks after TF filter - {}".format(tf.shape[0]))
logger.info("Noun Phrase corpus size after min_df filter({0}) : {1}".format(3, tf.shape[0]))

No. of Noun chunks after TF filter - 12


## NER based filtering

In [11]:
def is_ner_to_be_removed(input_text):
    ner_doc = nlp(input_text)
    for X in ner_doc.ents:
        ner_tag = str(X.label_)
        word = str(X.text)
        if ner_tag in filter_ner:
            return True
        
tf['is_ner_to_be_removed'] = tf['term_phrase'].apply(is_ner_to_be_removed)
examples = tf[tf['is_ner_to_be_removed'] == True]['term_phrase'].tolist()[0:10]

tf = tf[tf['is_ner_to_be_removed'] != True]

print("No. of Noun chunks after NER filter - {}".format(tf.shape[0]))
print("Some of the chunks filtered ", examples)
#tf.to_excel("tf.xlsx")

No. of Noun chunks after NER filter - 11
Some of the chunks filtered  ['republican']


## POS based filtering

Remove Noun chunks which does not contain any noun term. 

They are least likely to form any topic.

For ex. "additional resource", "executive action", "local medium"

### Create a data frame with terms and their POS tags

In [12]:
temp_term_list = np.empty([0,2])
filtered_term_list = tf['term_phrase'].tolist()
documents = _dataset.TEXT.tolist()
for document in documents:
    _list = []
    doc = nlp(document)
    for t in doc:
        lemma_text = wordnet_lemmatizer.lemmatize(t.text.lower(), 'n')
        if (lemma_text not in stop_words) and (not is_roman(lemma_text)) and (len(lemma_text) > 2) and (t.text.lower() in filtered_term_list):
            _list.append([lemma_text, t.pos_])#,t.tag_])
    if len(_list) > 0:
        temp_term_list = np.vstack([temp_term_list,_list])
temp_term_list
term_df = pd.DataFrame(data=temp_term_list, columns=['term','pos']) #,'tag'])
logger.info("Tokens and corresponding POS tags Dataframe shape - {}".format(term_df.shape))
logger.info(term_df.head())
#term_df.to_excel("term_df_1.xlsx")

unique_terms = list(term_df.term.unique())
list_of_term_dict = []
for term in unique_terms:
    word_df = term_df[term_df.term == term]
    word_df = word_df.groupby(by=['pos'],).count()
    _dict = {pos_tag: count.term for pos_tag, count in word_df.iterrows()}
    _dict['term'] = term
    list_of_term_dict.append(_dict)
# list_of_term_dict[:5]

pos_df = pd.DataFrame.from_dict(list_of_term_dict)
# re-ordering columns
cols = list(pos_df)
cols.insert(0, cols.pop(cols.index('term')))
pos_df = pos_df.loc[:, cols]

logger.info("\n Tokens and corresponding POS tags with frequenct Dataframe shape - {}".format(pos_df.shape))
pos_df.to_excel("pos_df.xlsx")
print(pos_df.head())
print("pos_df.xlsx")

          term  NOUN  PROPN
0    president   6.0    9.0
1  immigration  16.0    NaN
2   havenstein   NaN    4.0
pos_df.xlsx


### Utility functions to check if term is noun

In [None]:
def get_term_pos(term):

    df = pos_df[pos_df.term == term].drop(['term'], axis=1).T
    if df.shape[1] == 1:
        df.columns = ['count']
        df = df.fillna(0).astype(int).sort_values(by=['count'], ascending=False)
        return df
    return None

def is_term_noun_in_corpus(term):
    noun_set = set(['NOUN','PROPN'])
    df = get_term_pos(term)
    if df is not None:
        top_term_pos = df[df['count'] != 0].head(1)['count'] # consider only highest pos
        term_pos_set = set(top_term_pos.index.tolist()) 
        return True if len(term_pos_set.intersection(noun_set)) > 0 else False
    else:
        return None

def is_phrase_term_noun_in_corpus(phrase):
    terms = phrase.split()
    terms = map(is_term_noun_in_corpus, terms)
    return any(terms)


### Remove keywords based on POS

In [None]:
tf['is_noun'] = tf['term_phrase'].apply(is_phrase_term_noun_in_corpus)
tf = tf[tf['is_noun'] == True]

# Sort the results based on statistics
tf = tf.sort_values(['count'], ascending=[0])
tf = tf[['term_phrase','count']].reset_index()
tf = tf.drop(['index'], axis=1)

tf.to_excel("keywords.xlsx")
logger.info("Noun Phrase corpus size after noun in chunk filter - {}".format(tf.shape[0]))
print("No. of Noun chunks after POS filter - {}".format(tf.shape[0]))
print("keywords.xlsx")