# Extracting keywords from Congress debates

## Class to build a customized stopword list

In [None]:
# Generate a list of meaningless words from a text
from __future__ import division

import pickle
import re
import pattern
import nltk
from pattern.es import tag

class StopListGenerator:
    """Build a text-specific stopword list from part-of-speech tags.

    Every token that ``pattern.es.tag`` does not label as a noun (``NN*``)
    or an adjective (``JJ*``) is a stopword candidate.  Candidates that also
    appear in the pickled noun or adjective dictionaries are discarded
    again, so they are never reported as stopwords.
    """

    # Default locations of the pickled dictionaries (machine specific).
    NOUNS_PATH = '/Users/jccan/Dropbox/projects/SpanishPoliticsDebate/data/pickle/nam.pickle'
    #NOUNS_PATH = 'C:\\Users\\canofran\\Dropbox\\projects\\SpanishPoliticsDebate\\data\\pickle\\nam.pickle'
    ADJS_PATH = '/Users/jccan/Dropbox/projects/SpanishPoliticsDebate/data/pickle/adj.pickle'
    #ADJS_PATH = 'C:\\Users\\canofran\\Dropbox\\projects\\SpanishPoliticsDebate\\data\\pickle\\adj.pickle'

    def __init__(self, nouns_path=None, adjs_path=None):
        """Load the noun and adjective dictionaries.

        nouns_path -- pickle file with the nouns; defaults to NOUNS_PATH.
        adjs_path  -- pickle file with the adjectives; defaults to ADJS_PATH.
        """
        self._nouns_path = self.NOUNS_PATH if nouns_path is None else nouns_path
        self._adjs_path = self.ADJS_PATH if adjs_path is None else adjs_path
        self._dict_noun = self._get_dict_nouns()
        self._dict_adj = self._get_dict_adjs()

    def generate(self, text):
        """Return the list of stopwords found in ``text`` (no duplicates).

        The text is lower-cased before tagging, so the returned words are
        lower-case.  The order of the returned list is not defined.
        """
        text = ' '.join(nltk.wordpunct_tokenize(text))
        # Candidates: every token not tagged as adjective (JJ*) or noun (NN*)
        candidates = set(word for word, pos in tag(text.lower())
                         if not pos.startswith(("JJ", "NN")))
        # Drop candidates that the dictionaries know as nouns or adjectives
        return [w for w in candidates
                if w not in self._dict_noun and w not in self._dict_adj]

    def cleaning_text(self, text):
        """Return ``text`` with symbols and numbers replaced by one space.

        Each match of the pattern (a run of the listed symbol characters,
        the pairs ",/" and ".,", repeated "..", decimal numbers or a single
        digit) becomes a single space.
        NOTE(review): "(\\.\\.)+" is tried before "(\\.\\.\\.)+", so an
        ellipsis "..." leaves one "." behind -- confirm this is intended.
        """
        return re.sub(r'[\xb0\xbb\xaa:$%#€&!¡?¿\'()\-"+]+|(,/)|(\.,)|(\.\.)+|(\.\.\.)+|(\d+\.\d*)|(\d*\.\d+)|(\d+\.\d+)|(\d)',' ',text)

    def _load_pickle(self, path):
        """Unpickle ``path``, closing the file even if loading fails."""
        # NOTE: pickle can run arbitrary code while loading; only point this
        # at trusted files.
        with open(path, 'rb') as f:
            return pickle.load(f)

    def _get_dict_nouns(self):
        """Return the unpickled noun dictionary."""
        return self._load_pickle(self._nouns_path)

    def _get_dict_adjs(self):
        """Return the unpickled adjective dictionary."""
        return self._load_pickle(self._adjs_path)
    
def test(texto):
    """Smoke test: print the stopword list generated for ``texto``."""
    generator = StopListGenerator()
    print(generator.generate(texto))
    
# Compare the module variable __name__, not the string literal '__name__':
# two different literals are never equal, so the smoke test never ran.
if __name__ == '__main__':
    test('Esto es una prueba.')

## Class to analyze text from corpus

In [None]:
#-*- coding: utf-8 -*-
# Analyze a text to give statistics
from __future__ import division

import string
import pattern
import nltk
from pattern.es import tag, tokenize
from pattern.vector import Document

def isNotPunct(word):
    """Return True unless ``word`` is empty or one punctuation character.

    Any token longer than one character counts as a word, even if it is
    made only of punctuation.
    """
    if len(word) > 1:
        return True
    return word not in string.punctuation

class TextAnalyzer:
    """Compute simple size statistics (words, sentences, lines) of a text.

    The three counts are computed once, in the constructor, and exposed as
    ``numwords``, ``numsents`` and ``numlines``.
    """

    def __init__(self, text):
        self.text = text
        self.numwords = self._calculate_num_words()
        self.numsents = self._calculate_num_sentences()
        self.numlines = self._calculate_num_lines()

    def __str__(self):
        """Return the three counts as a three-line summary (in Spanish)."""
        return str("Número de palabras: " + str(self.numwords) + "\n" +
                   "Número de sentencias: " + str(self.numsents) + "\n" +
                   "Número de líneas: " + str(self.numlines))

    def _words(self):
        """Return the tagged tokens of the text that pass ``isNotPunct``."""
        tokens = tag(' '.join(nltk.wordpunct_tokenize(self.text)))
        # A list comprehension (instead of filter) so len() also works where
        # filter returns an iterator.
        return [w for w, _ in tokens if isNotPunct(w)]

    def _calculate_num_words(self):
        """Return the number of non-punctuation tokens in the text."""
        return len(self._words())

    def _calculate_num_sentences(self):
        """Return the number of items ``pattern.es.tokenize`` yields."""
        return len(tokenize(self.text))

    def _calculate_num_lines(self):
        """Return the number of newline-separated lines in the text."""
        return len(self.text.split('\n'))

    # TODO
    def _calculate_num_vocab(self):
        """Return the number of distinct non-punctuation tokens.

        Tokens are compared as they appear in the text (case-sensitive).
        The tokens are recomputed here because no ``tokens`` attribute is
        ever stored on the instance.
        """
        return len(set(self._words()))

    # TODO
    def _calculate_ratio_vocab(self):
        """Placeholder: not implemented yet, always returns 0."""
        return 0

    # TODO
    def tfidf(self, top, stopwords):
        """Return the ``top`` keywords of the text via pattern.vector.Document.

        top       -- number of keywords requested from ``Document.keywords``.
        stopwords -- words passed to ``Document`` as ``exclude``.
        """
        d = Document(self.text, exclude=stopwords, threshold=1.0)
        return d.keywords(top)  # format [(weight, word),...]
     
                
def test(texto):
    """Smoke test: build a TextAnalyzer for ``texto`` and print its summary."""
    print(TextAnalyzer(texto))

# Compare the module variable __name__, not the string literal '__name__':
# two different literals are never equal, so the smoke test never ran.
if __name__ == '__main__':
    test('Esto es una prueba.')

## Implementation of RAKE (Rapid Automatic Keyword Extraction) algorithm

In [None]:
# Adapted from: http://sujitpal.blogspot.com.es/2013/03/implementing-rake-algorithm-with-nltk.html
# Adapted from: github.com/aneesha/RAKE/rake.py

from __future__ import division

import operator
import nltk
import string

def isPunct(word):
    """Return True only when ``word`` is exactly one punctuation character."""
    if len(word) != 1:
        return False
    return word in string.punctuation

def isNumeric(word):
    """Return True when ``word`` parses as a number.

    Words containing a dot are parsed with ``float``; all others with
    ``int``.
    """
    parse = float if '.' in word else int
    try:
        parse(word)
    except ValueError:
        return False
    return True

class RakeKeywordExtractor:
    """RAKE (Rapid Automatic Keyword Extraction) over NLTK tokenizers.

    Sentences are split into candidate phrases at stopwords and punctuation;
    each word is scored as degree / frequency and each phrase as the sum of
    its word scores.
    """

    def __init__(self, stopwords=None):
        """Store the stopwords (any iterable of words) as a set."""
        # None instead of a mutable [] default argument.
        self.stopwords = set(stopwords) if stopwords is not None else set()
        # extract() keeps the top 1/top_fraction of the ranked candidates;
        # 1 keeps all of them (3 would keep the top third).
        self.top_fraction = 1

    def _generate_candidate_keywords(self, sentences):
        """Return the candidate phrases, each one a list of lower-case words.

        A phrase is a maximal run of consecutive tokens of one sentence that
        are neither stopwords nor single punctuation characters.
        """
        phrase_list = []
        for sentence in sentences:
            phrase = []
            for word in nltk.wordpunct_tokenize(sentence.lower()):
                if word in self.stopwords or isPunct(word):
                    if phrase:
                        phrase_list.append(phrase)
                        phrase = []
                else:
                    phrase.append(word)
            # Flush the phrase that runs up to the end of the sentence;
            # without this, a sentence not ending in punctuation or a
            # stopword lost its last candidate.
            if phrase:
                phrase_list.append(phrase)
        return phrase_list

    def _calculate_word_scores(self, phrase_list):
        """Return {word: degree(word) / frequency(word)}.

        Within a phrase, every word's degree grows by the number of
        non-numeric words in that phrase minus one; the word's own frequency
        is added afterwards.  The division relies on the cell's
        ``from __future__ import division``.
        """
        word_freq = nltk.FreqDist()
        word_degree = nltk.FreqDist()
        for phrase in phrase_list:
            degree = len([w for w in phrase if not isNumeric(w)]) - 1
            for word in phrase:
                word_freq[word] += 1
                word_degree[word] += degree  # other words
        word_scores = {}
        for word in word_freq.keys():
            # deg(w) counts the word itself once per occurrence
            total_degree = word_degree[word] + word_freq[word]
            word_scores[word] = total_degree / word_freq[word]
        return word_scores

    def _calculate_phrase_scores(self, phrase_list, word_scores):
        """Return {"joined phrase": sum of the scores of its words}.

        Repeated phrases collapse into a single key.
        """
        phrase_scores = {}
        for phrase in phrase_list:
            phrase_scores[" ".join(phrase)] = sum(
                word_scores[word] for word in phrase)
        return phrase_scores

    def extract(self, text, incl_scores=False):
        """Return the candidate phrases of ``text``, best score first.

        text        -- raw text; split into sentences with nltk.sent_tokenize.
        incl_scores -- when True return (phrase, score) pairs, otherwise
                       only the phrase strings.
        """
        sentences = nltk.sent_tokenize(text)
        phrase_list = self._generate_candidate_keywords(sentences)
        word_scores = self._calculate_word_scores(phrase_list)
        phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores)
        # .items() works on both Python 2 and 3 (iteritems is 2-only)
        ranked = sorted(phrase_scores.items(),
                        key=operator.itemgetter(1), reverse=True)
        top = ranked[0:int(len(ranked) / self.top_fraction)]
        if incl_scores:
            return top
        return [phrase for phrase, _ in top]

## Get a static stopword list from a pickle file

In [None]:
# List 
import pickle

# Location of the pickled static stopword list (machine specific)
p = '/Users/jccan/Dropbox/projects/SpanishPoliticsDebate/data/pickle/stopwords_list.pickle'
#p = 'C:\Users\canofran\Dropbox\projects\SpanishPoliticsDebate\data\pickle\stopwords_list.pickle'
# NOTE: pickle can run arbitrary code while loading; only load trusted files.
# The with-block closes the file even if unpickling fails.
with open(p, 'rb') as f:
    stopwords = pickle.load(f)
# Decode every entry as UTF-8 (presumably the pickle holds Python 2 byte
# strings -- confirm) so they compare equal to the unicode literals below.
stopwords = [unicode(w,encoding='utf-8') for w in stopwords]
# Extra words added on top of the static list
stpw = [u'señor', u'señores', u'señora', u'señoras', u'gobierno', u'presidente', u'presidenta',
        u'gracias', u'aplausos', u'ministro',
        u'ministra', u'ministros', u'programa', u'num_exp', u'vicepresidenta', u'partido', 
        u'diputado', u'diputada', u'euros', u'ministerio',u'usted',u'ustedes',u'señoría', u'señorías', 
        u'parlamentario', u'parlamentarios', u'grupo', u'grupos',
        u'heredia', u'hernando',u'pérez', u'rubalcaba',u'saura', u'giménez',
        u'socialista', u'popular',u'año',u'años', u'cámara',u'montón',u'millón', u'millones',u'número',u'números',
        u'sector', u'sectores', u'rumores', u'tema']

stopwords.extend(stpw)

## Process of keyword extraction

In [None]:
import pymongo

# Create the connection to MongoDB
try:
    connection=pymongo.MongoClient()
    print "Connection to Mongo Daemon successful!!!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e
    # Obtenim la BD del Congrés
db = connection['congres']
print "Collections : ", db.collection_names()

## Start Update process

In [None]:
# Create the shared StopListGenerator object once: its constructor loads the
# pickled noun and adjective dictionaries, and getkeywords() reuses this
# instance for every text it processes.
SLG = StopListGenerator()

In [None]:
# UPDATE MONGODB

import datetime
from time import time

# Col.lecció de documents a la BD
# Start the timer and take the collection of documents in the DB
t0 = time()
doc_col = db['document']

# Date window (dd/mm/YYYY); both ends are included in the query
doc_start_date = "01/09/2000"
doc_end_date = "11/03/2015"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
d_doc_end_date = datetime.datetime.strptime(doc_end_date, "%d/%m/%Y")

# Documents dated inside the window, sorted by ascending date
date_window = {'$gte': d_doc_start_date, '$lte': d_doc_end_date}
result = doc_col.find({'date': date_window}).sort('date',1)

def getkeywords(texto):
    """Return the RAKE keywords of ``texto`` as (phrase, score) pairs.

    The text is cleaned with SLG.cleaning_text first; the RAKE stopwords are
    the static ``stopwords`` list plus the list SLG generates for this text.
    """
    cleaned = SLG.cleaning_text(texto)
    extractor = RakeKeywordExtractor(stopwords = stopwords + SLG.generate(cleaned))
    return extractor.extract(cleaned, incl_scores=True)

total_keywords = []
print "Number of documents retrieved %d" % result.count()
print '=' * 40

if result.count() > 0 :
    for i, doc in enumerate(result):
        print "Num doc: %d" %i 
        print "Description: ", doc['description']
        for dialogo in doc['session_dictionary']:
            #print dialogo['question']
            texto = dialogo['question']
            keywords = getkeywords(texto) # Get its keywords
            total_keywords.extend(keywords)
            #print "número de keywords encontrados: "  + str(len(keywords))
            kws=[]
            for kw in keywords:
                kws.append((kw[0],kw[1]))
            #print kws
            
            dialogo["keywords"] = kws
            #print "=" * 20
            #print dialogo
            #print "=" * 20
            for i, intervencion in enumerate(dialogo['intervention_dictionary']):
                #print "Intervenvión: " + str(i)
                #print "-" * 20
                texto = intervencion['text']
                keywords = getkeywords(texto)
                total_keywords.extend(keywords)
                #print "número de keywords encontrados: "  + str(len(keywords))
                kws=[]
                for kw in keywords:
                    if kw[1] > 1.0:
                        kws.append((kw[0],kw[1]))
                #print kws
                intervencion["keywords"] = kws
                #print intervencion
        print '*' * 20
        print "Actualizo documento"
        print '*' * 20
        doc_col.update({"_id":doc["_id"]}, {"$set": {"session_dictionary": doc['session_dictionary']}})
        
print("done in %fs" % (time() - t0))

## Update simulation

In [None]:
# No update mongodb
import pandas as pd
import datetime
import time

# Col.lecció de documents a la BD
# Collection of documents in the DB
doc_col = db['document']

# Start the timer
t0 = time.time()

# Date window (dd/mm/YYYY); both ends are included in the query
doc_start_date = "01/09/2000"
doc_end_date = "11/03/2015"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
d_doc_end_date = datetime.datetime.strptime(doc_end_date, "%d/%m/%Y")

# Documents dated inside the window, sorted by ascending date
date_window = {'$gte': d_doc_start_date, '$lte': d_doc_end_date}
result = doc_col.find({'date': date_window}).sort('date',1)

def getkeywords(texto):
    """Return the RAKE keywords of ``texto`` as (phrase, score) pairs.

    The text is cleaned with SLG.cleaning_text first; the RAKE stopwords are
    the static ``stopwords`` list plus the list SLG generates for this text.
    """
    cleaned = SLG.cleaning_text(texto)
    extractor = RakeKeywordExtractor(stopwords = stopwords + SLG.generate(cleaned))
    return extractor.extract(cleaned, incl_scores=True)

#keywords = []
count_doc = []
if result.count() > 0 :
    print "Documentos a tratar: %d" % result.count()
    for doc in result:
        print doc['description']
        for dialogo in doc['session_dictionary']:
            keywords2=[]
            #print dialogo.keys()
            #print dialogo['question']
            texto = dialogo['question']
            #keywords.extend(getkeywords(texto))
            keywords2.extend(getkeywords(texto))
            #print keywords
            for idx, intervencion in enumerate(dialogo['intervention_dictionary']):
                #print "Intervenvión: " + str(idx)
                #print "-" * 20
                #print intervencion.keys()
                #print intervencion["keywords"]
                texto = intervencion["text"]
                #keywords.extend(getkeywords(texto))
                keywords2.extend(getkeywords(texto))
                #print keywords
            count_doc.append([idx, keywords2])
        #print "="*80
        
#df = pd.DataFrame(keywords)
df2 = pd.DataFrame(count_doc, columns=["NumInterventions", "ListKws"])
# All keywords
df2.to_pickle('./files/kw_per_doc.pkl')

#df.to_csv('kws_sim.csv', encoding='utf-8')
print "*** Fin del proceso ***"
print("done in %fs" % (time.time() - t0))

In [None]:
# Display the per-dialogue table (columns "NumInterventions" and "ListKws")
df2

In [None]:
# Keyword list of the first dialogue.  df2 is built with the columns
# "NumInterventions" and "ListKws"; there is no 'kw' column (KeyError).
l = df2['ListKws'][0]

In [None]:
kw = l[0][0]
score = l[0][1]
print kw ,"->", score

In [None]:
# Check strange characters in the extracted keywords: show the rows whose
# column 0 contains no word made only of the listed plain/accented letters
letters_word = r'\b[a-zA-Z\xe1\xe9\xed\xf3\xfa\xfc\xc1\xc9\xcd\xd3\xda\xdc\xf1]+\b'
has_letters_word = df[0].str.contains(letters_word)
print(df[has_letters_word == False])

In [None]:
# Number of rows whose column 1 equals 1 (column 1 presumably holds the
# keyword score -- confirm which `df` is live when this cell runs)
len(df[df[1] == 1])

In [None]:
# Size of whatever `keywords` currently holds.  NOTE(review): several cells
# rebind this name, so the result depends on which cell ran last.
print len(keywords)

In [None]:
# Save the current `df` DataFrame to a pickle file
df.to_pickle('./files/allkeywords.pkl')

In [None]:
# Reload the DataFrame saved in './files/allkeywords.pkl'
allkw = pd.read_pickle('./files/allkeywords.pkl')

In [None]:
# Number of distinct column-0 values among the rows whose column 1 exceeds
# 4.0 (presumably keyword and score -- confirm against the saved frame)
len(set(allkw[allkw[1]>4.0][0].values.tolist()))

In [None]:
# Check keywords from documents

import datetime
import time
import re
import pandas as pd
from nltk import wordpunct_tokenize


# Col.lecció de documents a la BD
doc_col = db['document']

t0 = time.time()

doc_start_date = "01/09/2000"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
doc_end_date = "11/03/2015"
d_doc_end_date = datetime.datetime.strptime(doc_end_date, "%d/%m/%Y")

result = doc_col.find({'date': {'$gte': d_doc_start_date, '$lte': d_doc_end_date}}).sort('date',1)

keyws=[]
num_sesiones = 0
ndoc_general = 0
if result.count() > 0 :
    for doc in result:
        #print doc['description']
        ndoc_interno = 0
        for dialogo in doc['session_dictionary']:
            for ele in dialogo['keywords']:
                keyws.append((ele[0], ele[1]))                                                
            for intervencion in dialogo['intervention_dictionary']:
                for ele in intervencion['keywords']:
                    keyws.append((ele[0], ele[1])) 

df = pd.DataFrame(keyws)
df.to_csv('kws.csv', encoding='utf-8')

print "Fin del proceso"
print("done in %fs" % (time.time() - t0))

In [None]:
# Number of keyword entries read back from MongoDB (rows of `df`)
len(df)

## Check keywords

In [None]:
# Check keywords from documents

import datetime
import pandas as pd
import re
from nltk import wordpunct_tokenize


# Col.lecció de documents a la BD
doc_col = db['document']

doc_start_date = "01/09/2000"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
doc_end_date = "11/03/2015"
d_doc_end_date = datetime.datetime.strptime(doc_end_date, "%d/%m/%Y")

result = doc_col.find({'date': {'$gte': d_doc_start_date, '$lte': d_doc_end_date}}).sort('date',1)

keywords=[]
num_sesiones = 0
ndoc_general = 0
if result.count() > 0 :
    for doc in result:
        print doc['description']
        ndoc_interno = 0
        for dialogo in doc['session_dictionary']:
            texto = dialogo['question']
            nlines = 0
            tokens = wordpunct_tokenize(texto) 
            #          if token not in re.findall(r'[!"#$%&\\\'()*+\,\-\./:;<=>?@[\\\]^_`{|}~\xbf\xbf\xa1]+',texto)]
            #pprint (tokens)
            keywords.append((num_sesiones, ndoc_general, ndoc_interno, 0, len(dialogo['keywords']), 
                             len(tokens), nlines, doc['date']))
            ndoc_interno +=1
            for intervencion in dialogo['intervention_dictionary']:
                texto = intervencion["text"]
                nlines = 0
                tokens = wordpunct_tokenize(texto) 
                #      if token not in re.findall(r'[!"#$%&\\\'()*+\,\-\./:;<=>?@[\\\]^_`{|}~\xbf\xbf\xa1]+',texto)]
                #pprint (tokens)
                keywords.append((num_sesiones, ndoc_general, ndoc_interno, 1, len(intervencion['keywords']), 
                                 len(tokens), nlines ,doc['date']))
                ndoc_interno +=1
            ndoc_general +=1
        num_sesiones +=1

df = pd.DataFrame(keywords,columns=['ndiario', 'ndoc_g', 'ndoc_i', 'tipo','nkws','nwords', 'nlines', 'data'])

# uncomment if you want to save as a cvs file
# df.to_csv('docs_stat.csv')

print "Fin del proceso"