# Projecte CIS-Congres

Després dels processos de scraping del Congrés i del CIS, i del procés de clusterització, en aquest notebook tractem els resultats per generar dades per a un gràfic que visualitzarà aquests resultats: <b>Topic Evolution</b> (inspirat en http://bost.ocks.org/mike/nations/ ).

# Scraping

### Mètodes i utilitats

In [1]:
from urllib2 import Request, urlopen, URLError

# Mètode per obtenir una pàgina web
def get_html_page(url_page):
    req = Request(url_page)
    try:
        html_page = urlopen(req)
    except URLError as e:
        if hasattr(e, 'reason'):
            print 'We failed to reach a server. URL: ' + url_page
            print 'Reason: ', e.reason
        elif hasattr(e, 'code'):
            print 'The server couldn\'t fulfill the request. URL: ' + url_page
            print 'Error code: ', e.code
    else:
        # everything is fine
        return html_page

In [2]:
# http://stackoverflow.com/questions/7100125/storing-python-dictionaries

import json
from bson import json_util
import yaml

# http://api.mongodb.org/python/1.10.1/api/bson/json_util.html
# Mètodes per grabar y carregar fitxers en format json
def save_dict_json(dict, filename):
    """Serialize ``dict`` as JSON into the file ``filename``.

    Objects the standard encoder cannot handle are passed to
    ``json_util.default``. The file is opened in binary write mode.
    """
    # NOTE: the parameter shadows the builtin ``dict``; the name is kept so
    # that keyword callers keep working.
    with open(filename, 'wb') as out_file:
        json.dump(dict, out_file, default=json_util.default)

def load_dict_json(filename):
    """Read the JSON file ``filename`` and return the decoded object.

    Every decoded JSON object is post-processed by ``json_util.object_hook``.
    """
    with open(filename, 'rb') as in_file:
        decoded = json.load(in_file, object_hook=json_util.object_hook)
    return decoded

# http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
def load_dict_yaml(filename):
    """Parse the file ``filename`` with ``yaml.load`` and return the result."""
    # NOTE(review): ``yaml.load`` without an explicit Loader can construct
    # arbitrary Python objects; only use this on trusted files, or consider
    # ``yaml.safe_load`` - confirm against the installed PyYAML version.
    with open(filename, 'rb') as in_file:
        parsed = yaml.load(in_file)
    return parsed
    
# Save a text file.
def save_text_file(text, file_name, encoding = 'utf-8'):
    """Write ``text`` to ``file_name``, truncating any existing file.

    When ``encoding`` is truthy the text is converted with
    ``text.encode(encoding)`` before being written; otherwise it is written
    unchanged.
    """
    with open(file_name, "w") as out_file:
        payload = text.encode(encoding) if encoding else text
        out_file.write(payload)
            
# Save a list as a text file, one item per line.
def save_list_text_file(text_list, file_name):
    """Write every item of ``text_list`` to ``file_name`` followed by ``\\n``.

    Items are rendered with ``%s`` formatting. Each item is wrapped in a
    one-element tuple first, so that an item which is itself a tuple is
    written as a single value instead of being unpacked as format arguments.
    """
    with open(file_name, "w") as text_file:
        for item in text_list:
            text_file.write("%s\n" % (item,))
        
# Load a text file.
def load_text_file(file_name, encoding = 'utf-8'):
    """Return the whole content of ``file_name``.

    When ``encoding`` is truthy the content read is converted with
    ``.decode(encoding)``; otherwise it is returned exactly as read.
    """
    with open(file_name, "r") as in_file:
        content = in_file.read()
        return content.decode(encoding) if encoding else content
        
# Remove characters that are not valid in file names
# (https://pypi.python.org/pypi/goldfinch/0.4 may be a better alternative).
# The backslash is escaped explicitly; the value is still the nine
# characters \ / : * ? " < > |
invalid_filename_chars_windows = '\\/:*?"<>|'
def del_invalid_chars(value, deletechars):
    """Return ``value`` with every character of ``deletechars`` removed."""
    for c in deletechars:
        value = value.replace(c, '')
    return value

import datetime

# Convert a date string from one format to another.
def format_date(strFecha, format_1, format_2):
    """Parse ``strFecha`` using ``format_1`` and render it using ``format_2``.

    Both formats are ``strptime``/``strftime`` patterns; a string that does
    not match ``format_1`` makes ``strptime`` raise ``ValueError``.
    """
    return datetime.datetime.strptime(strFecha, format_1).strftime(format_2)

In [3]:
def get_text_between( s, first, last, include_limits = False, first_ocurrences = True ):
    """Return the part of ``s`` enclosed by the markers ``first`` and ``last``.

    first_ocurrences=True:  widest match - from the first ``first`` up to the
                            right-most ``last`` found after it.
    first_ocurrences=False: innermost match - from the right-most ``first``
                            up to the first ``last`` that follows it.
    include_limits:         when True the markers are part of the result.

    Returns "" when either marker cannot be found.
    """
    try:
        anchor = s.index( first ) if first_ocurrences else s.rindex( first )
        start = anchor if include_limits else anchor + len( first )
        if first_ocurrences:
            end = s.rindex( last, start )
        else:
            # Search from ``start``: an unanchored search could find a
            # ``last`` located before the chosen ``first`` and yield an
            # empty or inverted slice.
            end = s.index( last, start )
        if include_limits:
            end += len( last )
        return s[start:end]
    except ValueError:
        return ""
    
def remove_text_between( s, first, last, include_limits = False ):
    """Return ``s`` without the span delimited by the first ``first`` marker
    and the next ``last`` marker.

    include_limits=True keeps both markers in the result and removes only
    the text between them; include_limits=False removes the markers too.
    """
    # NOTE(review): when a marker is missing the result is "" rather than
    # ``s`` unchanged - confirm callers rely on that before changing it.
    try:
        opening = s.index( first )
        if include_limits:
            cut_from = opening + len( first )
            cut_to = s.index( last, cut_from )
        else:
            cut_from = opening
            cut_to = s.index( last, cut_from ) + len( last )
    except ValueError:
        return ""
    return s[:cut_from] + s[cut_to:]
    
def remove_spaces_and_newline(s):
    """Flatten ``s`` onto one line.

    Newline characters are deleted outright (adjacent lines are joined with
    no separator), then every remaining run of whitespace is collapsed into
    a single space. Leading/trailing whitespace is collapsed, not stripped.
    """
    # Imported here because no cell visible before this helper imports ``re``;
    # without it the call fails with NameError.
    import re
    return re.sub(r'\s+', ' ', s.replace('\n',''))

In [4]:
# http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python/22776#22776
# http://blog.radevic.com/2012/07/python-download-url-to-file-with.html

import urllib2
import sys

# Download a file from a URL.
def download_file(url, file_name, path = "", verbose = False):
    """Stream the resource at ``url`` into the local file ``path + file_name``.

    ``path`` is prepended verbatim, so it must already end with a path
    separator when it names a directory. With ``verbose`` a progress line
    is rewritten on stdout after every block. Both the HTTP response and
    the output file are closed even if the transfer fails midway.
    """
    response = urllib2.urlopen(url)
    try:
        # The server may omit Content-Length; the total size is then unknown
        # (None) and only the byte counter is shown, without a percentage.
        length_headers = response.info().getheaders("Content-Length")
        file_size = int(length_headers[0]) if length_headers else None
        if verbose:
            print("Downloading: {0} Bytes: {1}".format(url, file_size))

        file_size_dl = 0
        block_sz = 8192
        with open(path+file_name, 'wb') as out_file:
            while True:
                chunk = response.read(block_sz)
                if not chunk:
                    break

                file_size_dl += len(chunk)
                out_file.write(chunk)

                if verbose:
                    if file_size:
                        p = float(file_size_dl) / file_size
                        status = "\r{0} bytes  [{1:.2%}]".format(file_size_dl, p)
                    else:
                        # Unknown or zero total: avoid dividing by it.
                        status = "\r{0} bytes".format(file_size_dl)
                    # Backspaces move the cursor back so the next status
                    # overwrites this one.
                    status = status + chr(8)*(len(status)+1)
                    sys.stdout.write(status)
                    sys.stdout.flush()
    finally:
        response.close()
    print("\n")

# url = 'http://www.pp.es/sites/default/files/documentos/pr_den_2015.pdf'
# file_name = url.split('/')[-1]    
# download_file(url, file_name,'../data/pp/')

In [5]:
# http://stackoverflow.com/questions/26494211/extracting-text-from-a-pdf-file-using-pdfminer-in-python

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

def convert_pdf_from_stream_to_txt(gridFS_result, codec = 'utf-8'):
    """Extract the text of a PDF with pdfminer and return it decoded.

    ``gridFS_result`` is handed directly to ``PDFPage.get_pages`` as the
    input file object (presumably a GridFS read handle, going by the name -
    confirm against the caller). The converter writes ``codec``-encoded
    text into an in-memory buffer, which is decoded with the same ``codec``
    before being returned.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, codec=codec, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    # Empty page-number set, maxpages=0 and empty password are passed as in
    # the original call (presumably "no restriction" - confirm in pdfminer).
    all_pages = PDFPage.get_pages(gridFS_result, set(), maxpages=0, password="",
                                  caching=True, check_extractable=True)
    for pdf_page in all_pages:
        page_interpreter.process_page(pdf_page)

    raw_text = text_buffer.getvalue()

    # The input stream is deliberately left open; only our own objects are closed.
    converter.close()
    text_buffer.close()

    # http://stackoverflow.com/questions/25315566/unicodedecodeerror-in-nltks-word-tokenize-despite-i-forced-the-encoding
    return raw_text.decode(codec)

#### Obrir connexió a la BD 'CIS'

In [6]:
import pymongo

# Create the connection to MongoDB.
# MongoClient() is called without arguments, so the driver's defaults apply.
# NOTE(review): if ConnectionFailure is raised only a message is printed and
# `connection` stays undefined, so the database lookup below would then fail
# with NameError.
try:
    connection=pymongo.MongoClient()
    print "Connection to Mongo Daemon successful!!!"
except pymongo.errors.ConnectionFailure, e:
    print "Could not connect to MongoDB: %s" % e
    
# Get a handle to the 'CIS' database
db = connection['CIS']
# List the collections that database currently holds
print "Collections:", db.collection_names()

Connection to Mongo Daemon successful!!!
Collections: [u'system.indexes', u'barometro_docs', u'barometro_docs.chunks', u'barometro_docs.files', u'barometro_topics']


Check the database status http://127.0.0.1:28017/

#### Tancar la connexió a la BD

In [49]:
# Close the MongoDB client held in `connection`.
connection.close()

[24/04/2015]

A la reunió d'avui hem vist que a la següent pàgina del CIS hi ha una taula de tòpics per anys i mesos. Aquesta taula ens estalvia l'scraping anterior.

In [13]:
%%html
http://www.cis.es/cis/export/sites/default/-Archivos/Indicadores/documentos_html/TresProblemas.html

Unnamed: 0,mar15,feb15,ene15,dic14,nov14,oct14,sep14,jul14,jun14,may14,abr14,mar14,feb14,ene14,dic13,nov13,oct13,sep13,jul13,jun13,may13,abr13,mar13,feb13,ene13,dic12,nov12,oct12,sep12,jul12,jun12,may12,abr12,mar12,feb12,ene12,dic11,nov11,oct11,sep11,jul11,jun11,may11,abr11,mar11,feb11,ene11,dic10,nov10,oct10,sep10,jul10,jun10,may10,abr10,mar10,feb10,ene10,dic09,nov09,oct09,sep09,jul09,jun09,may09,abr09,mar09,feb09,ene09,dic08,nov08,oct08,sep08,jul08,jun08,may08,abr08,mar08,feb08,ene08,dic07,nov07,oct07,sep07,jul07,jun07,may07,abr07,mar07,feb07,ene07,dic06,nov06,oct06,sep06,jul06,jun06,may06,abr06,mar06,feb06,ene06,dic05,nov05,oct05,sep05,jul05,jun05,may05,abr05,mar05,feb05,ene05,dic04,nov04,oct04,sep04,jul04,jun04,may04,abr04,mar04,feb04,ene04,dic03,nov03,oct03,sep03,jul03,jun03,may03,abr03,mar03,feb03,ene03,dic02,nov02,oct02,sep02,jul02,jun02,may02,abr02,mar02,feb02,ene02,dic01,nov01,sep01,jul01,jun01,may01,abr01,mar01,feb01,ene01,dic00,nov00,oct00,sep00,mar99,jul98,oct97,abr97,mar97,nov96,mar96,feb96,nov95,sep95,abr95,feb95,ene95,dic94,sep94,sep93,jun88,dic86,sep86,oct85,may85
El paro,80.3,78.6,79.4,75.5,77.0,76.0,75.3,77.0,76.8,80.8,80.3,82.3,81.1,78.5,77.0,77.7,77.4,77.3,80.9,80.5,82.4,80.7,81.6,79.9,81.1,77.1,80.8,77.9,79.3,78.6,77.8,81.7,81.7,84.0,84.0,83.3,82.0,83.0,81.0,80.4,81.3,82.6,84.1,82.8,81.8,83.9,82.4,78.6,79.5,81.1,78.4,78.0,75.9,79.9,79.7,82.9,81.8,82.7,79.0,78.4,73.0,76.4,74.3,73.7,77.3,75.7,76.1,75.2,75.3,72.5,71.5,64.9,62.2,56.1,53.8,52.5,52.0,45.8,44.7,43.8,38.6,40.0,37.4,35.0,36.5,38.5,42.2,37.5,38.1,40.5,37.6,42.1,40.1,40.7,42.1,47.0,43.2,46.3,49.8,52.7,49.0,49.8,46.6,51.5,49.0,53.0,52.0,54.9,53.1,54.5,57.7,56.8,58.2,53.9,57.3,57.1,58.3,56.9,59.3,59.0,56.8,51.7,61.0,60.4,57.9,60.2,57.8,58.4,61.2,56.5,63.6,63.4,56.8,60.0,58.8,61.1,64.3,58.7,60.2,63.0,63.1,65.3,65.7,60.6,58.7,61.9,62.9,57.6,61.0,59.4,59.6,59.5,63.4,57.1,63.3,56.0,59.9,63.1,56.9,59.8,75.9,81.0,85.3,87.6,81.3,83.1,78.7,86.0,77.6,76.3,79.9,76.6,84.9,82.6,80.0,89.6,91.0,92.0,94.1,93.1,94.4
Las drogas,0.3,0.3,0.1,0.2,0.3,0.3,0.3,0.2,0.2,0.1,0.3,0.1,0.2,0.2,0.5,0.1,0.2,0.4,0.3,0.3,0.2,.,0.1,0.3,0.2,0.5,0.3,0.4,0.4,0.2,0.4,0.5,0.4,0.5,0.6,0.6,0.6,0.4,0.6,0.8,0.5,0.7,0.9,0.9,0.6,0.8,0.6,1.4,1.1,1.3,1.0,0.7,0.6,1.4,1.1,0.8,1.0,1.1,1.9,1.2,1.5,1.5,1.7,2.1,1.6,1.7,1.5,1.3,1.5,1.6,2.3,2.4,1.9,2.3,2.0,1.8,1.6,2.9,3.0,2.5,3.5,4.1,4.1,4.1,4.8,5.3,4.8,4.7,3.9,5.5,3.7,6.3,5.2,6.0,4.8,5.8,6.1,5.4,4.8,6.0,6.6,5.4,6.5,5.6,7.5,6.4,6.7,6.5,6.5,6.8,8.0,6.5,8.5,10.0,10.6,9.0,9.5,9.2,9.1,7.2,5.9,5.7,8.9,10.0,10.9,12.0,9.9,12.0,13.0,14.3,9.2,9.1,7.2,9.1,9.8,9.4,9.9,11.8,11.3,13.1,11.3,15.8,17.4,23.2,15.6,14.5,15.4,14.8,14.3,16.1,19.4,18.0,17.5,13.3,14.8,12.3,15.2,14.9,14.6,14.7,25.3,27.5,29.2,28.9,23.5,26.5,19.4,19.2,16.0,16.4,17.6,24.5,18.7,28.7,25.9,35.8,49.0,26.3,10.5,19.3,17.5
La inseguridad ciudadana,2.2,2.2,2.6,3.4,1.7,2.9,3.2,2.5,1.6,2.8,3.1,2.0,2.3,2.3,3.4,1.7,2.6,2.8,2.8,3.0,2.6,2.5,2.8,2.3,2.9,2.8,2.5,3.1,3.1,4.0,3.9,4.5,5.4,6.3,6.6,5.7,5.6,5.8,6.4,7.4,6.7,7.7,7.9,8.0,6.8,9.4,7.3,8.1,8.3,7.3,7.5,8.1,7.3,9.7,9.3,9.2,8.4,8.1,11.8,11.4,9.9,12.8,10.3,12.6,11.4,11.1,13.1,11.0,11.7,11.1,11.4,10.2,11.5,10.0,10.7,12.3,12.2,14.9,18.5,16.4,14.4,14.5,15.7,12.9,13.5,13.6,14.6,16.7,14.9,15.9,18.9,21.1,20.5,16.1,16.0,21.0,24.3,26.3,17.7,16.6,19.1,23.3,21.5,14.4,13.4,12.1,14.6,15.1,11.3,14.2,16.0,16.3,16.7,17.7,17.9,16.6,16.5,18.7,18.1,17.3,15.6,15.7,21.2,23.6,21.4,23.0,27.5,25.4,23.4,29.1,26.1,23.2,18.5,18.5,25.5,17.5,22.0,24.8,22.0,19.1,16.8,22.4,17.7,19.1,21.8,18.1,14.2,14.9,8.3,9.4,13.7,8.9,9.5,9.3,8.0,8.4,9.4,9.2,9.5,10.4,15.8,13.0,14.6,19.5,11.8,15.4,28.6,13.1,14.0,11.1,8.8,12.7,10.3,16.5,10.8,17.3,36.3,30.0,30.9,28.0,32.9
El terrorismo. ETA,0.2,0.8,1.1,0.5,0.1,0.4,0.6,0.5,0.5,0.4,0.7,0.9,1.2,1.5,0.5,0.9,0.3,0.6,0.6,0.4,0.5,0.6,0.4,0.4,0.3,0.2,0.6,0.5,1.1,1.0,0.9,1.2,1.0,1.3,1.7,1.2,2.5,3.7,3.3,3.7,4.8,5.3,9.1,5.7,5.7,5.6,7.4,6.0,9.9,6.9,9.0,5.5,7.0,9.2,12.3,11.1,12.5,17.6,12.9,13.1,12.6,18.3,19.1,13.6,15.5,16.0,19.4,20.6,22.2,28.9,21.8,22.5,18.6,20.3,22.9,31.4,31.4,36.9,31.4,35.1,39.6,29.1,31.7,35.4,44.1,41.7,32.3,36.5,38.6,42.7,45.3,27.8,25.4,18.9,18.0,26.9,22.8,18.5,24.9,28.0,35.0,24.9,26.6,24.1,23.9,34.2,50.4,41.7,39.5,34.0,40.7,46.1,53.3,55.1,42.1,46.6,41.3,44.1,45.4,48.1,63.0,73.4,40.1,34.8,39.7,38.4,38.7,42.9,39.5,43.3,44.5,41.5,37.0,45.0,48.7,43.9,53.3,53.5,64.0,50.2,49.0,51.0,54.2,54.1,55.9,59.5,62.3,65.7,70.6,73.5,56.7,70.9,63.1,70.3,61.8,65.9,77.0,80.1,67.9,65.5,25.8,49.7,33.4,37.2,32.6,20.6,36.4,42.8,17.1,12.2,34.8,10.8,11.3,7.5,13.0,12.6,30.1,38.0,66.5,33.1,37.6
Las infraestructuras,0.2,0.1,0.2,0.1,0.1,0.0,0.0,0.1,0.2,0.0,0.0,0.2,.,0.1,0.1,0.0,.,.,.,0.1,.,.,0.2,0.2,0.4,0.2,0.1,0.5,0.1,0.2,0.4,0.2,0.3,0.3,0.2,0.4,0.2,0.2,0.1,0.4,0.6,0.5,0.4,0.4,0.5,0.4,0.6,0.6,0.2,0.6,0.3,0.6,0.3,0.5,0.5,0.8,0.6,0.6,0.6,0.4,0.5,0.7,0.7,0.4,0.5,0.4,0.6,0.7,0.8,0.8,0.3,0.7,0.8,0.7,1.1,0.7,0.7,0.7,0.7,1.3,1.1,2.0,2.3,1.0,1.1,1.6,1.1,1.5,0.8,0.5,0.6,0.8,0.9,1.6,0.9,1.7,1.0,1.0,1.2,0.6,0.8,0.7,1.2,0.5,0.7,0.8,1.5,0.8,1.1,1.2,0.9,2.2,1.2,1.1,0.9,1.8,1.3,2.4,0.8,0.8,1.0,0.8,1.0,2.5,1.7,1.5,1.4,1.3,1.5,1.9,1.6,1.0,0.8,1.0,1.0,0.8,1.2,1.0,0.8,1.5,0.7,0.9,1.0,1.2,0.8,1.0,0.8,1.0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.7,1.8,0.9,0.3,1.2,0.4
La sanidad,11.8,12.1,10.2,10.6,8.6,11.8,11.5,11.3,11.5,10.4,10.4,10.2,10.8,11.6,10.9,13.0,13.4,10.6,10.3,9.3,10.0,7.9,10.1,10.5,12.1,12.9,11.4,10.1,9.1,10.4,8.6,10.6,8.6,9.3,7.1,8.1,9.3,7.7,7.8,6.1,4.3,4.4,4.1,3.7,3.4,2.9,3.6,3.0,3.0,3.9,3.4,4.0,3.2,3.3,3.2,4.1,4.3,3.7,4.8,5.6,4.8,4.9,4.1,3.8,4.1,4.8,5.3,4.7,4.5,2.9,3.4,4.4,3.6,3.8,4.1,3.6,4.8,4.6,4.4,5.5,4.2,3.7,4.0,3.9,4.6,4.1,5.6,5.0,4.0,4.1,4.2,4.5,5.4,3.9,3.7,4.1,5.0,4.1,4.8,4.8,4.0,5.2,5.8,5.5,6.8,6.4,4.2,4.5,6.0,5.6,5.7,6.5,6.5,6.2,5.8,5.0,4.6,6.2,5.0,4.7,5.3,3.0,6.3,5.6,5.0,6.0,4.7,4.3,3.5,3.6,5.4,5.8,3.9,3.0,3.5,2.9,3.8,3.1,2.7,3.9,3.3,3.4,4.2,3.4,3.6,4.0,4.4,6.2,3.9,3.9,3.9,3.8,3.9,3.2,3.1,4.3,3.2,4.7,5.0,3.9,.,.,.,.,.,.,.,.,.,.,.,.,.,2.7,.,7.5,.,.,.,.,.
La vivienda,1.9,1.1,1.5,1.0,0.9,1.3,1.6,1.6,1.6,1.3,1.7,1.6,1.6,1.5,0.9,2.0,1.4,2.3,1.7,1.9,2.8,2.5,2.0,1.7,3.1,3.9,3.9,2.7,1.8,3.4,3.6,3.5,5.2,4.3,5.7,5.6,5.9,6.0,5.1,4.9,6.7,7.2,4.0,5.5,5.1,4.0,4.4,5.2,6.0,5.3,6.1,6.7,5.3,7.0,6.5,7.9,7.1,9.4,8.5,10.3,7.6,10.1,10.5,11.6,13.0,13.1,11.3,11.6,13.5,15.5,16.6,18.3,18.4,21.5,21.5,24.5,25.6,28.8,28.9,29.1,28.8,32.9,34.9,37.3,36.7,30.8,34.9,32.5,27.6,27.0,29.8,29.8,25.1,30.3,20.9,26.5,24.6,25.9,24.8,18.9,17.7,18.2,16.8,20.5,19.4,23.3,21.2,21.4,22.0,25.5,19.1,20.0,19.6,15.6,20.5,21.9,18.1,18.4,20.0,22.0,22.1,19.4,20.3,18.3,17.3,17.1,16.9,10.9,13.9,12.2,15.3,12.8,7.2,7.3,8.9,7.3,9.6,10.2,3.6,5.2,3.4,3.1,4.9,3.5,2.7,3.8,3.7,3.8,2.3,3.5,4.4,2.7,2.9,2.0,2.2,2.8,2.9,2.5,4.2,2.8,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.6,5.4,1.7,1.9,3.2,2.2,2.1
Los problemas de índole económica,24.9,24.9,24.5,24.9,25.5,27.0,28.8,28.0,28.4,28.6,28.0,28.2,28.3,30.5,29.7,31.1,32.7,32.5,32.0,32.2,34.9,35.5,34.4,35.4,38.9,39.5,42.7,43.4,49.4,46.5,46.3,51.0,52.8,49.2,52.3,53.7,50.4,48.2,51.3,49.6,49.6,47.0,46.6,47.3,51.3,51.6,53.1,52.0,47.9,47.8,48.2,51.3,53.0,50.9,46.8,45.3,47.8,47.0,47.0,45.9,46.8,48.4,48.5,48.3,48.7,54.1,52.0,54.4,51.9,54.5,57.9,54.7,54.2,59.9,58.3,51.9,48.4,39.4,36.4,39.2,32.9,29.4,24.5,21.8,16.5,15.5,15.5,18.0,15.7,15.7,18.3,15.5,15.6,17.1,15.6,19.0,19.4,17.3,19.7,16.0,15.0,17.2,16.7,17.1,21.5,16.4,17.6,17.4,17.3,15.4,11.7,9.9,11.3,11.2,12.0,14.3,11.5,12.1,11.1,11.9,9.9,7.9,10.6,12.5,11.7,13.1,12.7,10.7,10.5,9.9,8.4,11.1,9.7,9.2,11.0,10.9,12.8,10.4,9.4,8.8,9.2,8.5,8.7,7.6,8.5,10.0,10.5,11.8,10.3,9.5,9.0,7.6,9.2,5.5,6.4,8.2,11.7,12.9,15.1,14.2,8.7,9.6,12.8,10.8,11.1,13.1,20.5,11.7,17.5,20.8,23.3,29.6,26.5,24.3,24.6,35.8,10.1,19.8,23.1,27.3,28.0
Los problemas relacionados con la calidad del empleo,3.4,3.9,3.7,4.9,2.4,4.8,5.0,2.5,4.7,3.0,2.2,2.3,2.3,2.3,1.8,1.3,2.0,2.0,1.5,1.8,1.3,1.4,1.1,1.1,1.6,1.8,1.4,2.6,2.3,2.3,2.7,2.1,2.2,3.6,3.3,2.2,2.5,2.3,2.3,2.7,2.0,3.0,2.3,3.3,2.8,3.1,2.7,1.9,2.3,3.4,4.0,3.2,3.4,2.9,2.4,3.1,3.0,3.2,2.6,2.8,2.7,2.9,2.9,5.0,3.0,2.9,3.1,3.3,3.9,2.7,4.0,5.4,6.4,8.4,8.6,8.7,8.2,9.4,11.0,12.3,12.3,12.2,13.7,13.5,12.7,12.4,9.5,14.5,9.0,9.8,9.8,9.7,9.9,10.7,7.8,8.8,9.3,7.2,10.0,9.3,7.3,8.4,7.9,7.9,8.0,8.1,6.0,7.5,7.1,7.1,3.6,3.4,4.9,3.2,3.7,3.6,2.7,4.6,4.4,3.8,5.0,3.8,4.6,4.0,3.5,3.7,3.4,2.5,3.6,2.6,3.0,2.6,2.8,1.6,2.5,2.1,2.7,2.7,2.5,3.1,4.0,2.8,2.6,1.8,2.0,1.6,1.1,1.4,2.1,1.8,2.7,1.8,2.5,1.9,1.8,2.9,2.3,1.4,1.4,1.4,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.1,0.8,0.8,1.8,0.7
"Los problemas de la agricultura, ganadería y pesca",0.1,0.2,0.1,0.3,0.1,0.1,0.2,0.4,0.1,0.0,0.1,0.1,0.1,0.2,0.1,0.2,0.2,0.1,0.5,0.2,0.2,0.4,0.1,0.2,0.4,0.4,0.2,0.0,0.1,0.1,0.3,0.2,0.4,0.5,0.3,0.2,0.4,0.3,0.4,0.2,0.3,0.8,0.4,0.4,0.4,0.4,0.2,0.5,0.2,0.2,0.6,0.3,0.2,0.7,0.2,0.4,0.4,0.8,0.8,0.7,0.6,0.6,0.3,0.7,0.6,0.5,0.2,0.3,0.3,0.3,0.3,0.2,0.1,0.3,0.9,1.0,0.7,0.6,0.2,0.3,0.3,0.3,0.4,0.4,0.6,0.4,0.6,0.4,0.5,0.5,0.4,0.3,0.5,0.9,0.3,0.3,0.3,0.5,0.6,1.0,0.5,0.4,0.6,0.7,0.8,0.6,0.8,0.8,1.0,0.8,1.0,1.4,0.8,1.0,0.8,1.0,0.6,0.9,0.8,0.7,0.5,0.3,0.2,0.4,0.5,0.6,0.4,0.7,0.8,0.6,0.4,0.6,0.6,0.7,1.1,0.6,0.4,0.5,0.6,1.0,0.6,0.3,0.4,0.5,0.5,0.5,0.3,0.4,0.6,0.8,0.4,0.6,0.6,0.2,0.2,0.7,0.7,0.7,0.5,1.6,2.0,.,.,.,.,.,1.1,1.2,5.7,1.6,4.3,1.0,5.0,.,.,3.8,3.2,1.8,1.9,6.2,3.4

Unnamed: 0,mar15,feb15,ene15,dic14,nov14,oct14,sep14,jul14,jun14,may14,abr14,mar14,feb14,ene14,dic13,nov13,oct13,sep13,jul13,jun13,may13,abr13,mar13,feb13,ene13,dic12,nov12,oct12,sep12,jul12,jun12,may12,abr12,mar12,feb12,ene12,dic11,nov11,oct11,sep11,jul11,jun11,may11,abr11,mar11,feb11,ene11,dic10,nov10,oct10,sep10,jul10,jun10,may10,abr10,mar10,feb10,ene10,dic09,nov09,oct09,sep09,jul09,jun09,may09,abr09,mar09,feb09,ene09,dic08,nov08,oct08,sep08,jul08,jun08,may08,abr08,mar08,feb08,ene08,dic07,nov07,oct07,sep07,jul07,jun07,may07,abr07,mar07,feb07,ene07,dic06,nov06,oct06,sep06,jul06,jun06,may06,abr06,mar06,feb06,ene06,dic05,nov05,oct05,sep05,jul05,jun05,may05,abr05,mar05,feb05,ene05,dic04,nov04,oct04,sep04,jul04,jun04,may04,abr04,mar04,feb04,ene04,dic03,nov03,oct03,sep03,jul03,jun03,may03,abr03,mar03,feb03,ene03,dic02,nov02,oct02,sep02,jul02,jun02,may02,abr02,mar02,feb02,ene02,dic01,nov01,sep01,jul01,jun01,may01,abr01,mar01,feb01,ene01,dic00,nov00,oct00,sep00,mar99,jul98,oct97,abr97,mar97,nov96,mar96,feb96,nov95,sep95,abr95,feb95,ene95,dic94,sep94,sep93,jun88,dic86,sep86,oct85,may85
El paro,80.3,78.6,79.4,75.5,77.0,76.0,75.3,77.0,76.8,80.8,80.3,82.3,81.1,78.5,77.0,77.7,77.4,77.3,80.9,80.5,82.4,80.7,81.6,79.9,81.1,77.1,80.8,77.9,79.3,78.6,77.8,81.7,81.7,84.0,84.0,83.3,82.0,83.0,81.0,80.4,81.3,82.6,84.1,82.8,81.8,83.9,82.4,78.6,79.5,81.1,78.4,78.0,75.9,79.9,79.7,82.9,81.8,82.7,79.0,78.4,73.0,76.4,74.3,73.7,77.3,75.7,76.1,75.2,75.3,72.5,71.5,64.9,62.2,56.1,53.8,52.5,52.0,45.8,44.7,43.8,38.6,40.0,37.4,35.0,36.5,38.5,42.2,37.5,38.1,40.5,37.6,42.1,40.1,40.7,42.1,47.0,43.2,46.3,49.8,52.7,49.0,49.8,46.6,51.5,49.0,53.0,52.0,54.9,53.1,54.5,57.7,56.8,58.2,53.9,57.3,57.1,58.3,56.9,59.3,59.0,56.8,51.7,61.0,60.4,57.9,60.2,57.8,58.4,61.2,56.5,63.6,63.4,56.8,60.0,58.8,61.1,64.3,58.7,60.2,63.0,63.1,65.3,65.7,60.6,58.7,61.9,62.9,57.6,61.0,59.4,59.6,59.5,63.4,57.1,63.3,56.0,59.9,63.1,56.9,59.8,75.9,81.0,85.3,87.6,81.3,83.1,78.7,86.0,77.6,76.3,79.9,76.6,84.9,82.6,80.0,89.6,91.0,92.0,94.1,93.1,94.4
Las drogas,0.3,0.3,0.1,0.2,0.3,0.3,0.3,0.2,0.2,0.1,0.3,0.1,0.2,0.2,0.5,0.1,0.2,0.4,0.3,0.3,0.2,.,0.1,0.3,0.2,0.5,0.3,0.4,0.4,0.2,0.4,0.5,0.4,0.5,0.6,0.6,0.6,0.4,0.6,0.8,0.5,0.7,0.9,0.9,0.6,0.8,0.6,1.4,1.1,1.3,1.0,0.7,0.6,1.4,1.1,0.8,1.0,1.1,1.9,1.2,1.5,1.5,1.7,2.1,1.6,1.7,1.5,1.3,1.5,1.6,2.3,2.4,1.9,2.3,2.0,1.8,1.6,2.9,3.0,2.5,3.5,4.1,4.1,4.1,4.8,5.3,4.8,4.7,3.9,5.5,3.7,6.3,5.2,6.0,4.8,5.8,6.1,5.4,4.8,6.0,6.6,5.4,6.5,5.6,7.5,6.4,6.7,6.5,6.5,6.8,8.0,6.5,8.5,10.0,10.6,9.0,9.5,9.2,9.1,7.2,5.9,5.7,8.9,10.0,10.9,12.0,9.9,12.0,13.0,14.3,9.2,9.1,7.2,9.1,9.8,9.4,9.9,11.8,11.3,13.1,11.3,15.8,17.4,23.2,15.6,14.5,15.4,14.8,14.3,16.1,19.4,18.0,17.5,13.3,14.8,12.3,15.2,14.9,14.6,14.7,25.3,27.5,29.2,28.9,23.5,26.5,19.4,19.2,16.0,16.4,17.6,24.5,18.7,28.7,25.9,35.8,49.0,26.3,10.5,19.3,17.5
La inseguridad ciudadana,2.2,2.2,2.6,3.4,1.7,2.9,3.2,2.5,1.6,2.8,3.1,2.0,2.3,2.3,3.4,1.7,2.6,2.8,2.8,3.0,2.6,2.5,2.8,2.3,2.9,2.8,2.5,3.1,3.1,4.0,3.9,4.5,5.4,6.3,6.6,5.7,5.6,5.8,6.4,7.4,6.7,7.7,7.9,8.0,6.8,9.4,7.3,8.1,8.3,7.3,7.5,8.1,7.3,9.7,9.3,9.2,8.4,8.1,11.8,11.4,9.9,12.8,10.3,12.6,11.4,11.1,13.1,11.0,11.7,11.1,11.4,10.2,11.5,10.0,10.7,12.3,12.2,14.9,18.5,16.4,14.4,14.5,15.7,12.9,13.5,13.6,14.6,16.7,14.9,15.9,18.9,21.1,20.5,16.1,16.0,21.0,24.3,26.3,17.7,16.6,19.1,23.3,21.5,14.4,13.4,12.1,14.6,15.1,11.3,14.2,16.0,16.3,16.7,17.7,17.9,16.6,16.5,18.7,18.1,17.3,15.6,15.7,21.2,23.6,21.4,23.0,27.5,25.4,23.4,29.1,26.1,23.2,18.5,18.5,25.5,17.5,22.0,24.8,22.0,19.1,16.8,22.4,17.7,19.1,21.8,18.1,14.2,14.9,8.3,9.4,13.7,8.9,9.5,9.3,8.0,8.4,9.4,9.2,9.5,10.4,15.8,13.0,14.6,19.5,11.8,15.4,28.6,13.1,14.0,11.1,8.8,12.7,10.3,16.5,10.8,17.3,36.3,30.0,30.9,28.0,32.9
El terrorismo. ETA,0.2,0.8,1.1,0.5,0.1,0.4,0.6,0.5,0.5,0.4,0.7,0.9,1.2,1.5,0.5,0.9,0.3,0.6,0.6,0.4,0.5,0.6,0.4,0.4,0.3,0.2,0.6,0.5,1.1,1.0,0.9,1.2,1.0,1.3,1.7,1.2,2.5,3.7,3.3,3.7,4.8,5.3,9.1,5.7,5.7,5.6,7.4,6.0,9.9,6.9,9.0,5.5,7.0,9.2,12.3,11.1,12.5,17.6,12.9,13.1,12.6,18.3,19.1,13.6,15.5,16.0,19.4,20.6,22.2,28.9,21.8,22.5,18.6,20.3,22.9,31.4,31.4,36.9,31.4,35.1,39.6,29.1,31.7,35.4,44.1,41.7,32.3,36.5,38.6,42.7,45.3,27.8,25.4,18.9,18.0,26.9,22.8,18.5,24.9,28.0,35.0,24.9,26.6,24.1,23.9,34.2,50.4,41.7,39.5,34.0,40.7,46.1,53.3,55.1,42.1,46.6,41.3,44.1,45.4,48.1,63.0,73.4,40.1,34.8,39.7,38.4,38.7,42.9,39.5,43.3,44.5,41.5,37.0,45.0,48.7,43.9,53.3,53.5,64.0,50.2,49.0,51.0,54.2,54.1,55.9,59.5,62.3,65.7,70.6,73.5,56.7,70.9,63.1,70.3,61.8,65.9,77.0,80.1,67.9,65.5,25.8,49.7,33.4,37.2,32.6,20.6,36.4,42.8,17.1,12.2,34.8,10.8,11.3,7.5,13.0,12.6,30.1,38.0,66.5,33.1,37.6
Las infraestructuras,0.2,0.1,0.2,0.1,0.1,0.0,0.0,0.1,0.2,0.0,0.0,0.2,.,0.1,0.1,0.0,.,.,.,0.1,.,.,0.2,0.2,0.4,0.2,0.1,0.5,0.1,0.2,0.4,0.2,0.3,0.3,0.2,0.4,0.2,0.2,0.1,0.4,0.6,0.5,0.4,0.4,0.5,0.4,0.6,0.6,0.2,0.6,0.3,0.6,0.3,0.5,0.5,0.8,0.6,0.6,0.6,0.4,0.5,0.7,0.7,0.4,0.5,0.4,0.6,0.7,0.8,0.8,0.3,0.7,0.8,0.7,1.1,0.7,0.7,0.7,0.7,1.3,1.1,2.0,2.3,1.0,1.1,1.6,1.1,1.5,0.8,0.5,0.6,0.8,0.9,1.6,0.9,1.7,1.0,1.0,1.2,0.6,0.8,0.7,1.2,0.5,0.7,0.8,1.5,0.8,1.1,1.2,0.9,2.2,1.2,1.1,0.9,1.8,1.3,2.4,0.8,0.8,1.0,0.8,1.0,2.5,1.7,1.5,1.4,1.3,1.5,1.9,1.6,1.0,0.8,1.0,1.0,0.8,1.2,1.0,0.8,1.5,0.7,0.9,1.0,1.2,0.8,1.0,0.8,1.0,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.7,1.8,0.9,0.3,1.2,0.4
La sanidad,11.8,12.1,10.2,10.6,8.6,11.8,11.5,11.3,11.5,10.4,10.4,10.2,10.8,11.6,10.9,13.0,13.4,10.6,10.3,9.3,10.0,7.9,10.1,10.5,12.1,12.9,11.4,10.1,9.1,10.4,8.6,10.6,8.6,9.3,7.1,8.1,9.3,7.7,7.8,6.1,4.3,4.4,4.1,3.7,3.4,2.9,3.6,3.0,3.0,3.9,3.4,4.0,3.2,3.3,3.2,4.1,4.3,3.7,4.8,5.6,4.8,4.9,4.1,3.8,4.1,4.8,5.3,4.7,4.5,2.9,3.4,4.4,3.6,3.8,4.1,3.6,4.8,4.6,4.4,5.5,4.2,3.7,4.0,3.9,4.6,4.1,5.6,5.0,4.0,4.1,4.2,4.5,5.4,3.9,3.7,4.1,5.0,4.1,4.8,4.8,4.0,5.2,5.8,5.5,6.8,6.4,4.2,4.5,6.0,5.6,5.7,6.5,6.5,6.2,5.8,5.0,4.6,6.2,5.0,4.7,5.3,3.0,6.3,5.6,5.0,6.0,4.7,4.3,3.5,3.6,5.4,5.8,3.9,3.0,3.5,2.9,3.8,3.1,2.7,3.9,3.3,3.4,4.2,3.4,3.6,4.0,4.4,6.2,3.9,3.9,3.9,3.8,3.9,3.2,3.1,4.3,3.2,4.7,5.0,3.9,.,.,.,.,.,.,.,.,.,.,.,.,.,2.7,.,7.5,.,.,.,.,.
La vivienda,1.9,1.1,1.5,1.0,0.9,1.3,1.6,1.6,1.6,1.3,1.7,1.6,1.6,1.5,0.9,2.0,1.4,2.3,1.7,1.9,2.8,2.5,2.0,1.7,3.1,3.9,3.9,2.7,1.8,3.4,3.6,3.5,5.2,4.3,5.7,5.6,5.9,6.0,5.1,4.9,6.7,7.2,4.0,5.5,5.1,4.0,4.4,5.2,6.0,5.3,6.1,6.7,5.3,7.0,6.5,7.9,7.1,9.4,8.5,10.3,7.6,10.1,10.5,11.6,13.0,13.1,11.3,11.6,13.5,15.5,16.6,18.3,18.4,21.5,21.5,24.5,25.6,28.8,28.9,29.1,28.8,32.9,34.9,37.3,36.7,30.8,34.9,32.5,27.6,27.0,29.8,29.8,25.1,30.3,20.9,26.5,24.6,25.9,24.8,18.9,17.7,18.2,16.8,20.5,19.4,23.3,21.2,21.4,22.0,25.5,19.1,20.0,19.6,15.6,20.5,21.9,18.1,18.4,20.0,22.0,22.1,19.4,20.3,18.3,17.3,17.1,16.9,10.9,13.9,12.2,15.3,12.8,7.2,7.3,8.9,7.3,9.6,10.2,3.6,5.2,3.4,3.1,4.9,3.5,2.7,3.8,3.7,3.8,2.3,3.5,4.4,2.7,2.9,2.0,2.2,2.8,2.9,2.5,4.2,2.8,.,.,.,.,.,.,.,.,.,.,.,.,.,.,2.6,5.4,1.7,1.9,3.2,2.2,2.1
Los problemas de índole económica,24.9,24.9,24.5,24.9,25.5,27.0,28.8,28.0,28.4,28.6,28.0,28.2,28.3,30.5,29.7,31.1,32.7,32.5,32.0,32.2,34.9,35.5,34.4,35.4,38.9,39.5,42.7,43.4,49.4,46.5,46.3,51.0,52.8,49.2,52.3,53.7,50.4,48.2,51.3,49.6,49.6,47.0,46.6,47.3,51.3,51.6,53.1,52.0,47.9,47.8,48.2,51.3,53.0,50.9,46.8,45.3,47.8,47.0,47.0,45.9,46.8,48.4,48.5,48.3,48.7,54.1,52.0,54.4,51.9,54.5,57.9,54.7,54.2,59.9,58.3,51.9,48.4,39.4,36.4,39.2,32.9,29.4,24.5,21.8,16.5,15.5,15.5,18.0,15.7,15.7,18.3,15.5,15.6,17.1,15.6,19.0,19.4,17.3,19.7,16.0,15.0,17.2,16.7,17.1,21.5,16.4,17.6,17.4,17.3,15.4,11.7,9.9,11.3,11.2,12.0,14.3,11.5,12.1,11.1,11.9,9.9,7.9,10.6,12.5,11.7,13.1,12.7,10.7,10.5,9.9,8.4,11.1,9.7,9.2,11.0,10.9,12.8,10.4,9.4,8.8,9.2,8.5,8.7,7.6,8.5,10.0,10.5,11.8,10.3,9.5,9.0,7.6,9.2,5.5,6.4,8.2,11.7,12.9,15.1,14.2,8.7,9.6,12.8,10.8,11.1,13.1,20.5,11.7,17.5,20.8,23.3,29.6,26.5,24.3,24.6,35.8,10.1,19.8,23.1,27.3,28.0
Los problemas relacionados con la calidad del empleo,3.4,3.9,3.7,4.9,2.4,4.8,5.0,2.5,4.7,3.0,2.2,2.3,2.3,2.3,1.8,1.3,2.0,2.0,1.5,1.8,1.3,1.4,1.1,1.1,1.6,1.8,1.4,2.6,2.3,2.3,2.7,2.1,2.2,3.6,3.3,2.2,2.5,2.3,2.3,2.7,2.0,3.0,2.3,3.3,2.8,3.1,2.7,1.9,2.3,3.4,4.0,3.2,3.4,2.9,2.4,3.1,3.0,3.2,2.6,2.8,2.7,2.9,2.9,5.0,3.0,2.9,3.1,3.3,3.9,2.7,4.0,5.4,6.4,8.4,8.6,8.7,8.2,9.4,11.0,12.3,12.3,12.2,13.7,13.5,12.7,12.4,9.5,14.5,9.0,9.8,9.8,9.7,9.9,10.7,7.8,8.8,9.3,7.2,10.0,9.3,7.3,8.4,7.9,7.9,8.0,8.1,6.0,7.5,7.1,7.1,3.6,3.4,4.9,3.2,3.7,3.6,2.7,4.6,4.4,3.8,5.0,3.8,4.6,4.0,3.5,3.7,3.4,2.5,3.6,2.6,3.0,2.6,2.8,1.6,2.5,2.1,2.7,2.7,2.5,3.1,4.0,2.8,2.6,1.8,2.0,1.6,1.1,1.4,2.1,1.8,2.7,1.8,2.5,1.9,1.8,2.9,2.3,1.4,1.4,1.4,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,.,1.1,0.8,0.8,1.8,0.7
"Los problemas de la agricultura, ganadería y pesca",0.1,0.2,0.1,0.3,0.1,0.1,0.2,0.4,0.1,0.0,0.1,0.1,0.1,0.2,0.1,0.2,0.2,0.1,0.5,0.2,0.2,0.4,0.1,0.2,0.4,0.4,0.2,0.0,0.1,0.1,0.3,0.2,0.4,0.5,0.3,0.2,0.4,0.3,0.4,0.2,0.3,0.8,0.4,0.4,0.4,0.4,0.2,0.5,0.2,0.2,0.6,0.3,0.2,0.7,0.2,0.4,0.4,0.8,0.8,0.7,0.6,0.6,0.3,0.7,0.6,0.5,0.2,0.3,0.3,0.3,0.3,0.2,0.1,0.3,0.9,1.0,0.7,0.6,0.2,0.3,0.3,0.3,0.4,0.4,0.6,0.4,0.6,0.4,0.5,0.5,0.4,0.3,0.5,0.9,0.3,0.3,0.3,0.5,0.6,1.0,0.5,0.4,0.6,0.7,0.8,0.6,0.8,0.8,1.0,0.8,1.0,1.4,0.8,1.0,0.8,1.0,0.6,0.9,0.8,0.7,0.5,0.3,0.2,0.4,0.5,0.6,0.4,0.7,0.8,0.6,0.4,0.6,0.6,0.7,1.1,0.6,0.4,0.5,0.6,1.0,0.6,0.3,0.4,0.5,0.5,0.5,0.3,0.4,0.6,0.8,0.4,0.6,0.6,0.2,0.2,0.7,0.7,0.7,0.5,1.6,2.0,.,.,.,.,.,1.1,1.2,5.7,1.6,4.3,1.0,5.0,.,.,3.8,3.2,1.8,1.9,6.2,3.4


In [7]:
def get_column(index, matrix):
    """Return a list with element ``index`` of every row in ``matrix``."""
    column = []
    for row in matrix:
        column.append(row[index])
    return column

'''
my_list = get_column(0,topic_list)
for topic in my_list:
    print(topic)
'''

'\nmy_list = get_column(0,topic_list)\nfor topic in my_list:\n    print(topic)\n'

# Proposta de tòpics per als baròmetres del CIS

### Selecció de tòpics entre els anys 2000-2015

In [8]:
# Topic table: a list of (canonical topic name, [response labels]) pairs.
# Every label listed for a topic is a response wording that gets counted
# under that topic's canonical name.
CIS_topics = \
    [(u"paro",[u"El paro",u"Paro"]),
    (u"política",[u"Los políticos en general, los partidos y la política",u"Los políticos en general, los partidos políticos y la política",u"Problemas políticos (el Gobierno, la política)",u"La clase política, los partidos políticos",u"El Gobierno, los políticos y los partidos",u"El Gobierno y partidos o políticos concretos",u"Los/as políticos/as en general, los partidos y la política",u"El Gobierno y partidos o políticos/as concretos",u"El Gobierno, los políticos y los partidos concretos",u"Problemas políticos",u"Los problemas políticos",u"Problemas políticos (desconfianza, mala gestión)"]),
    (u"droga",[u"Drogas. Alcoholismo.",u"La droga",u"Droga",u"Droga, alcoholismo",u"Las drogas",u"Drogas"]),
    (u"terrorismo",[u"El terrorismo",u"Terrorismo",u"El terrorismo, ETA",u"Terrorismo, ETA",u"El terrorismo internacional",u"Terrorismo, etc.",u"El atentado del 11-M. Terrorismo islamista",u"El atentado del 11-M",u"Atentado New York y sus consecuencias"]),
    (u"inmigración",[u"La inmigración",u"Inmigración"]),
    (u"delincuencia",[u"Delincuencia. Inseguridad ciudadana",u"La delincuencia e inseguridad ciudadana",u"Delincuencia e inseguridad ciudadana",u"Delincuencia, inseguridad ciudadana. Violencia",u"La inseguridad ciudadana",u"Inseguridad ciudadana",u"Delincuencia, inseguridad ciudadana"]),
    (u"servicios públicos",[u"Servicios públicos (escasez, mal funcionamiento)",u"La escasez y mal funcionamiento de los servicios públicos",u"Servicios públicos: escasez, mal funcionamiento (sanidad, vivienda, carreteras, tráfico, etc.)",u"Escasez y/o mal funcionamiento de los servicios públicos",u"Funcionamiento y cobertura de los servicios públicos",u"El funcionamiento de los servicios públicos",u"Problemas sociales (falta de servicios y ayudas sociales)",u"Los problemas sociales (falta de servicios y ayudas)"]),
    (u"vivienda",[u"La vivienda",u"Vivienda",u"Los desahucios",u"Desahucios",u"Las hipotecas",u"Hipotecas"]),
    (u"situación económica",[u"La situación económica (carestía, precios, sueldos, etc.)",u"Situación económica",u"La situación económica (carestía, sueldos, impuestos)",u"Problemas económicos",u"Los problemas de índole económica",u"Problemas económicos (Poca inversión, déficit público, salarios bajos)",u"Subida de los carburantes",u"La subida del IVA",u"Subida del IVA",u"Subida de tarifas energéticas"]),
    (u"corrupción",[u"La corrupción y el fraude",u"Corrupción y fraude",u"El fraude fiscal",u"Fraude fiscal",u"Corrupción, fraude",u"Corrupción política"]),
    (u"guerras",[u"Las guerras",u"La guerra de Irak",u"La guerra. Las guerras en general",u"Las guerras en general",u"La guerra en Afganistán",u"La guerra de Afganistán",u"La guerra de Irak. Las guerras en general"]),
    (u"educación",[u"La educación",u"Educación",u"Sistema educativo",u"El sistema educativo"]),
    (u"recortes",[u"Los recortes",u"\"Los recortes\""]),
    (u"problemas ganadería alimentación agricultura pesca",[u"Los problemas de la ganadería y la alimentación",u"Los problemas de la agricultura, ganadería y pesca",u"Problemas de la agricultura, ganadería y pesca",u"Problemas de la agricultura",u"Alimentación",u"Acuerdos pesqueros",u"Los acuerdos pesqueros"]),
    (u"problemas sociales",[u"Problemas sociales",u"Los problemas de índole social",u"Problemas sociales (racismo, pobreza, etc.)"]),
    (u"bancos",[u"Los bancos",u"Bancos",u"Los Bancos"]),
    (u"problemas relacionados empleo",[u"Problemas relacionados con el empleo",u"Precariedad en el empleo",u"Los problemas relacionados con la calidad del empleo"]),
    (u"conflicto islote perejil",[u"El conflicto del islote de Perejil"]),
    (u"pensiones",[u"Las pensiones"]),
    (u"violencia mujer",[u"La violencia contra la mujer",u"Violencia contra la mujer"]),
    (u"crisis valores",[u"La crisis de valores",u"Crisis de valores",u"Crisis de valores sociales",u"Déficit de valores sociales",u"La crisis de valores (racismo, incomunicación)"]),
    (u"justicia",[u"Justicia",u"La actuación judicial",u"La Administración de Justicia",u"La actuación judicial (lentitud, errores, etc.)",u"Las actuaciones judiciales",u"Las excarcelaciones"]),
    (u"problemas juventud",[u"Problemas de la juventud",u"Los problemas relacionados con la juventud",u"Problemas relacionados con el comportamiento social de los jóvenes",u"Problemas relacionados con el comportamiento de los jóvenes",u"Problemas relacionados con el ocio de los jóvenes",u"El futuro de los hijos"]),
    (u"euro",[u"El euro",u"Euro",u"Problemas relacionados con la UE y la entrada en el Euro",u"Inflación por el euro",u"Subida del precio a causa del euro",u"Subida de precios a causa del euro"]),
    (u"medio ambiente",[u"Medio ambiente",u"Medio ambiente, contaminación",u"Contaminación, medio ambiente",u"Los problemas medioambientales",u"Problemas ecológicos y del medio ambiente",u"Problemas del medio ambiente (contaminación, desertización, incendios, etc.)",u"El accidente del Prestige",u"El accidente del petrolero Prestige",u"El desastre del Prestige"]),
    (u"nacionalismos",[u"Los nacionalismos",u"Los nacionalismos en España"]),
    (u"problemas derivados autonomías",[u"Problemas derivados de las autonomías"]),
    (u"infraestructuras",[u"Las infraestructuras",u"Infraestructuras"]),
    (u"otros",[u"N.C.",u"No_contesta",u"No_Contesta",u"No_sabe",u"No_Sabe",u"N.S.",u"Ninguno",u"Otras respuestas",u"Otros",u"Otros problemas"]),
    (u"monarquía",[u"La Monarquía",u"Monarquía"]),
    (u"racismo",[u"El racismo",u"Racismo"]),
    (u"estatuto autonomía",[u"El Estatuto de autonomía",u"Estatutos de autonomía",u"El Estatuto de Cataluña",u"El estatuto de Cataluña",u"La reforma de los Estatutos de Autonomía",u"La Reforma de los Estatutos de autonomía",u"La reforma de los Estatutos de autonomía"]),
    (u"sanidad",[u"La salud",u"La sanidad",u"Sanidad",u"'Vacas locas'",u"'Vacas locas",u"El problema de las vacas locas",u"Ébola"]),
    (u"ley antitabaco",[u"La Ley antitabaco",u"Ley antitabaco"]),
    (u"preocupaciones situaciones personales",[u"Las preocupaciones y situaciones personales"]),
    (u"problemas relacionados mujer",[u"Los problemas relacionados con la mujer",u"Problemas relacionados con la mujer",u"Los problemas laborales y familiares de las mujeres",u"Problemas laborales y familiares de la mujer"]),
    (u"limitación velocidad autopistas autovías",[u"La limitación de la velocidad en autopistas y autovías"]),
    (u"situación país vasco",[u"La situación del País Vasco",u"Problemas políticos en el P. Vasco",u"Problemas políticos en el País Vasco",u"La situación política del País Vasco",u"Las negociaciones con ETA",u"Negociaciones con ETA"]),
    (u"ley aborto",[u"La Ley del aborto"]),
    (u"familia",[u"La familia"]),
    (u"reforma laboral",[u"La reforma del desempleo",u"Reforma del Seguro de Desempleo",u"La ley sobre la reforma del desempleo",u"La Ley sobre la reforma del desempleo",u"Reforma Laboral",u"La reforma laboral",u"Reforma laboral"])]

# Sanity check: print how many topic groups are defined, then the whole table.
print len(CIS_topics)
print CIS_topics

41
[(u'paro', [u'El paro', u'Paro']), (u'pol\xedtica', [u'Los pol\xedticos en general, los partidos y la pol\xedtica', u'Los pol\xedticos en general, los partidos pol\xedticos y la pol\xedtica', u'Problemas pol\xedticos (el Gobierno, la pol\xedtica)', u'La clase pol\xedtica, los partidos pol\xedticos', u'El Gobierno, los pol\xedticos y los partidos', u'El Gobierno y partidos o pol\xedticos concretos', u'Los/as pol\xedticos/as en general, los partidos y la pol\xedtica', u'El Gobierno y partidos o pol\xedticos/as concretos', u'El Gobierno, los pol\xedticos y los partidos concretos', u'Problemas pol\xedticos', u'Los problemas pol\xedticos', u'Problemas pol\xedticos (desconfianza, mala gesti\xf3n)']), (u'droga', [u'Drogas. Alcoholismo.', u'La droga', u'Droga', u'Droga, alcoholismo', u'Las drogas', u'Drogas']), (u'terrorismo', [u'El terrorismo', u'Terrorismo', u'El terrorismo, ETA', u'Terrorismo, ETA', u'El terrorismo internacional', u'Terrorismo, etc.', u'El atentado del 11-M. Terrorismo i

In [9]:
from pymongo import ASCENDING, DESCENDING
#
barometro_docs_collection = db['barometro_docs']
#
barometro_topics_collection = db['barometro_topics']
barometro_topics_collection.drop()


result = barometro_docs_collection.find().sort('date',DESCENDING)

if result.count() > 0 :
    num_docs = result.count()
    print "Num. of docs:", num_docs
    
    for doc in result:
        if len(doc['barometro_dict']) > 0:
            topic_dictionary = {}
            for response in doc['barometro_dict']:
                response_topic = ""
                for topic in CIS_topics:
                    if response[0] in topic[1]:
                        response_topic = topic[0]
                
                if response_topic:
                    if response_topic in topic_dictionary:
                        topic_dictionary[response_topic]['value'] += response[1]
                    else:
                        topic_dictionary[response_topic] = {'value':response[1]}
                        
                else:
                    print "WARN:", "'" + response[0] + "'", "not found in CIS_topics"

            # dict to list
            topic_list = []
            for topic in topic_dictionary:
                topic_list.append([topic,topic_dictionary[topic]['value']])
            
            # Normalitzar i ordenar per percentage
            total_topic_value = sum([value[1] for value in topic_list])
            normalized_topic_list = [[topic[0],topic[1]/total_topic_value] for topic in topic_list]
            normalized_topic_list.sort(key=lambda tup: tup[1], reverse=True)
            
            # Insertar el topic a la bbdd
            month = int(datetime.datetime.strftime(doc['date'],'%m'))
            year = int(datetime.datetime.strftime(doc['date'],'%Y'))
            barometro_topics_row = {'date':doc['date'],'month':month,'year':year,'id_doc':doc['_id'],'topics_dict':normalized_topic_list}
            barometro_topics_collection.insert(barometro_topics_row)


Num. of docs: 229


#### Querying

In [10]:
#
barometro_topics_collection = db['barometro_topics']

doc_start_date = "01/01/1993"
d_doc_start_date = datetime.datetime.strptime(doc_start_date, "%d/%m/%Y")
d_doc_end_date = datetime.datetime.utcnow()

result = barometro_topics_collection.find({'date': {'$gte': d_doc_start_date, '$lte': d_doc_end_date}}).sort('date',DESCENDING)

if result.count() > 0 :
    num_topics = result.count()
    print "Num. of topics:", num_topics
    
    for topic in result:
        print topic

Num. of topics: 161
{u'month': 1, u'year': 2015, u'date': datetime.datetime(2015, 1, 2, 0, 0), u'_id': ObjectId('55362eabc8fbfe094076d975'), u'topics_dict': [[u'paro', 0.32119741100323623], [u'corrupci\xf3n', 0.2261326860841424], [u'situaci\xf3n econ\xf3mica', 0.10275080906148867], [u'pol\xedtica', 0.09951456310679611], [u'sanidad', 0.04126213592233009], [u'problemas sociales', 0.03964401294498382], [u'educaci\xf3n', 0.028721682847896435], [u'otros', 0.023058252427184463], [u'problemas relacionados empleo', 0.014967637540453074], [u'recortes', 0.013754045307443364], [u'vivienda', 0.013754045307443364], [u'delincuencia', 0.010517799352750809], [u'inmigraci\xf3n', 0.009304207119741099], [u'justicia', 0.00889967637540453], [u'pensiones', 0.008090614886731391], [u'problemas juventud', 0.007281553398058252], [u'crisis valores', 0.006877022653721682], [u'terrorismo', 0.006067961165048544], [u'bancos', 0.006067961165048544], [u'nacionalismos', 0.004449838187702265], [u'violencia mujer', 0.001

In [11]:
#
barometro_topics_collection = db['barometro_topics']

result = barometro_topics_collection.find({'month': 11,'year': 2014})

if result.count() > 0:
    print "Num. of topics:", result.count()
    topic_row = result[0]
    print "Date:", topic_row['date']
    for topic in topic_row['topics_dict'][:15]:
        print topic[0], ": %0.2f %%" % (100.*topic[1])

Num. of topics: 1
Date: 2014-11-04 00:00:00
paro : 30.51 %
corrupción : 25.44 %
situación económica : 10.66 %
política : 10.34 %
problemas sociales : 3.53 %
sanidad : 3.41 %
educación : 3.05 %
otros : 2.65 %
recortes : 1.62 %
justicia : 1.43 %
inmigración : 0.99 %
problemas relacionados empleo : 0.95 %
crisis valores : 0.87 %
vivienda : 0.71 %
delincuencia : 0.67 %


In [12]:
#
barometro_docs_collection = db['barometro_docs']
#
barometro_topics_collection = db['barometro_topics']

result_topic = barometro_topics_collection.find({'month': 11,'year': 2014})

if result_topic.count() > 0:
    topic_row = result_topic[0]
    print "Date:", topic_row['date']
    
    result_topic_doc = barometro_docs_collection.find({'_id':topic_row['id_doc']})
    if result_topic_doc.count() > 0:
        topic_doc_row = result_topic_doc[0]
    
    topic_list = topic_row['topics_dict']
    topic_doc_list = topic_doc_row['barometro_dict']
    max_list_length = max(len(topic_list),len(topic_doc_list))
    max_label_length = 45
    
    total_topic_doc_value = sum([topic[1] for topic in topic_doc_list])
    index = 0
        
    print "-" * 120
    print "Normalizado:"
    print "-" * 120
    index = 0
    while index < max_list_length:
        if index < len(topic_list):
            topic_label = topic_list[index][0][:max_label_length]
            topic_value = 100*topic_list[index][1]
        else:
            topic_label = '-' * max_label_length
            topic_value = 0.0
            
        if index < len(topic_doc_list):
            topic_doc_label = topic_doc_list[index][0]
            topic_doc_value = 100*topic_doc_list[index][1]/total_topic_doc_value
        else:
            topic_doc_label = '-' * max_label_length
            topic_doc_value = 0.0
        
        print topic_label, ": {0:>{1}}".format("%0.1f" % topic_value,50-len(topic_label)), \
            '|', "%0.1f" % topic_doc_value, ":\t", topic_doc_label
            
        index += 1
        
    print "TOTAL", ": {0:>{1}}".format(100.0,50-len("TOTAL")), \
          '|', 100.0, ": ", "TOTAL"
      

Date: 2014-11-04 00:00:00
------------------------------------------------------------------------------------------------------------------------
Normalizado:
------------------------------------------------------------------------------------------------------------------------
paro :                                           30.5 | 30.5 :	El paro
corrupción :                                     25.4 | 25.3 :	La corrupción y el fraude
situación económica :                            10.7 | 10.1 :	Los problemas de índole económica
política :                                       10.3 | 9.2 :	Los/as políticos/as en general, los partidos y la política
problemas sociales :                              3.5 | 3.5 :	Los problemas de índole social
sanidad :                                         3.4 | 3.4 :	La sanidad
educación :                                       3.1 | 3.1 :	La educación
otros :                                           2.7 | 2.3 :	Otras respuestas
recortes :           

# CIS vs. Congrés

Recuperem el llistat de tòpics del Congrés.

In [85]:
import pandas as pd

# Load the Congress table (one row per Year/Month, one column per topic);
# the file uses ';' as field separator.
df_congres = pd.read_csv('metric_table_by_months.csv', delimiter=';')
# Bare expression so the notebook renders the table as the cell output
df_congres

Unnamed: 0.1,Unnamed: 0,Year,Month,vivienda,justicia,educación,empleo,medio ambiente,salud,jóvenes,...,internacional,cultura,inseguridad ciudadana,social,servicios públicos e infraestructuras,corrupción,fuerzas armadas,tecnologia e investigación,terrorismo,otros
0,0,2000,9,0.063609,0.000000,0.035133,0.070636,0.092456,0.015163,0.036982,...,0.000000,0.010725,0.010725,0.022559,0.034763,0.039941,0.000000,0.029956,0.000000,0.349482
1,1,2000,10,0.000000,0.032402,0.007263,0.044693,0.079330,0.096089,0.037430,...,0.104469,0.003911,0.017318,0.000000,0.088827,0.000000,0.000000,0.011732,0.020112,0.261453
2,2,2000,11,0.000000,0.042169,0.044578,0.061446,0.020482,0.000000,0.089157,...,0.166265,0.000000,0.037349,0.000000,0.185542,0.000000,0.000000,0.000000,0.030120,0.190361
3,3,2000,12,0.000000,0.048327,0.019827,0.055762,0.135688,0.134449,0.000000,...,0.000000,0.009913,0.000000,0.000000,0.122677,0.025403,0.027261,0.020446,0.025403,0.306072
4,131,2001,1,0.000000,0.045409,0.016725,0.054803,0.168720,0.087172,0.032112,...,0.029679,0.022148,0.005028,0.000000,0.091017,0.030055,0.020442,0.024171,0.012701,0.268021
5,4,2001,2,0.000000,0.042491,0.013623,0.053844,0.201752,0.039896,0.064223,...,0.059358,0.034382,0.010055,0.000000,0.059358,0.034706,0.013623,0.027895,0.000000,0.229971
6,5,2001,3,0.000000,0.011372,0.013149,0.129353,0.143568,0.060768,0.035181,...,0.053660,0.000000,0.000000,0.024876,0.078536,0.015636,0.030206,0.029140,0.017058,0.295309
7,6,2001,4,0.000000,0.069149,0.032270,0.112411,0.145035,0.063475,0.028014,...,0.024823,0.000000,0.000000,0.012766,0.097872,0.012057,0.000000,0.014539,0.037234,0.239716
8,7,2001,5,0.009573,0.000000,0.018803,0.045470,0.150769,0.056410,0.074530,...,0.054359,0.020855,0.020855,0.000000,0.124103,0.012650,0.000000,0.028034,0.026325,0.252308
9,8,2001,6,0.040000,0.067200,0.013333,0.055467,0.161600,0.038933,0.000000,...,0.076800,0.014933,0.012267,0.036267,0.128000,0.065600,0.021333,0.025600,0.016000,0.189867


In [86]:
# Congress row for November 2014, the counterpart of the CIS query
# barometro_topics_collection.find({'month': 11,'year': 2014})

df_congres_result = df_congres[(df_congres.Month==11) & (df_congres.Year==2014)]
df_congres_result


Unnamed: 0.1,Unnamed: 0,Year,Month,vivienda,justicia,educación,empleo,medio ambiente,salud,jóvenes,...,internacional,cultura,inseguridad ciudadana,social,servicios públicos e infraestructuras,corrupción,fuerzas armadas,tecnologia e investigación,terrorismo,otros
156,126,2014,11,0,0.093611,0,0.052006,0.055721,0.028232,0.270431,...,0.017831,0.027489,0.037147,0.026003,0.090639,0.069094,0,0,0,0.155275


Obtenim el diccionari topic_congres_dict on per a cada entrada (data) obtenim la llista de tòpics ordenada pel seu percentatge.

In [87]:
import datetime

topic_congres_dict = {}
for idr, row in df_congres.iterrows():
    congres_date = datetime.datetime(int(row.Year),int(row.Month),1).strftime('%Y-%m-%d')

    topic_congres_dict[congres_date] = []
    for idc, column in enumerate(df_congres.columns):
        if idc > 3:
            value = df_congres.ix[idr,idc]
            if value > 0:
                topic_congres_dict[congres_date].append([u"".join(column.decode('utf-8')),df_congres.ix[idr,idc]])
    # ordenar
    topic_congres_dict[congres_date].sort(key=lambda col: col[1], reverse=True)

print topic_congres_dict


{'2001-02-01': [[u'otros', 0.22997080765500003], [u'medio ambiente', 0.20175154070700002], [u'econom\xeda', 0.10509244242599999], [u'j\xf3venes', 0.064223159260499998], [u'internacional', 0.059357768407400001], [u'servicios p\xfablicos e infraestructuras', 0.059357768407400001], [u'empleo', 0.053843658773900002], [u'justicia', 0.042491080116800008], [u'salud', 0.0398962049951], [u'corrupci\xf3n', 0.034706454751899998], [u'cultura', 0.034382095361699998], [u'tecnologia e investigaci\xf3n', 0.027894907557599997], [u'educaci\xf3n', 0.013623094388600002], [u'fuerzas armadas', 0.013623094388600002], [u'inseguridad ciudadana', 0.010055141096299999], [u'ideolog\xeda', 0.0097307817061299991]], '2005-03-01': [[u'otros', 0.29268292682899999], [u'econom\xeda', 0.10945865556199999], [u'medio ambiente', 0.10469958358100001], [u'servicios p\xfablicos e infraestructuras', 0.09220701963120001], [u'terrorismo', 0.083878643664499999], [u'empleo', 0.051160023795400002], [u'internacional', 0.0410469958358

Fem el mateix per a les dades del CIS.

Carreguem el CSV del CIS:

In [88]:
import pandas as pd

# Load the CIS table: one row per response label, one column per barometer date
df_CIS = pd.read_csv('CIS - Dades_pfs.csv', delimiter=',')
# Display only: the fillna() result is not assigned, so df_CIS itself keeps
# its missing values (the next cell relies on `value > 0` to skip them).
df_CIS.fillna(-1)

Unnamed: 0,topic,maxim,1/3/2015,1/2/2015,1/1/2015,1/12/2014,1/11/2014,1/10/2014,1/9/2014,1/7/2014,...,2/1/1995,1/1/1995,12/1/1994,9/1/1994,9/1/1993,6/1/1988,12/1/1986,9/1/1986,10/1/1985,5/1/1985
0,El paro,84.1,80.3,78.6,79.4,75.5,77.0,76.0,75.3,77.0,...,76.6,84.9,82.6,80.0,89.6,91.0,92.0,94.1,93.1,94.4
1,El terrorismo. ETA,80.1,0.2,0.8,1.1,0.5,0.1,0.4,0.6,0.5,...,10.8,11.3,7.5,13.0,12.6,30.1,38.0,66.5,33.1,37.6
2,La corrupción y el fraude,63.8,50.8,48.5,55.5,60.0,63.8,42.3,42.7,41.5,...,27.2,33.5,31.6,17.6,7.6,2.5,-1.0,-1.0,-1.0,-1.0
3,Los problemas de índole económica,59.9,24.9,24.9,24.5,24.9,25.5,27.0,28.8,28.0,...,29.6,26.5,24.3,24.6,35.8,10.1,19.8,23.1,27.3,28.0
4,La inmigración,59.2,1.9,1.7,2.3,1.9,2.5,3.0,3.6,2.8,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,La vivienda,37.3,1.9,1.1,1.5,1.0,0.9,1.3,1.6,1.6,...,-1.0,-1.0,-1.0,2.6,5.4,1.7,1.9,3.2,2.2,2.1
6,"Las guerras en general (Balcanes, Irán, Afgani...",37.3,-1.0,0.1,-1.0,0.0,-1.0,0.0,0.0,0.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,"Los políticos en general, los partidos polític...",31.4,20.0,20.1,21.8,21.8,23.3,23.2,25.8,26.4,...,15.2,9.9,-1.0,7.3,2.8,1.6,5.7,3.2,5.8,4.7
8,La inseguridad ciudadana,29.1,2.2,2.2,2.6,3.4,1.7,2.9,3.2,2.5,...,12.7,10.3,16.5,10.8,17.3,36.3,30.0,30.9,28.0,32.9
9,El desastre del Prestige,26.4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


Obtenim el diccionari topic_CIS_dict on per a cada entrada (data) obtenim la llista de tòpics ordenada pel seu percentatge.

In [89]:
import datetime

otros_topic_list = [u'N.S.',u'Otras respuestas',u'N.C.',u'Ninguno']
topic_CIS_dict = {}
for idc, column in enumerate(df_CIS.columns):
    if idc > 1:
        cis_date =  datetime.datetime.strptime(column.decode('utf-8'), "%d/%m/%Y").strftime('%Y-%m-%d')
        otros_topic = [u'otros',0]
        topic_CIS_dict[cis_date] = []
        sum = 0
        for idr, row in df_CIS.iterrows():
            value = df_CIS.ix[idr,idc]
            if value > 0:
                sum += value
                if row.topic in otros_topic_list:
                    otros_topic[1] += value
                else:
                    topic_CIS_dict[cis_date].append([u"".join(row.topic.decode('utf-8')),value])

        if otros_topic[1] > 0:
            topic_CIS_dict[cis_date].append(otros_topic)
        # normalitzar
        for topic in topic_CIS_dict[cis_date]:
            topic[1] = topic[1]/sum
        # ordenar
        topic_CIS_dict[cis_date].sort(key=lambda col: col[1], reverse=True)

for cis_date in topic_CIS_dict:
    print cis_date, ":", len(topic_CIS_dict[cis_date]) #, topic_CIS_dict[cis_date]

2001-02-01 : 27
1995-01-09 : 12
2005-03-01 : 30
2003-09-01 : 31
2005-01-01 : 31
2001-01-01 : 27
1995-01-02 : 14
1995-01-01 : 12
1995-01-04 : 12
2010-04-01 : 27
2008-10-01 : 29
2002-11-01 : 30
2004-02-01 : 30
2013-10-01 : 33
1988-01-06 : 19
1997-01-03 : 11
2001-03-01 : 28
1997-01-04 : 12
2001-05-01 : 28
2006-04-01 : 30
2011-07-01 : 27
2010-07-01 : 26
2011-06-01 : 28
2011-04-01 : 28
1986-01-12 : 15
2014-04-01 : 35
2000-11-01 : 27
2015-01-01 : 36
2008-05-01 : 28
2013-06-01 : 36
2008-04-01 : 29
2010-03-01 : 28
2014-05-01 : 32
2005-02-01 : 30
2007-07-01 : 30
2004-03-01 : 30
2013-01-01 : 32
1998-01-07 : 20
2008-01-01 : 30
2011-12-01 : 28
2009-11-01 : 29
2001-04-01 : 28
2013-02-01 : 33
2006-05-01 : 29
2003-06-01 : 30
2012-02-01 : 27
2007-10-01 : 29
2002-04-01 : 30
2007-01-01 : 29
2010-06-01 : 27
2013-09-01 : 34
2003-03-01 : 31
2002-07-01 : 29
2012-03-01 : 29
2009-09-01 : 28
1986-01-09 : 15
2002-03-01 : 28
2000-10-01 : 26
1993-01-09 : 21
2009-12-01 : 27
2013-07-01 : 35
2015-02-01 : 35
2010-02-



### Diccionari (traductor) de tòpics CIS - Congrés

In [90]:
# Translation table used by get_topic_translated_list(): each key is an
# accent-free topic name and each value is the list of terms / labels that
# are mapped onto that topic. Entries are compared against the lower-cased
# label and against its normalized form (stop words and accents removed).
dict_cis = {
u'empleo':[u'empleo', u'problemas relacionados calidad empleo',u'laboral',u'reforma laboral',u'paro', u'empleos', u'acoso laboral', u'accidente laboral', u'accidentados laborales', u'accidentalidad laboral', u'accidentes laborales graves', u'accidentes laborales', u'prejubilacion', u'indole laboral', u'abusos laborales',u'accion sindical', u'desempleo', u'ambitos laborales', u'precariedad', u'contratacion', u'trabajadores', u'ocupacion', u'sindicato', u'trabajo', u'autonomo', u'autonomos', u'derechos laborales', u'salario minimo interprofesional',u'sueldo', u'salario', u'contrato temporal', u'contrato indefinido', u'contrato precario', u'parados', u'convenio laboral', u'subsidio', u'INEM', u'desempleados'],
u'salud':[u'crisis alimenticias(vacas locas, gripe aviar)',u'vih', u'salud', u'sanidad', u'accidentes vasculares', u'acciones sanitarias', u'areas clinicas',u'accion sanitaria', u'absentismo laboral',u'abuso llamado turismo sanitario', u'enfermedades', u'areas sanitarias', u'hospital', u'ambito sanitario', u'ebola', u'enfermedad', u'ambito hospitalario', u'ambito laboral', u'vacas locas', u'area sanitaria', u'contagio', u'gripe', u'vacunas', u'atencion primaria', u'medico', u'enfermero', u'sanitario', u'lista de espera', u'urgencias', u'medicina', u'jubilado', u'salmonela', u'hospitales', u'gripe aviar', u'medicos', u'ambulancias', u'enfermeros', u'vacuna', u'Organizacion Mundial de la Salud', u'medicamentos', u'OMS', u'enfermedad', u'enfermedades', u'calendario de vacunas'],
u'terrorismo':[u'terrorismo internacional (al qaeda, 11 s, 11 m, etc.)',u'terrorismo. eta', u'terrorismo', u'eta', u'al qaeda', u'terroristas', u'actuacion terrorista',u'acciones terroristas',u'acciones violentas', u'accion terrorista', u'isis', u'atentado', u'ambito terrorista', u'abandonen euskal herria', u'bomba', u'victimas', u'11m', u'11s', u'terror', u'secuestro', u'paz', u'desarme', u'tregua', u'conflicto armado', u'islamistas', u'inmolarse', u'terrorista', u'terroristas', u'estado islamico'],
u'jovenes':[u'jovenes',u'problemas relacionados ocio jovenes',u'problemas relacionados juventud', u'edad adulta', u'juventud', u'ocio', u'sexo', u'discoteca', u'balconing', u'ambito juvenil',u'problemas juventud'],
u'vivienda':[u'vivienda', u'hipoteca', u'hipotecas', u'vivienda',u'abusiva hipoteca', u'desahucio', u'indices ceca', u'ocupa', u'euribor', u'clausula suelo', u'desalojo', u'alquiler', u'vivienda proteccion oficial', u'alquiler social', u'desahucios', u'okupa', u'desalojos', u'alquileres sociales', u'ayuda al alquiler'],
u'economia':[u'economia',u'subida tarifas energeticas',u'subida iva',u'subida carburantes',u'recortes',u'problemas indole economica',u'situacion economica', u'pymes', u'exito economico', u'activacion economica', u'activos inmobiliarios', u'activacion socioeconomica',u'acontecimientos economicos',u'accionista unico', u'accionista español', u'accionista fundamental', u'accionista mayoritario', u'accionista sepi', u'accionista', u'accionistas españoles', u'accionistas individuales', u'accionistas mayoritarios', u'accionistas minoristas', u'accionistas minoritarios',u'aceleracion economica', u'accionistas norteamericanos', u'accionistas pagaron', u'accionistas perjudicados', u'accionistas principales',  u'accion economica', u'accesibilidad global', u'accesibilidad turistica',u'indole economica', u'abusivos margenes comerciales', u'abundantes recursos economicos', u'abismo economico', u'optica economica',u'organos empresariales', u'optica turistica', u'optica turistica', u'abandono presupuestario', u'indole presupuestaria', u'indole macroeconomica', u'indole comercial', u'indices economicos', u'exito economico reciente', u'indice irph', u'indices bursatiles', u'indice pmi', u'exitos comerciales', u'exitos empresariales', u'credito', u'consumo', u'area comerciales', u'areas economicas', u'zec', u'tasas', u'ambito empresarial', u'ambito turistico', u'animo recaudatorio', u'area contributiva', u'area economica', u'ambitos turisticos', 'ambitos socioeconomicos', u'ambito presupuestario', u'ambitos economicos', u'ambito productivo', u'ambito industrial', u'ambito macroeconomico', u'ambito financiero', u'comercial', u'ambito contributivo', u'ambito economico', u'ambito economico financiero', u'ambito bancario', u'ambito bilateral', u'pive', u'exportaciones', u'licitar', u'energetico', u'pives', u'mercado exterior', u'tributos', u'mercancias', u'financiero', u'exportador', u'industria', u'deficit', u'ingresos', u'amortizacion', u'hidrocarburos', u'empresarial', 
u'contribuyentes', u'energia', u'privatizacion', u'produccion', u'inversiones', u'mercado', u'reasignacion',u'turismo', u'regulacion', u'consumidores', u'economicos', u'industrial', u'exportacion', u'impulso', u'cuota', u'presupuesto', u'competitividad', u'compañia', u'comercio', u'recuperacion', u'turistico', u'ico', u'financiacion', u'importaciones', u'retenciones', u'competencia',u'economico', u'mercados', u'accionistas', u'desaceleracion', u'presupuestos', u'economia', u'euro',u'euro (influencia precios, conversion, etc.)', u'IVA', u'impuestos', u'pensiones', u'banco', u'renta', u'rescate', u'pib', u'producto interior bruto', u'impuesto', u'banco malo', u'bancos',u'subvenciones', u'subvenciones', u'sector primario', u'sector servicios', u'sector industrial', u'sector financiero', u'sector agricola', u'problemas agricultura, ganaderia pesca', u'sector turistico', u'bancarrota', u'quiebra', u'pyme', u'capita'],
u'medio ambiente':[u'problemas medioambientales',u'desastre prestige',u'trasvases', u'pac',u'actividad agraria', u'actividad agroalimentaria', u'acondicionando pasillos verdes', u'acondicionantes medioambientales',  u'atc',u'accion rural', u'abundante recursos pesqueros', u'abundante caudal', u'abundante cosecha', u'aberracion ambiental', u'abejas', u'abonos organicos', u'aberracion medioambiental', u'buque petrolero', u'areas marinas protegidas', u'areas medioambientales', u'areas inundables', u'areas naturales', u'ambito ambiental', u'ambito medioambiental' ,u'litoral',  u'vertidos', u'pesqueros', u'yacimientos', u'riadas', u'medio ambiental', u'palangreras', u'aguila', u'aguila imperial', 'aguila perdicera', u'aguilas', u'aguila imperial iberica', u'vertido', u'bobina', u'especies', u'caso Prestige', u'contaminacion', u'marinas', u'porcino', u'residuos', u'carbon', u'agrarios', u'eco', u'costa', u'costas', u'caladeros', u'caladero', u'selva', u'medio ambiente', u'zona protegida', u'humedales', u'coto', u'agricultura', u'ganaderia', u'naturaleza', u'pesca', u'ecologia', u'parque natural', u'reciclaje', u'sostenibilidad', u'emisiones', u'energias renovables', u'nuclear', u'solar', u'contaminacion', u'calentamiento global', u'ovina', u'incendio', u'inundacion', u'cultivos', u'cambio climatico', u'hundimiento', u'Prestige', u'petrolero', u'chapapote', u'ebro', u'eolico'],
u'educacion':[u'educacion',u'absentismo escolar',u'actividad academica', u'acoso escolar', u'acompañamiento escolar',u'acciones educativas novedosas', u'accion educadora', u'accion educativa española', u'accion educativa', u'exito educativo',u'abandono educativo intolerable', u'abismo educativo', u'abandono educativo temprano superior', u'abandono universitario', u'abandono educativo temprano', u'abandono educativo', u'abandono escolar prematuro', u'abandono escolar temprano', u'abandono escolar', u'indices formativos', u'organos docentes', u'exito escolar futuro', u'exito escolar', u'universidad',  u'area educativa', u'ambito educativo', u'ambito universitario',  u'ambitos academicos', u'ambito escolar', u'logse', u'leru', u'enseñanza', u'bolonia', u'estudiante', u'profesor',  u'profesores', u'informe pisa', u'bachillerato', u'becas', u'wert', u'selectividad', u'examen', u'abandono escolar', u'ambito academico', u'centro concertado', u'universidades', u'estudiantes', u'profesores', u'examenes', u'beca', u'beca Erasmus', u'erasmus', u'Seneca', u'revalida', u'tasa abandono escolar'],
u'servicios publicos e infraestructuras':[u'servicios publicos infraestructuras', u'funcionamiento servicios publicos', u'peit', u'acceso ferroviario', u'accidente aereo', u'accidentes marinos', u'accidentes ferroviarios', u'accidentes gravisimos', u'accidentes graves', u'accidentes importantes', u'accidentes aereos militares', u'accidentes aereos', u'accidentes catastroficos',  u'accidente desgraciado', u'accidente domestico', u'accidente ferroviario', u'accidente grandioso', u'accidente grave', u'accidente importantisimo', u'accidente importante', u'accesos ferroviarios', u'accesos portuarios', u'accesos viario', u'accesos viarios seguros', u'accesos viarios', u'pasajeros', u'academia general basica', u'academia general militar', u'academia militar',u'academias militares', u'ambito ferroviario',  u'transportes', u'buques', u'servicios publicos', u'infraestructuras', u'correos', u'ferrocarril', u'ave', u'aena', u'tren', u'alta velocidad', u'aeropuerto', u'puerto', u'transporte', u'autopista', u'peaje', u'cercanias', u'corredor mediterraneo', u'taxi', u'taxista', u'carretera', u'carreteras', u'aeropuertos', u'trenes'],
u'corrupcion':[u'corrupcion', u'fraude',u'fraude fiscal', u'corrupcion fraude', u'caja b', u'tesorero',u'abyecta corrupcion', u'fraudulento', u'fisco',  u'ambito sancionador', u'etica politica', u'etica profesional'],
u'inseguridad ciudadana':[u'inseguridad ciudadana',u'actividad delictiva',u'actos criminales', u'actos delictivos',  u'indice delincuencial', u'accion policial', u'accion criminal',u'abuso sexual', u'policia', u'antidisturbios', u'crimen', u'extorsion', u'delincuencia', u'crimen organizado', u'violacion', u'robos', u'robo', u'prostitucion', u'crimenes', u'violaciones'],
u'justicia':[u'excarcelaciones',u'administracion justicia',u'legislacion', u'fiscalia',u'actuacion judicial', u'actuacion justicia', u'acciones judiciales', u'optica fiscal',u'acciones legales', u'acciones penales', u'accion judicial', u'aberracion legal',u'ordenes judiciales',u'organos judiciales',u'organos jurisdiccionales',u'organos penales', u'organos legisladores', u'organo fiscalizador',u'organo judicial', u'organo jurisdiccional', u'organo legislativo',  u'indole constitucional', u'indole penal', u'abra juicio oral', u'indole fiscal', u'indole juridica',  u'justicia', u'ambitos judiciales españoles', u'ambitos judiciales internacionales', u'tribunales', u'ambito constitucional', u'ambito procesal', u'ambito prejudicial', u'ambito penal', u'area penal', u'criminales', u'ambito judicial', u'ambito juridico', u'ambito legal', u'ambito legislativo', u'ilegalidad', u'moratorias', u'delito', u'juridica', u'comparecencia', u'disposiciones', u'criminal', u'desestimacion', u'constitucion',u'ferroviario', u'juridico', u'leyes', u'detencion', u'jueces', u'interpelacion', u'tribunal', u'excarcelacion', u'juzgado', u'preso', u'abogado', u'fiscal', u'poder judicial', u'constitucional',u'judicial', u'supremo', u'presos', u'constitucionales', u'tribunal supremo', u'tribunal constitucional', u'delitos'],
u'ideologia':[u'monarquia', u'nacionalistas', u'nacionalismos (el estatuto cataluna, ...)', u'independencia',u'accion nacional', u'accion nacionalista vasca anv', u'accion nacionalista vasca', u'ordenes religiosas catolicas',u'ordenes religiosas', u'optica nacionalista', u'nacionalista', u'religion', u'nacionalismo', u'republica', u'extrema derecha', u'extrema izquierda', u'franquismo',u'ideologia', u'estatuto autonomia',u'nacionalismos'],
u'social':[u'social',u'ley aborto',u'problemas relacionados mujer',u'crisis valores',u'problemas indole social',u'violencia mujer',u'problemas sociales', u'areas sociales',u'accion social',u'acoso sexual', u'accesibilidad arquitectonica', u'accesibilidad equitativas', u'accesibilidad global arquitectonica', u'accesibilidad universal', u'accesibilidad urbana', u'accesibilidad', u'abuelos exiliados', u'abusos sexuales', u'optica sociologica', u'optica social', u'aborto autoinfligido', u'aborto clandestino', u'aborto espontaneo', u'aborto inseguro', u'aborto legal', u'aborto libre', u'aborto oficial', u'aborto propuestas', u'aborto retorne', u'aborto seguro', u'aborto voluntario', u'aborto', u'abortos anuales', u'abortos clandestinos', u'abortos ilegales', u'abortos inseguros', u'abortos legales', u'abortos practicados', u'abortos seguros', u'abortos',  u'exito social', u'indole social', u'area social', u'sociales', u'ambito social', u'elites sociales', u'emigrantes', u'cohesion social', u'libertades', u'familias', u'discriminacion', u'familiar', u'monoparentales', u'integracion', u'mujeres',  u'immigrantes', u'pobreza', u'igualdad', u'emigracion',u'parejas', u'plaza', u'problema social', u'comedores sociales', u'pobreza', u'marginacion', u'marginacion social', u'comedor social', u'drogas', u'droga', u'racismo', u'inmigracion', u'manifestacion', u'violencia domestica', u'violencia de genero', u'aborto', u'pobreza infantil', u'homosexualidad', u'gay', u'lesbiana', u'adopcion', u'matrimonio', u'divorcio', u'custodia', u'mujer', u'sociedad', u'violencia', u'repatriacion' ],
u'internacional': [u'embajada', 'paises', u'tratado',u'acciones diplomaticas correspondientes', u'indices europeos', u'organos europeos', u'organos internacionales',u'organo regulador europeo', u'cooperacion', u'franceses', u'ambito internacional', u'area  internacional' , u'area iberoamericana' , u'area latinoamericana' , u'area islamica', u'area internacional' , u'area otan' ,u'ambitos internacionales', u'ambito euro africano', u'ambito iberoamericano', u'ambito europeo', u'africa', u'ambito diplomatico', u'africa occidental subsahariana', u'africa subsahariana', u'africa occidental', u'africa negra', u'africa suboccidental', u'africa profunda', u'consulado', u'tratados', u'iberoamerica', u'internacional', u'internacionales', u'estados miembros', u'internacional', u'europa'],
u'tecnologia e investigacion': [u'tecnologia e investigacion', u'tecnologia investigacion',u'tecnologias', u'investigadores',u'cientifico', u'actividad cientifica española', u'actividad cientifica', u'elite cientifica', u'elites cientificas', u'areas tecnologicas', u'tecnologicos', u'ambito tecnologico', u'ambitos tecnologicos', u'tecnologia', u'ambito cientifico', u'cientificas', u'digitalizacion', u'investigacion', u'tecnologico', u'tecnologica', u'innovacion'],
u'cultura': [u'culturales', u'cultura', u'castellano',u'actividad artistica', u'actores', u'acontecimiento cultural', u'literatura', u'acciones culturales', 'ambito cultural', 'ambitos culturals', u'accion cultural'],
u'otros':[u'otros'],
u'fuerzas armadas':[u'fuerzas armadas'],
u'guerras':[u'guerras general (balcanes, iran, afganistan, etc.)',u'guerra',u'guerras'],
u'politica':[u'politica',u'politicos general, partidos politicos politica', u'partidos politicos',u'gobierno partidos politicos concretos'],
u'preocupaciones situaciones personales':[u'preocupaciones situaciones personales']
}

# print dict_cis

In [91]:
import unicodedata

# Strip the accents from a (unicode) string
def remove_accents(input_str):
    """Return input_str with every combining mark removed.

    The text is decomposed with NFKD so that an accented letter becomes a
    base letter followed by combining marks; the marks are then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = []
    for char in decomposed:
        if unicodedata.combining(char):
            continue
        base_chars.append(char)
    return u"".join(base_chars)

In [92]:
from nltk.corpus import stopwords

# Drop the words that belong to a given collection (custom_set_of_words)
def remove_custom_set_of_words(tokenized_doc, custom_set_of_words):
    """Return the words of tokenized_doc, in order, that are not in custom_set_of_words."""
    kept_words = []
    for word in tokenized_doc:
        if word in custom_set_of_words:
            continue
        kept_words.append(word)
    return kept_words

def normalized_word(raw_word):
    """Return raw_word lower-cased, without Spanish stop words and without accents.

    The text is split on whitespace, the NLTK Spanish stop words are removed,
    the remaining words are joined with single spaces and accents are stripped.
    """
    lowered_tokens = raw_word.lower().split()
    spanish_stopwords = stopwords.words('spanish')
    kept_tokens = remove_custom_set_of_words(lowered_tokens, spanish_stopwords)
    joined = u" ".join(kept_tokens)
    return remove_accents(joined)
    

In [93]:
def get_topic_translated_list(foreign_topic_list):
    topic_dictionary = {}
    for response in foreign_topic_list:
        response_topic = ""
        for topic in dict_cis:
            norm_response = normalized_word(response[0])
            if (norm_response and norm_response in dict_cis[topic]) or response[0].lower() in dict_cis[topic]:
                response_topic = topic
                break;

        if response_topic:
            if response_topic in topic_dictionary:
                topic_dictionary[response_topic]['value'] += response[1]
            else:
                topic_dictionary[response_topic] = {'value':response[1]}
        else:
            print "WARN:", "'" + norm_response + "' | '" + response[0].lower() + "'", "not found in dict_cis"

    #
    topic_list = []
    for topic in topic_dictionary:
        topic_list.append([topic,topic_dictionary[topic]['value']])
    topic_list.sort(key=lambda tup: tup[1], reverse=True)
    return topic_list

In [94]:
def _merge_topic_lists(list1, list2, tol, pair_ratios):
    """Shared implementation of merge_list and merge_list_v2.

    list1 and list2 are sequences of [name, value] rows; rows whose value is
    not strictly greater than tol are ignored. Names must be hashable.

    Each result row is [name, value1, value2, share]:
      * value1 / value2 come from list1 / list2 (0.0 when the name is absent
        from that list). When pair_ratios is true and the name is present in
        both lists, they are replaced by their fractions of value1 + value2.
      * share is (value1 + value2) divided by the total over all rows.

    Rows are returned sorted by share, highest first.
    """
    result = []
    first_row_by_name = {}

    for val1 in list1:
        if val1[1] > tol:
            row = [val1[0], val1[1], 0.0, val1[1]]
            result.append(row)
            # Only the first row with a given name is ever matched.
            first_row_by_name.setdefault(val1[0], row)

    for val2 in list2:
        if val2[1] > tol:
            row = first_row_by_name.get(val2[0])
            if row is None:
                row = [val2[0], 0.0, val2[1], val2[1]]
                result.append(row)
                first_row_by_name[val2[0]] = row
                continue

            pair_total = row[1] + val2[1]
            if pair_ratios:
                if pair_total:
                    # float() keeps this a true division for integer values
                    # under Python 2.
                    row[1] = row[1] / float(pair_total)
                    row[2] = val2[1] / float(pair_total)
                else:
                    # Ratios are undefined for a zero pair total.
                    row[1] = 0.0
                    row[2] = 0.0
            else:
                row[2] = val2[1]
            row[3] = pair_total

    # Normalise the combined value of every row by the grand total.
    grand_total = sum(row[3] for row in result)
    for row in result:
        # A zero grand total would divide by zero; report a 0.0 share instead.
        row[3] = row[3] / float(grand_total) if grand_total else 0.0

    # Sort by share, highest first.
    result.sort(key=lambda row: row[3], reverse=True)

    return result


def merge_list_v2(list1, list2, tol = 0):
    """Merge two [name, value] lists keeping the raw values of each list.

    Returns rows [name, value from list1, value from list2, share], where
    share is (value1 + value2) / total over all rows, sorted by share in
    descending order. Rows with value <= tol are ignored.
    """
    return _merge_topic_lists(list1, list2, tol, pair_ratios=False)


def merge_list(list1, list2, tol = 0):
    """Merge two [name, value] lists expressing shared names as ratios.

    Same as merge_list_v2, except that for a name present in both lists the
    second and third columns hold value1 / (value1 + value2) and
    value2 / (value1 + value2) instead of the raw values.
    """
    return _merge_topic_lists(list1, list2, tol, pair_ratios=True)


def intersect_lists(list1, list2):
    """Return the items of list1 that are the first element of a list2 row.

    list1 is a flat list of values and list2 a list of rows; an item is kept
    when it equals row[0] for at least one row. The order (and any
    repetition) of list1 is preserved.
    """
    return [item for item in list1
            if any(item == row[0] for row in list2)]

'''
print "len(translated_topic_CIS_list): ", len(translated_topic_CIS_list)
print "-"*120
print "len(translated_topic_congres_list)", len(translated_topic_congres_list)
print "-"*120
common_topic_list = merge_list(translated_topic_congres_list, translated_topic_CIS_list)
print common_topic_list
'''

'\nprint "len(translated_topic_CIS_list): ", len(translated_topic_CIS_list)\nprint "-"*120\nprint "len(translated_topic_congres_list)", len(translated_topic_congres_list)\nprint "-"*120\ncommon_topic_list = merge_list(translated_topic_congres_list, translated_topic_CIS_list)\nprint common_topic_list\n'

Traduïm les llistes de tòpics i fem "merge" de cada parell de llistes Congrés-CIS per a la seva visualització:

In [95]:
import datetime

def column(matrix, i):
    """Return a list with the element at position i of every row of matrix."""
    values = []
    for row in matrix:
        values.append(row[i])
    return values

congres_start_date = datetime.date(2000,9,1)
congres_end_date = datetime.date(2015,4,1)

common_topic_list = []
all_merged_topic_list = []
merged_topic_dict = {}
for congres_date in topic_congres_dict:
    d_congres_date = datetime.datetime.strptime(congres_date, "%Y-%m-%d").date()
    if congres_start_date <=d_congres_date <= congres_end_date:
        if congres_date in topic_CIS_dict:
            print "-"*120
            print congres_date
            topic_congres_dict_translated = get_topic_translated_list(topic_congres_dict[congres_date])
            topic_CIS_dict_translated = get_topic_translated_list(topic_CIS_dict[congres_date])
            print len(topic_congres_dict_translated), ":", len(topic_CIS_dict_translated)
            if len(topic_congres_dict_translated) > 0 and len(topic_CIS_dict_translated) > 0:
                # 0: topic name, 1: congres value, cis value, 2: (congres + cis)/total_month
                # fer servir merge_list_v2 per 'nations' i merge_list per bubbles
                # merged_topic_dict[congres_date] = merge_list(topic_congres_dict_translated, topic_CIS_dict_translated)
                merged_topic_dict[congres_date] = merge_list_v2(topic_congres_dict_translated, topic_CIS_dict_translated)
                print "Merged ->", len(merged_topic_dict[congres_date])
                for topic in merged_topic_dict[congres_date]:
                    try:
                        topic_index = column(all_merged_topic_list,0).index(topic[0])
                        all_merged_topic_list[topic_index][1] = max(all_merged_topic_list[topic_index][1],topic[3])
                    except ValueError:
                        all_merged_topic_list.append([topic[0],topic[3]])
                        
                #
                if len(common_topic_list) > 0:
                    common_topic_list = intersect_lists(common_topic_list,merged_topic_dict[congres_date])
                else:
                    common_topic_list = column(merged_topic_dict[congres_date],0)
            else:
                print "Not Merged."
        else:
            print "-"*120
            print "WARN:", congres_date, "not in topic_CIS_dict"


------------------------------------------------------------------------------------------------------------------------
2001-02-01
16 : 16
Merged -> 21
------------------------------------------------------------------------------------------------------------------------
2005-03-01
15 : 17
Merged -> 20
------------------------------------------------------------------------------------------------------------------------
2001-03-01
15 : 16
Merged -> 20
------------------------------------------------------------------------------------------------------------------------
2005-01-01
15 : 17
Merged -> 19
------------------------------------------------------------------------------------------------------------------------
2001-01-01
17 : 16
Merged -> 21
------------------------------------------------------------------------------------------------------------------------
2010-04-01
15 : 17
Merged -> 19
----------------------------------------------------------------------------------

In [96]:
print "-"*120
print "len(all_merged_topic_list):", len(all_merged_topic_list)
print "-"*120
print "len(common_topic_list):", len(common_topic_list)
print "-"*120
# ordenar
all_merged_topic_list.sort(key=lambda tup: tup[1], reverse=True)
print all_merged_topic_list
print "-"*120
print common_topic_list

common_topic_value_list = []
for topic in all_merged_topic_list:
    if topic[0] in common_topic_list:
        common_topic_value_list.append(topic)
print "-"*120      
print common_topic_value_list

------------------------------------------------------------------------------------------------------------------------
len(all_merged_topic_list): 22
------------------------------------------------------------------------------------------------------------------------
len(common_topic_list): 14
------------------------------------------------------------------------------------------------------------------------
[[u'empleo', 0.34389569549829468], [u'economia', 0.29984262506809861], [u'otros', 0.26451383691044716], [u'corrupcion', 0.22860534622432194], [u'internacional', 0.19209166479456677], [u'terrorismo', 0.18326687683111434], [u'social', 0.16226496737677584], [u'salud', 0.14229014482408484], [u'inseguridad ciudadana', 0.1417407621566108], [u'medio ambiente', 0.13994648200910648], [u'jovenes', 0.13779073053205945], [u'justicia', 0.12189578557773721], [u'servicios publicos e infraestructuras', 0.10324426808177009], [u'educacion', 0.085582237363205194], [u'guerras', 0.080146110872

In [97]:
# Write the per-date merged topic rows to merged_topic_dict.json.
save_dict_json(merged_topic_dict, 'merged_topic_dict.json')

In [98]:
# Read merged_topic_dict.json back from disk (presumably as a round-trip
# check of the file just written -- confirm).
loaded_merged_topic_dict = load_dict_json('merged_topic_dict.json')
# NOTE(review): the name below is not defined in the visible cells; it looks
# like a leftover from an earlier version.
# print loaded_common_topic_list

In [101]:
import copy

common_topic_dict = copy.deepcopy(merged_topic_dict)

print common_topic_list
print "-"*120

# Eliminem aquells topics que no apareguin a les N primeres posicions de la common_topic_value_list i 'otros'
if False:
    N = 14
    common_topic_list = [c_topic[0] for c_topic in common_topic_value_list[:N]]
    for date in common_topic_dict:
        common_topic_date_list = []
        sum_date_CIS = 0
        sum_date_Congres = 0
        sum_date_Total = 0
        for topic in common_topic_dict[date]:
            if topic[0] in common_topic_list and topic[0] != 'otros':
                sum_date_Congres += topic[1]
                sum_date_CIS += topic[2]
                sum_date_Total += topic[3]
                common_topic_date_list.append(topic)
        '''
        # Normalize
        for topic in common_topic_date_list:
            topic[1] = topic[1] / sum_date_Congres
            topic[2] = topic[2] / sum_date_CIS
            topic[3] = topic[3] / sum_date_Total
        '''        
        common_topic_dict[date] = common_topic_date_list
    # print common_topic_dict
else:
    for date in common_topic_dict:
        common_topic_date_list = []
        sum_date_CIS = 0
        sum_date_Congres = 0
        sum_date_Total = 0
        for topic in common_topic_dict[date]:
            if topic[0] != 'otros':
                common_topic_date_list.append(topic)
        common_topic_dict[date] = common_topic_date_list
    # print common_topic_dict

# Day 0 of the time axis and the last date considered; the printed number is
# the length of that span in days.
congres_start_date = datetime.date(2000,9,1)
congres_end_date = datetime.date(2015,3,11)
print (congres_end_date-congres_start_date).days

# Scale factors applied to each series (all neutral).
income_factor = 1
life_expectancy = 1
population = 1

# topic name -> {"income": [...], "lifeExpectancy": [...], "population": [...]},
# each series being a list of [days since congres_start_date, value] points.
# The keys follow the field names of the 'nations' chart:
#   income: CIS value, lifeExpectancy: Congres value, population: share.
common_nations_dict = {}
for congres_date in common_topic_dict:
    d_congres_date = datetime.datetime.strptime(congres_date, "%Y-%m-%d").date()
    congres_days = (d_congres_date-congres_start_date).days
    for topic in common_topic_dict[congres_date]:
        income_point = [congres_days,topic[2]*income_factor]
        life_expectancy_point = [congres_days,topic[1]*life_expectancy]
        population_point = [congres_days,topic[3]*population]

        # Create the three empty series the first time a topic is seen.
        if topic[0] not in common_nations_dict:
            common_nations_dict[topic[0]] = {"income":[],
                                             "lifeExpectancy":[],
                                             "population":[]}
        topic_series = common_nations_dict[topic[0]]
        topic_series["income"].append(income_point)
        topic_series["lifeExpectancy"].append(life_expectancy_point)
        topic_series["population"].append(population_point)
    
# print common_nations_dict

common_nations_list = []
for topic in common_nations_dict:
    if topic == 'empleo' or True:
        common_nations_dict[topic]["income"].sort(key=lambda col: col[0], reverse=False)
        common_nations_dict[topic]["lifeExpectancy"].sort(key=lambda col: col[0], reverse=False)
        common_nations_dict[topic]["population"].sort(key=lambda col: col[0], reverse=False)
        common_nations_list.append({"name":topic,"region":topic,
                                    "income":common_nations_dict[topic]["income"],
                                    "lifeExpectancy":common_nations_dict[topic]["lifeExpectancy"],
                                    "population":common_nations_dict[topic]["population"]})

print common_nations_list
save_dict_json(common_nations_list, 'common_nations_list.json')

[u'empleo', u'economia', u'otros', u'terrorismo', u'social', u'salud', u'inseguridad ciudadana', u'medio ambiente', u'jovenes', u'justicia', u'servicios publicos e infraestructuras', u'educacion', u'vivienda', u'politica']
------------------------------------------------------------------------------------------------------------------------
5304
[{'region': u'justicia', 'population': [[0, 0.0026920490967096977], [30, 0.020657823600288373], [61, 0.023604286950405624], [91, 0.026082545319083224], [122, 0.027983270632014265], [153, 0.025734637963488547], [181, 0.0091178598641557095], [212, 0.037948571838777145], [242, 0.0048287037057872474], [273, 0.040129461476214309], [303, 0.03461074902041477], [365, 0.030338815479758925], [426, 0.0036731944041847185], [456, 0.011765336689485414], [487, 0.011663245169180702], [518, 0.0097449867915472103], [546, 0.0042900042900020181], [577, 0.0086652341466120063], [607, 0.015126790827719403], [638, 0.013521687563540399], [668, 0.011201748990073084], [

In [None]:
# Read common_nations_list.json back from disk.
loaded_common_nations_list = load_dict_json('common_nations_list.json')

### Visualització

In [1]:
%%html
<style>
    /* The `font` shorthand needs at least a size and a family, so a bare
       `font: sans-serif` is an invalid declaration and is ignored by the
       browser; set the family on its own instead. */
    text {
        font-family: sans-serif;
    }
</style>

<!-- Container the bubble chart script appends its <svg> element to. -->
<div id="my_first_bubble_chart"></div>

In [2]:
%%javascript
    // Load the d3.js (v3) library from the Web through RequireJS. HTTPS is used
    // so the script is not blocked as mixed content when the notebook itself is
    // served over HTTPS.
    require.config({paths: {d3: "https://d3js.org/d3.v3.min"}});
    require(["d3"], function(d3) {
 
    // Width and height, in pixels, of the square drawing area.
    var diameter = 960;

    // Circle-packing layout: one bubble per topic, sized from its `value`.
    var bubble = d3.layout.pack()
        .sort(null)
        .size([diameter, diameter])
        .padding(1.5);

    var svg = d3.select("#my_first_bubble_chart").append("svg")
        .attr("width", diameter)
        .attr("height", diameter)
        .attr("class", "bubble");

    // Two-argument callback: d3 v3 passes the error (if any) first and the
    // parsed JSON second.
    d3.json("common_topic_list.json", function(error, data) {
      if (error || !data) {
        // Without data there is nothing to draw; report the problem instead of
        // failing later inside processData.
        console.error("Could not load common_topic_list.json", error);
        return;
      }

      // One <g> per leaf node (topic), translated to the bubble centre.
      var node = svg.selectAll(".node")
          .data(bubble.nodes(processData(data))
          .filter(function(d) { return !d.children; }))
        .enter().append("g")
          .attr("class", "node")
          .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });

      // Tooltip: overall share of the topic plus the split between the two
      // sources (k1 = Congres, k2 = CIS), each shown only when non-zero.
      node.append("title")
          .text(function(d) { 
                  var result = d.className + ": (" + (100.*d.value).toFixed(2) + "%)";
                  var sum = d.k1 + d.k2;
                  if (d.k1 > 0) {
                      result += "\nCongrés: (" + (100.*d.k1/sum).toFixed(2) + "%)";
                  }
                  if (d.k2 > 0) {
                      result += "\nCIS: (" + (100.*d.k2/sum).toFixed(2) + "%)";
                  }
                  return result; 
                });

      // Outer circle: white when CIS (k2) is larger than Congres (k1),
      // steelblue otherwise.
      node.append("circle")
          .attr("r", function(d) { return d.r; })
          .attr("stroke", function (d) { return "black" })
          .attr("stroke-width", "1")
          .style("fill", function(d) { return d.k2 > d.k1 ? "white" : "steelblue"; });

      // Inner circle in the opposite colour, scaled by the square root of the
      // smaller of k1 and k2. It gets a non-zero radius only when k1 + k2
      // exceeds 0.99 (presumably: when k1 and k2 are ratios adding up to 1 --
      // confirm against the file contents).
      node.append("circle")
          .attr("r", function(d) {
                        var result = 0;
                        if (d.k1 + d.k2 > 0.99) {
                            if (d.k1>d.k2) {
                                result = d.r * Math.sqrt(d.k2);
                            }
                            else {
                                result = d.r * Math.sqrt(d.k1);
                            }
                        }
                        return result; 
                      })
          .attr("stroke", function (d) { return "black" })
          .attr("stroke-width", "1")
          .style("fill", function(d) { return d.k2 > d.k1 ? "steelblue" : "white"; });
    
      // Centred label whose font size is a quarter of the bubble radius.
      node.append("text")
          .attr("dy", ".3em")
          .style("text-anchor", "middle")
          .style("font-size", function(d) { return Math.round(d.r/4)+"px"; })
          .text(function(d) { return d.className; })
          //.call(wrap, function(d) { return Math.round(d.r/4); }, function(d) { return d.r / 2; });
          .call(wrap, 10, 60);

    });

    // Convert the rows of the JSON file, read as
    // [name, k1, k2, value], into the {children: [...]} tree expected by the
    // pack layout.
    function processData(data) { 

        var newDataSet = [];
        for (var i = 0; i < data.length; i++) { 
            // k1 -> congres
            // k2 -> CIS
            newDataSet.push({packageName: "Topics", className: data[i][0], value: data[i][3], k1:data[i][1],k2:data[i][2]});
        }
        return {children: newDataSet};
    }
    
    // Wrapping Long Labels
    // http://bl.ocks.org/mbostock/7555321
    // Splits the text of every selected <text> element into <tspan> lines: a
    // new line is started whenever the computed length of the current line,
    // multiplied by fontSize, exceeds width.
    function wrap(text, fontSize, width) {
        text.each(function() {
            var text = d3.select(this),
                words = text.text().split(/\s+/).reverse(),
                word,
                line = [],
                lineNumber = 0,
                lineHeight = fontSize/20., // 1.1 // ems
                y = text.attr("y"),
                dy = parseFloat(text.attr("dy")),
                tspan = text.text(null).append("tspan").attr("x", 0).attr("y", y).attr("dy", dy + "em");
            while (word = words.pop()) {
              line.push(word);
              tspan.text(line.join(" "));
              
              // alert(tspan.node().getComputedTextLength()*fontSize + " > " + width)
              
              if (tspan.node().getComputedTextLength()*fontSize > width) {
                // The last word does not fit: put it back and start a new line.
                line.pop();
                tspan.text(line.join(" "));
                line = [word];
                tspan = text.append("tspan").attr("x", 0).attr("y", y).attr("dy", ++lineNumber * lineHeight + dy + "em").text(word);
              }
            }
        });
    }
        
    d3.select(self.frameElement).style("height", diameter + "px");
});

<IPython.core.display.Javascript object>