# LACLICHEV


In [None]:
#Python
import datetime
import re
import pandas as pd
import random
import os

#Data Explorer
from tools.dtExplorer import DTExplorer

#Extractor
from dataExtractors.theGuardianExtractor import TheGuardianExtractor

#Indexer
from dataIndexer.indexer import Indexer

#DB
from dataDB.dbHandler    import DBHandler
from dataDB.dbDocument   import CDocument

#Plotly
import plotly.offline        as     pOff
import plotly.figure_factory as     ff
import plotly.graph_objs     as     go
from   plotly.graph_objs     import Scatter, Layout

#GMaps
import gmaps

# Geocoding
from dataEnhancer.geocode import Geocode

# ipywidgets
import ipywidgets as widgets

# IPython
from IPython.display import clear_output

In [None]:
# Initialize DTExplorer HTML.
# NOTE(review): judging by its name this hides the notebook's code cells so
# only widgets and output are visible — confirm against tools.dtExplorer.
DTExplorer.HideCodeCells()

In [None]:
# Create the database handlers used by the notebook: one for the obtained
# data, one for the user queries and one for the curated documents.
# NOTE(review): the 2nd/3rd arguments look like the MongoDB host name and
# port — confirm against dataDB.dbHandler before changing them.
archivedDB = DBHandler('ArchivedDB', 'mongo', 27017)
queryDB    = DBHandler('QueryDB', 'mongo', 27017)
curatedDB  = DBHandler('CuratedDB', 'mongo', 27017)


# Initialize plotly in offline mode
pOff.init_notebook_mode(connected=True)

# Configure GMaps.
# SECURITY: the API key should not live in source control. It is now read from
# the GMAPS_API_KEY environment variable when set; the literal below is kept
# only as a backward-compatible fallback and should be rotated and removed.
gmaps.configure(
    api_key=os.environ.get("GMAPS_API_KEY",
                           "AIzaSyCrNWI9q6ZK1B6w2ePE3Ix-IGfIGKxBCkU")
)

In [None]:
# Load the country-code table shipped with the `tools` package into a
# DataFrame; MapDisplay reads its 'Name' and 'Cont' columns.
# NOTE(review): the path is tied to a Python 3.6 conda install location.
coutry = "/opt/conda/lib/python3.6/site-packages/tools/countryCodes.json"
df     = pd.read_json(coutry, orient='records')

In [None]:
def MapDisplay(switch):
    """Draw or clear the country-code world map when the map toggle changes.

    Args:
        switch: Change notification delivered by the widget observer; only
            ``switch['new']`` (the newly selected option label) is read.
    """
    if switch['new'] != 'Show Map':
        # Any other selection hides the map by wiping the cell output.
        clear_output()
        return

    # One numeric value per country, derived from the first two characters of
    # its 'Cont' entry (presumably the continent code), so that countries
    # sharing the same code get the same colour on the choropleth.
    shades = [(ord(c[0]) * 10) + ord(c[1]) for c in df['Cont'].tolist()]

    data = [dict(
        type='choropleth',
        locations=df['Name'],
        locationmode="country names",
        z=shades,
        text=df['Name'],
        hoverinfo='text+location',
        showscale=False,
    )]

    layout = dict(
        title='Country Codes',  # was misspelled 'Contry Codes'
        geo=dict(
            showframe=False,
            showcoastlines=False,
        ),
    )
    fig = dict(data=data, layout=layout)
    pOff.iplot(fig, validate=False, filename='d3-world-map')

In [None]:
# Region label shown on the toggle -> two-letter continent code.
continentCode = {
    'North America': 'NA',
    'South America': 'SA',
    'Europe': 'EU',
    'Asia': 'AS',
    'Africa': 'AF',
    'Oceania': 'OC',
}

# Region selector: labels and tooltips come from the mapping above, in
# insertion order (labels are the keys, tooltips are the codes).
regionSwitch = widgets.ToggleButtons(
    options=list(continentCode.keys()),
    tooltips=list(continentCode.values()),
    button_style='warning',
    disabled=False,
)

# Map toggle: starts on 'Hide Map'; MapDisplay reacts to every value change.
showMapBttn = widgets.ToggleButtons(
    options=['Hide Map', 'Show Map'],
    button_style='info',
    disabled=False,
)
showMapBttn.observe(MapDisplay, names='value')
widgets.VBox([regionSwitch, showMapBttn])

## User's search query

*Request content containing this free text.*

*Supports AND(&), OR(|) and NOT(!) operators, and exact phrase queries.*

*e.g. storm, heavy storm, snow & (rain | storms), storm & ! snow*

In [None]:
# Keep prompting until the user types a non-empty search string.
userInput = ''
while not userInput:
    userInput = input('What are you searching? ')

In [None]:
# Run the user's search through The Guardian extractor and keep its results.
theGuardian        = TheGuardianExtractor(userInput)
theGuardianContent = theGuardian.getContent()

# Record the query in QueryDB: the query as reported by the extractor, the
# current UTC time, how many articles came back, and the extractor's keywords.
queryDoc = dict(
    query=theGuardian.getQuery(),
    date=datetime.datetime.utcnow(),
    articlesSize=len(theGuardianContent),
    keys=theGuardian.getKeywords(),
)
queryDB.Insert(queryDoc)

### Indexing documents

In [None]:
# Build the indexer (with its debug flag on) and index the articles
# retrieved by the search above.
documentIndexer = Indexer(debug=True)
documentIndexer.IndexDocs(theGuardianContent)

## Frequency Matrix

Generate the frequency matrix of all the documents retrieved from the latest search

In [None]:
# Global Variables
# Placeholder until DisplayMatrix stores the generated frequency matrix here.
matrix = {}


def _rangeField(bound, label):
    """Return an integer input pinned to ``bound`` (value == min == max).

    The real limits are assigned later, once the matrix size is known.
    """
    return widgets.BoundedIntText(
        value=bound,
        min=bound,
        max=bound,
        step=1,
        description=label,
        disabled=False,
    )


def _actionButton(label, style, hint='Click me', disabled=False):
    """Return an icon-less button with the given caption, style and tooltip."""
    return widgets.Button(
        description=label,
        disabled=disabled,
        button_style=style,
        tooltip=hint,
        icon='',
    )


# Widgets: column / row range selectors
colMin = _rangeField(0, 'Min: 0')
colMax = _rangeField(1, 'Max: 0')
rowMin = _rangeField(1, 'Min: 1')
rowMax = _rangeField(2, 'Max: 2')

# Widgets: action buttons
showButton     = _actionButton('Show Table', 'info')
showHeatmap    = _actionButton('Show Heat Map', 'danger')
generateMatrix = _actionButton('Generate Matrix', 'danger',
                               hint='Matrix generation may take a while')
clearMatrix    = _actionButton('Clear Matrix', 'info', disabled=True)

# Container for the range/plot controls and output area for the generation log
boxWidget = widgets.VBox()
outM      = widgets.Output()

In [None]:
def ShowMatrixTable(b):
    """Plot the selected slice of the frequency matrix as a table.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    clear_output()
    rangeIsValid = (colMin.value < colMax.value) and (rowMin.value < rowMax.value)
    if not rangeIsValid:
        print("Wrong Table Range")
        return

    # Column 0 of every row holds its label, so data columns are shifted by
    # one; the end bound is exclusive, hence +2 to include column colMax.
    firstCol = colMin.value + 1
    lastCol  = colMax.value + 2

    # Row 0 (the header with the document index) is always included.
    tableRows = [[matrix[0][0]] + matrix[0][firstCol:lastCol]]
    tableRows.extend(
        [matrix[r][0]] + matrix[r][firstCol:lastCol]
        for r in range(rowMin.value, rowMax.value)
    )

    # Generate Table Plot
    table = ff.create_table(tableRows, index=True, index_title='Term', height_constant=20)
    pOff.iplot(table, filename='FreqMtrx')
        
def ShowHeatmap(b):
    """Plot the selected slice of the frequency matrix as a heat map.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    clear_output()
    rangeIsValid = (colMin.value < colMax.value) and (rowMin.value < rowMax.value)
    if not rangeIsValid:
        print("Wrong Table Range")
        return

    # Column 0 of every row holds its label, so data columns are shifted by
    # one; the end bound is exclusive, hence +2 to include column colMax.
    firstCol = colMin.value + 1
    lastCol  = colMax.value + 2

    columnLabels = matrix[0][firstCol:lastCol]
    rowLabels    = []
    cells        = []
    for r in range(rowMin.value, rowMax.value):
        # NOTE(review): the format string appears to carry non-printing
        # characters before {0}; keep it byte-for-byte — presumably it stops
        # numeric-looking labels from being read as numbers. Confirm.
        rowLabels.append('‌‌{0}'.format(matrix[r][0]))
        cells.append(matrix[r][firstCol:lastCol])

    heat = go.Heatmap(z=cells, x=columnLabels, y=rowLabels)
    pOff.iplot([heat], filename='heatmap')

def ClearWidgets(b):
    """Remove the matrix controls and output, and re-enable generation.

    Args:
        b: The clicked button, or any placeholder when called directly (unused).
    """
    # Drop the range/plot controls, then wipe the cell output.
    boxWidget.close()
    clear_output()

    # Wipe whatever was captured in the matrix output area as well.
    with outM:
        clear_output()

    # Back to the initial button state: generation allowed, nothing to clear.
    clearMatrix.disabled    = True
    generateMatrix.disabled = False
    
def UpdateShowWidgets():
    """Fit the range selectors to the current matrix and show the controls."""
    global boxWidget

    columnCount = len(matrix[0])
    rowCount    = len(matrix)

    # Update Widgets: upper bounds follow the matrix dimensions.
    colMin.max         = columnCount - 3
    colMax.max         = columnCount - 2
    colMax.description = 'Max: {0}'.format(columnCount - 2)
    rowMin.max         = rowCount - 1
    rowMax.max         = rowCount
    rowMax.description = 'Max: {0}'.format(rowCount)

    # Configure Button Widgets
    showButton.on_click(ShowMatrixTable)
    showHeatmap.on_click(ShowHeatmap)

    # Configure Accordion (collapsed by default) and display it
    columnRange = widgets.HBox([colMin, colMax])
    rowRange    = widgets.HBox([rowMin, rowMax])
    accordion   = widgets.Accordion(children=[columnRange, rowRange])
    accordion.selected_index = None
    for position, title in enumerate(('Column Matrix Range', 'Row Matrix Range')):
        accordion.set_title(position, title)

    plotButtons = widgets.HBox([showHeatmap, showButton])
    boxWidget   = widgets.VBox([accordion, plotButtons])
    display(boxWidget)
    clearMatrix.disabled = False
            
def DisplayMatrix(b):
    """Generate the frequency matrix and show the controls to explore it.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    global matrix

    # Start from a clean slate, then block the button while generating.
    ClearWidgets(True)
    generateMatrix.disabled = True

    # Anything printed during generation is captured in the output widget.
    with outM:
        generated = documentIndexer.FreqMatrix(scattered=True)
        matrix = generated

    UpdateShowWidgets()

In [None]:
# Wire the matrix buttons to their handlers and show them together with the
# output area that captures the generation log.
generateMatrix.on_click(DisplayMatrix)
clearMatrix.on_click(ClearWidgets)
display(widgets.HBox([generateMatrix, clearMatrix]), outM)

## Top 10
Cosine Similarity

In [None]:
# Build the frequency matrix (byTerms=False) and keep the first ten entries
# returned by the similarity search for the user's query.
# NOTE(review): assumes GetSimilarity returns results best-first — confirm.
freqMatrix = documentIndexer.FreqMatrix(byTerms=False)
top10      = documentIndexer.GetSimilarity(userInput, freqMatrix)[:10]

## Explore Content

In [None]:
#Global Widgets
# These start as empty placeholders; the handlers below close and replace
# them each time a document is explored or its cities are shown.
citiesAcc   = widgets.Accordion()
formAcc     = widgets.Accordion()
outG        = widgets.Output()
mapFig      = gmaps.figure()
showCitites = widgets.Button(description='Show Cities', disabled=False, button_style='warning')
submit      = widgets.Button(description='Submit Document', disabled=False, button_style='info')
#Global Variables
# features:   feature collection produced by the last geocoding run
# citiesWdgt: one text widget per entity found in the selected document
# newGPEWdgt: text widget for extra, user-entered entities (set by ExploreContent)
features   = {}
citiesWdgt = []
newGPEWdgt = None

def ExploreContent(docID):
    """Show a document and list the entities found in it as editable fields.

    Closes the widgets left over from the previously explored document,
    displays the parsed document, and builds an accordion holding one text
    field per entity plus a free-text field for adding more.

    Args:
        docID: Identifier of the document to show. It is now used directly;
            the function previously ignored it and read the global slider,
            which holds the same value when invoked through ``interactive``.
    """
    # Only the names rebound below need a ``global`` declaration; the other
    # shared widgets are just closed.
    global citiesWdgt
    global newGPEWdgt
    global citiesAcc
    global features

    #Close Widgets
    citiesAcc.close()
    formAcc.close()
    outG.close()
    mapFig.close()

    # Forget the geocoding result of the previously selected document.
    features = {}

    explorer = DTExplorer()
    content  = documentIndexer.GetDocField(docID)
    display(explorer.Parse(content))
    cities = explorer.GetNamedEntities()

    # One editable field per entity, labelled with its position.
    citiesWdgt = [
        widgets.Text(
            value=city,
            description='GPE[{0}]: '.format(position),
            disabled=False
        )
        for position, city in enumerate(cities)
    ]

    newGPEWdgt = widgets.Text()
    citiesAcc  = widgets.Accordion(children=[
        widgets.VBox(citiesWdgt),
        widgets.VBox([widgets.HTML('GPEs as a comma-separated list'), newGPEWdgt]),
    ])
    citiesAcc.selected_index = None  # start collapsed
    citiesAcc.set_title(0, 'Geo-Political Entities Found')
    citiesAcc.set_title(1, 'Add Geo-Political Entities')
    display(citiesAcc)

In [None]:
# First field of each top-10 entry, as an int (presumably the document id),
# in the order the similarity search returned them.
docList = [int(item[0]) for item in top10]

# Slider over those ids; the handler fires only when the handle is released.
docSlider = widgets.SelectionSlider(
    options=docList,
    value=docList[0],
    description='Select Document: ',
    orientation='horizontal',
    continuous_update=False,
    readout=True,
    disabled=False,
)

# Re-run ExploreContent whenever the slider selection changes.
contentWidget = widgets.interactive(ExploreContent, docID=docSlider)
contentWidget

## Geopolitical Entities

Search for the geopolitical entities found in the document and display them.

In [None]:
def ShowCities(b):
    """Geocode the listed entities and show them on a map and in the form.

    Collects the non-empty entity fields plus any comma-separated extras,
    looks each one up within the region selected on the region toggle, and
    reports the ones that could not be found. When at least one feature is
    produced, the map and the submission form are displayed.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    # Only the names rebound below need a ``global`` declaration.
    global outG
    global features

    outG.close()
    mapFig.close()
    formAcc.close()

    gpeList    = {}
    geolocator = Geocode()
    outG       = widgets.Output()
    bar        = widgets.IntProgress(value=0, min=0, max=1, description='Searching:')

    #Update GPEs
    searchGPE = [cityWdgt.value for cityWdgt in citiesWdgt if cityWdgt.value]

    # Extra entities typed by the user. Names are trimmed and blanks dropped so
    # a trailing comma or stray spaces no longer produce empty or padded
    # lookups. The widget is None until a document has been explored.
    if newGPEWdgt is not None and newGPEWdgt.value:
        searchGPE += [gpe.strip() for gpe in newGPEWdgt.value.split(',') if gpe.strip()]

    bar.max = len(searchGPE)
    display(bar, outG)

    region = continentCode[regionSwitch.value]
    for gpe in searchGPE:
        bar.value += 1
        location = geolocator.GetGPE(gpe, region)
        if location:
            gpeList.update(location)
        else:
            with outG:
                print('✗.- GPE: "{0}" not found'.format(gpe), flush=True)

    bar.close()
    features = geolocator.GetFeatureCollection(gpeList)
    if features['features']:
        DisplayMap(features)
        DisplayForm(features)

def DisplayMap(features):
    """Replace the current map with one marker per feature.

    Each feature's name and location are also printed, numbered from 1, into
    the shared output widget.

    Args:
        features: Mapping whose ``'features'`` entry is a list of items with a
            ``'properties'`` dict providing ``name``, ``location``,
            ``latitude`` and ``longitude``.
    """
    global mapFig
    mapFig.close()

    #Obtain GeoJSON Features
    infoBoxTemplate = """ <dl> <dt>{name}</dt> <dd>{location}</dd> <dt>Location</dt><dd>{latitude}, {longitude}</dd> </dl>"""
    markers, markersInfo, cityLabels = [], [], []
    for position, feature in enumerate(features['features'], start=1):
        props = feature['properties']
        markers.append((props['latitude'], props['longitude']))
        markersInfo.append(infoBoxTemplate.format(**props))
        cityLabels.append(props['name'])
        with outG:
            print("{0}.- {1} - {2}".format(position, props['name'], props['location']), flush=True)

    # Fresh figure with a single marker layer: hover shows the name, clicking
    # opens the info box.
    mapFig = gmaps.figure()
    mapFig.add_layer(
        gmaps.marker_layer(markers, hover_text=cityLabels, info_box_content=markersInfo)
    )
    display(mapFig)

def DisplayForm(features):
    """Build and display the submission form for the selected document.

    The form is a collapsed accordion holding three tabs (document
    information, geocoded entities, tags) and the submit button.

    Widget sizing uses ``widgets.Layout``. The bare ``Layout`` name in this
    notebook is plotly's figure layout class, which is not a widget layout.

    Args:
        features: Mapping whose ``'features'`` entry is a list of items with a
            ``'properties'`` dict providing at least ``location``.
    """
    global formAcc
    formAcc.close()

    docID = docSlider.value

    def wide():
        # A separate layout object per widget, as before.
        return widgets.Layout(width='75%')

    #Information Tab
    docName = widgets.Text(description='Name: ', value=documentIndexer.GetDocField(docID, Indexer.NAME), layout=wide())
    # Only the first 10 characters of the date are kept (presumably YYYY-MM-DD).
    docDate = widgets.Text(description='Date: ', value=documentIndexer.GetDocField(docID, Indexer.DATE)[:10], layout=wide())
    docUrl  = widgets.Text(description='URL:  ', value=documentIndexer.GetDocField(docID, Indexer.URL), layout=wide(), disabled=True)

    #GPEs Tab: read-only list of the resolved locations
    featuresWdgt = [
        widgets.Text(value=feature['properties']['location'],
                     description='GPE[{0}]: '.format(position),
                     layout=wide(),
                     disabled=True)
        for position, feature in enumerate(features['features'])
    ]

    #Tags Tab: stored tags are '|'-separated; show them comma-separated
    tags    = documentIndexer.GetDocField(docID, Indexer.TAGS).replace('|', ', ')
    docTags = widgets.Text(value=tags, layout=wide())
    docTags = widgets.VBox([widgets.HTML('Tags as a comma-separated list'), docTags])

    #Tab Widget
    # NOTE: submitDoc reads the form values by child position, so the nesting
    # and ordering of these containers must not change.
    infoBox = widgets.VBox([docName, docDate, docUrl])
    tabNest = widgets.Tab(children=[infoBox, widgets.VBox(featuresWdgt), docTags])

    #Edit Tab Titles
    tabNest.set_title(0, 'Information')
    tabNest.set_title(1, 'Geo-Political Entities')
    tabNest.set_title(2, 'Tags')

    formAcc = widgets.Accordion(children=[widgets.VBox([tabNest, submit])])
    formAcc.selected_index = None  # start collapsed
    formAcc.set_title(0, 'LACLICHEV Form')
    display(formAcc)
    
def submitDoc(b):
    """Store the reviewed document in the curated database.

    Reads the title, date, URL and tags from the form by child position,
    combines them with the document content, the words of the search query
    and the last geocoding result, and inserts the resulting document.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    # Form layout: Accordion -> VBox -> Tab -> [info VBox, GPE VBox, tags VBox]
    tabs    = formAcc.children[0].children[0]
    infoBox = tabs.children[0]

    # Document Information
    title   = infoBox.children[0].value
    date    = infoBox.children[1].value
    url     = infoBox.children[2].value
    content = documentIndexer.GetDocField(docSlider.value)

    #Tags: trimmed, with blanks (e.g. from a trailing comma) dropped
    rawTags  = tabs.children[2].children[1].value
    tagsInfo = [tag.strip() for tag in rawTags.split(',') if tag.strip()]

    # Query words: quotes removed, split on non-word characters. The pattern
    # is a raw string so '\W' is not treated as a string escape; empty pieces
    # (query starting or ending with an operator) are dropped.
    queryText = queryDoc['query'].strip().replace('"', '')
    tagsQuery = [word for word in re.split(r'\W+', queryText) if word]

    # Geo-Political Entities
    gpeInfo   = features

    #Create the Curated Document and Insert it
    # NOTE(review): the trailing ['a', 'b'] looks like placeholder data —
    # confirm what CDocument expects for its last argument.
    doc = CDocument(title, url, date, content, tagsInfo, tagsQuery, gpeInfo, ['a','b'])
    curatedDB.Insert(doc.dictDump())

In [None]:
# Wire the form's submit button and the 'Show Cities' button to their
# handlers; the trailing expression renders the 'Show Cities' button.
submit.on_click(submitDoc)
showCitites.on_click(ShowCities)
showCitites

In [None]:
%%HTML

<!-- Button-styled link that jumps back to the "Explore Content" heading. -->
<a href="#Explore-Content" class="jupyter-widgets jupyter-button widget-button mod-info" style="text-decoration: none;">Keep exploring</a>