In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import spacy
from datetime import datetime, timedelta
import psycopg2
import ast
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [3]:
from idetect.interpreter import Interpreter

In [4]:
from idetect.load_data import load_countries, load_terms

In [5]:
from idetect.model import Base, Session, Analysis, db_url, Location, Country, Fact, Document, Analysis, DocumentType, Status
from sqlalchemy import create_engine

In [6]:
engine = create_engine(db_url())
Session.configure(bind=engine)

In [7]:
Base.metadata.drop_all(engine)

In [8]:
Base.metadata.create_all(engine)

In [9]:
session = Session()

In [10]:
load_countries(session)

In [11]:
load_terms(session)

In [14]:
nlp = spacy.load("en_default")

In [15]:
interpreter = Interpreter(session, nlp)

In [16]:
content = """South Africa is arresting a growing number of asylum seekers when they attempt to renew their asylum permits. Mxolisi Ncube speaks to Zimbabweans sent home and the human rights groups challenging the deportations.

PRETORIA, South Africa – Sheila* woke at 3 a.m. on a cold Tuesday morning. After hitchhiking three rides from Johannesburg, she completed the 38-mile journey to Pretoria and arrived at the Desmond Tutu Refugee Reception Office.

Joining a queue of fellow migrants, the 38-year-old from Zimbabwe realized there were no more than 75 people ahead of her, which meant she stood a good chance of submitting her asylum seeker’s permit for a renewal stamp.

Three hours later – after pushing and shoving in the crowd, and bribing a security guard to hold her spot in a line that had grown to more than 500 people – she finally managed to submit her permit.

About 10 a.m., a woman came out of the office holding a pile of papers and started calling names. Those called were ushered to a South African police truck parked at the corner of the yard.

Without explanation, Sheila and about 45 other migrants were crammed into the truck and taken to the Lindela Repatriation Center, where they were told they had been arrested because their applications for refugee status had been rejected. She was detained at Lindela for four months, then deported to Beitbridge, Zimbabwe, just across the border from northern South Africa.
Growing Arrests

More than a million people have sought asylum in South Africa since 2006, according to the country’s Department of Home Affairs (DHA). Most come from Zimbabwe, while others are from Nigeria, Ethiopia, Mozambique and other African nations, as well as countries such as India and Pakistan.

Amid a backlog of asylum applications, an increasing number of asylum seekers have been deported in recent months, often without due process, say South African human rights groups.

Between 50 and 150 people are arrested each day when they attempt to renew their permits, according to estimates by rights groups. They are detained at Lindela and eventually deported to their home country.

“We are concerned about the possibilities of abuse of process,” said Sharon Ekambaram, manager of Lawyers for Human Rights’ Refugee and Migrant Rights Project. In many cases, asylum seekers do not receive written decisions of final rejection, but are merely advised verbally and given a notice to appear before the immigration inspectorate for deportation, she said.

Asylum seeker permits – also known as Section 22 permits – are valid for six months and make it legal for people to stay in South Africa pending a decision on their asylum application. The system calls for several rounds of reviews before an asylum seeker can be rejected and deported.

Before the permit expires, the asylum seeker is interviewed by a refugee status determination officer, who makes a ruling – either granting asylum, rejecting the application or referring questions of law to the Standing Committee for Refugee Affairs (SCRA). In case of rejection, an asylum seeker can appeal within 30 days. The Refugee Appeal Board then conducts another hearing before deciding whether to confirm, set aside or substitute the initial decision.

Observers claim, however, that this process is not being followed.

“We have instances where the SCRA had not made a decision, despite the contrary being communicated to the client,” Ekambaram said. “We have requested written decisions signed by the SCRA, and while we have noted a few receiving some, it appears not to be widely used.”

The African Diaspora Forum, a nongovernmental organization established in 2008 to safeguard the rights of migrants in South Africa, said it had not held a dialogue with the DHA about the issue since 2016. DHA officials “are not ready to be engaged on the matter,” said African Diaspora Forum Chairman Marc Gbaffou, an Ivorian refugee who lives in South Africa.

Gbaffou said many migrants are held for 120 days or more in Lindela, beyond the legal maximum of 90 days. “Most of these would have had their permits expired because they were denied a chance to renew them by the guards and officials wanting to deal with a certain number,” he said.
Asylum System in Crisis

The Zimbabwe Exiles Forum, a nonprofit organization formed by political and economic refugees in South Africa, said it has received many reports of arrests and deportations in recent months.

“We have visited Lindela and noted with serious concern that those arrested for deportation include those either attempting to apply for or renewing asylum and refugee status,” said Gabriel Shumba, executive director of Zimbabwe Exiles Forum. “This practice is not only insensitive, but falls afoul of international prescripts.”

    “The current asylum protection system is in crisis and is effectively nonfunctional.”

Ekambaram said the DHA’s actions ignore South Africa’s international and domestic legal commitments. “[South Africa’s] Refugees Act established a parallel legal framework, separate from the Immigration Act, which sets up its own procedures for the detention of asylum seekers and refugees and prohibits their detention as illegal foreigners under the Immigration Act,” she said.

“Yet, despite the existence of a separate legal regime for asylum seekers and refugees, the DHA has applied the Immigration Act, arresting [asylum seekers] as illegal foreigners and subjecting them to arbitrary, indefinite and unlawful detention pending deportation,” she said. “These activities fall outside of the [DHA’s] authority. The current asylum protection system is in crisis and is effectively nonfunctional.”

The DHA declined to respond to questions from Refugees Deeply. Lawyers for Human Rights has submitted written and verbal requests to the DHA for an independent oversight mechanism, in an effort to hold the department accountable.
‘Keep Running’

Sheila initially fled to South Africa from Lupane, Zimbabwe, to escape political persecution. She received death threats from state security agents, she claimed, because of her active role in the Movement for Democratic Change, the main opposition party in Zimbabwe.

When she was deported to Beitbridge, she feared for her life if she returned to Lupane. So she came back to South Africa and now lives undocumented in Johannesburg – too afraid to claim asylum again.

“I want to be legal in South Africa because I cannot keep running away from the police,” she said. “But I am now afraid to go back and apply for new asylum because I might get arrested again. For now, I will have to keep running from the police or bribing them.

“I wish the 2018 elections will bring in a new government and make things normal in Zimbabwe again, so that I can return and live a normal life.”"""

In [17]:
facts = interpreter.process_article_new(content)

In [18]:
facts

[]

In [9]:
document = Document(type=DocumentType.WEB,
                            name="Hurricane Katrina Fast Facts")
analysis = Analysis(document=document, status=Status.NEW)
session.add(analysis)

In [10]:
session.commit()

In [11]:
fact = Fact(unit='person', term='displaced')

In [12]:
session.add(fact)
session.commit()

In [13]:
loc1 = session.query(Location).filter(Location.location_name == 'India').one_or_none()

In [16]:
loc2 = session.query(Location).filter(Location.location_name == 'Pakistan').one_or_none()

In [17]:
loc2

<idetect.model.Location at 0x7f1feb1a2630>

In [18]:
fact.locations.append(loc1)
fact.locations.append(loc2)
analysis.facts.append(fact)
session.commit()

In [21]:
fact = analysis.facts[0]

In [45]:
country_locations = fact.locations

In [46]:
country_locations

[<idetect.model.Location at 0x7f1feb1a2cf8>,
 <idetect.model.Location at 0x7f1feb1a2630>]

In [48]:
country_locations.sort(key=lambda x: x.country.iso3)

In [70]:
country_groups = list(groupby(country_locations, lambda x: x.country.iso3))

In [72]:
country_groups = [(key, [loc for loc in group]) for key, group in groupby(country_locations, lambda x: x.country.iso3)]

In [30]:
fact.locations = []

In [32]:
fact.iso3 = country_groups[0][0]

In [34]:
fact.locations = [location for location in country_groups[0][1]]

In [41]:
list(country_groups[1][1])

[<idetect.model.Location at 0x7f1feb1a2630>]

In [35]:
fact.locations

[]

In [43]:
session.rollback()

In [55]:
df = pd.read_csv("../Data/idmc_facts_content.csv")

In [94]:
df = df[['content_y', 'data']]

In [95]:
df['data'] = df['data'].apply(lambda x: ast.literal_eval(x))
df['term'] = df['data'].apply(lambda x: x['term'])
df['unit'] = df['data'].apply(lambda x: x['unit'])
df['excerpt'] = df['data'].apply(lambda x: x['excerpt'])

In [96]:
df = df[113:].copy()
df.reset_index(inplace=True, drop=True)

In [108]:
def detect_language(s):
    try:
        return detect(s)
    except LangDetectException:
        return 'na'
    
def choose_report(reports):
    '''Choose report based on the heuristics mentioned in the first cell
    '''
    people_reports = []
    household_reports_1 = []
    household_reports_2 = []

    for r in reports:
        if r.reporting_unit == "Person":
            people_reports.append(r)
        elif r.reporting_unit == "Household":
            if r.reporting_term in ("Partially Destroyed Housing", "Uninhabitable Housing"):
                household_reports_2.append(r)
            else:
                household_reports_1.append(r)
    if len(people_reports) > 0:
        report = first_report(people_reports)
    elif len(household_reports_1) > 0:
        report = first_report(household_reports_1)
    elif len(household_reports_2) > 0:
        report = first_report(household_reports_2)
    else:
        report = reports[0]

    return report

def first_report(reports):
    '''Choose the first report based on location in text'''
    report_locs = []
    for report in reports:
        report_locs.append((report, minimum_loc(report.tag_spans)))
    return sorted(report_locs, key=lambda x: x[1])[0][0]

def get_report(reports):
    '''Get reports based on Excerpt and choose the most relevant one'''
    if len(reports) > 0:
        report = choose_report(reports)
        return report.quantity, report.reporting_term, report.reporting_unit, report.locations
    else:
        return 0, '', '', ''
    
def minimum_loc(spans):
    '''Find the first character location in text for each report
    '''
    locs = []
    for s in spans:
        if s['type'] != 'loc':
            locs.append(s['start'])
    return min(locs)

In [99]:
df['lang'] = df['content_y'].apply(lambda x: detect_language(str(x)))

In [100]:
df = df[df['lang'] == 'en']

In [101]:
df['reports'] = df['content_y'].apply(lambda x: interpreter.process_article_new(x))

### See how well we can do for Reporting Unit

In [115]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_selection import SelectFpr
from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter, defaultdict
from textacy.similarity import word2vec, word_movers
import re
import string
import gensim
from numpy import dot 
from numpy.linalg import norm

In [116]:
parser = spacy.en.English()
punctuations = string.punctuation

In [117]:
RANDOM_STATE = 42

In [118]:
train, test = train_test_split(df, test_size = 0.25, random_state=RANDOM_STATE)

### How well does "Rules" approach do?

In [120]:
(test['first_unit'] == test['unit']).sum() / len(test)

0.40192926045016075

In [131]:
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]
    return tokens

In [121]:
# Customer Transformer for creating pipeline of classifiers and predicting probabilities
class ProbaExtractor(TransformerMixin):
    def __init__(self, classifiers):
        self.classifiers = classifiers
        
    def fit(self, *args, **kwargs):
        for cl in self.classifiers:
            cl.fit(*args, **kwargs)
        return self
    
    def transform(self, X, **transform_params):
        return np.concatenate([cl.predict_proba(X) for cl in self.classifiers], axis=1)

In [122]:
# Customer Transformer to extract the sentence embedding as an average of word embeddings (using word2vec)
class MeanEmbeddingVectorizer(TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = (300,)

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([w.vector for w in nlp(sentence)]
                    or [np.zeros(self.dim)], axis=0)
            for sentence in X
        ])

In [123]:
# Customer Transformer to find cosine difference between sentence vector & average true sentence vector
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

class CosDifference(TransformerMixin):
    def __init__(self, nlp, sentences):
        self.nlp = nlp
        self.true_mean = np.mean(np.array([
            np.mean([w.vector for w in nlp(sentence)]
                    or [np.zeros(self.dim)], axis=0)
            for sentence in sentences
        ]), axis=0)
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = (300,)

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array(
            [cosine(np.mean([w.vector for w in nlp(sentence)], axis=0), self.true_mean)
             for sentence in X]).reshape(X.shape[0], 1)

In [124]:
# Transformer for getting distribution of punctuation
class Punctuation(TransformerMixin):
    def __init__(self, punctuations):
        self.punctuations = punctuations
        
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return np.array(
            [
              [sentence.count(p) for p in self.punctuations] for sentence in X
            ])


In [125]:
def capitals_fractions(sentence):
    words = sentence.split(" ")
    num_words = len(words)
    # Total fraction of capitalized words
    one_grams = sum([word.isupper() for word in words])
    bi_grams = 0
    for w1, w2 in zip(words, words[1:]):
        if all(w.isupper() for w in (w1, w2)):
            bi_grams += 1
    tri_grams = 0
    for w1, w2, w3 in zip(words, words[1:], words[2:]):
        if all(w.isupper() for w in (w1, w2, w3)):
            tri_grams += 1
    return np.array([one_grams, bi_grams, tri_grams]) / num_words

class Capitals(TransformerMixin):
    
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return np.array([capitals_fractions(sentence) for sentence in X])


In [127]:
# Transformer for converting from sparse to dense matrix
class Densify(TransformerMixin):
    def fit(self, *args, **kwargs):
        return self
    
    def transform(self, X, **transform_params):
        return X.toarray()

In [163]:
# To do: punctuation & capital letter distribution
def get_features(true_sentences):
    return FeatureUnion([
            ('sentence_vector', MeanEmbeddingVectorizer(nlp)), #Average embedding for sentence
            ('count_vectorizer', CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2))), 
            ('cos_difference', CosDifference(nlp, true_sentences)) #Cosine difference of sentence vector from average true sentence
            #('w2v_difference', ) #W2V difference of sentence from All true sentences
        ])

In [164]:
def artesenal_model(true_sentences):
    model = Pipeline([
            ('feature_union', get_features(true_sentences)),
            ('feature_selection', SelectFpr()),
            ('dense', Densify()),
            ('probas', ProbaExtractor([
                        AdaBoostClassifier(n_estimators=300),
                        ExtraTreesClassifier(n_estimators=300),
                        RandomForestClassifier(n_estimators=400),
                        LogisticRegression(),
                        BaggingClassifier(),
                        KNeighborsClassifier(),
                        GradientBoostingClassifier()
                    ])), #Need to write custom extractor
            ('polynomial', PolynomialFeatures(degree=2)),
            ('classify', LogisticRegression(C=0.5))
        ])
    return model

In [165]:
art_model = artesenal_model(train['content_y'])

In [166]:
art_model.fit(train['content_y'], train['unit'])

Pipeline(memory=None,
     steps=[('feature_union', FeatureUnion(n_jobs=1,
       transformer_list=[('sentence_vector', <__main__.MeanEmbeddingVectorizer object at 0x7f93028e1dd8>), ('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [134]:
test['model_prediction_unit'] = art_model.predict(test['content_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [135]:
(test['model_prediction_unit'] == test['unit']).sum() / len(test)

0.86173633440514474

In [150]:
# Customer Transformer to extract the sentence embedding as an average of word embeddings (using word2vec)
class HandCraftedRules(TransformerMixin):
    def __init__(self, nlp, session):
        self.interpreter = Interpreter(session, nlp)

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([self.predict_unit(sentence)
            for sentence in X
        ])
    
    def predict_unit(self, sentence):
        reports = self.interpreter.process_article_new(sentence)
        top_report = get_report(reports)
        report_unit = top_report[2]
        return [report_unit == 'Person', report_unit == 'Household', report_unit not in ('Person', 'Household')]

In [151]:
## Update features

In [152]:
# To do: punctuation & capital letter distribution
def get_features(true_sentences):
    return FeatureUnion([
            ('sentence_vector', MeanEmbeddingVectorizer(nlp)), #Average embedding for sentence
            ('count_vectorizer', CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,2))), 
            ('cos_difference', CosDifference(nlp, true_sentences)), #Cosine difference of sentence vector from average true sentence
            ('rules', HandCraftedRules(nlp, session)) #W2V difference of sentence from All true sentences
        ])

In [153]:
art_model = artesenal_model(train['content_y'])

In [154]:
art_model.fit(train['content_y'], train['unit'])

Pipeline(memory=None,
     steps=[('feature_union', FeatureUnion(n_jobs=1,
       transformer_list=[('sentence_vector', <__main__.MeanEmbeddingVectorizer object at 0x7f930292def0>), ('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [155]:
test['model_prediction_unit_2'] = art_model.predict(test['content_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [156]:
(test['model_prediction_unit_2'] == test['unit']).sum() / len(test)

0.85209003215434087

### Train Linear Model on Both Outputs

In [159]:
from sklearn.linear_model import LogisticRegressionCV

In [160]:
searchCV = LogisticRegressionCV()

In [170]:
train.head(2)

Unnamed: 0,content_y,data,term,unit,excerpt,lang,reports,first_unit,first_term,top_report,model_prediction
943,This Operations Update requests an extension o...,{'excerpt': 'According to the assessment data ...,Displaced,Person,According to the assessment data collected by ...,en,[Locations:Mgomba Unit:Person Term:Displaced Q...,Person,Displaced,"((3000, None), Displaced, Person, [Mgomba])",Person
986,"Apriadi Gunawan, Syofiardi Bachyul Jb and Riza...","{'excerpt': '""The floods submerged 1,606 house...",Uninhabitable Housing,Household,"""The floods submerged 1,606 houses in Kampar, ...",en,[Locations:Lima Unit:Person Term:Displaced Qua...,Person,Displaced,"((None, ), Displaced, Person, [Binjai])",Household


In [169]:
train['model_prediction'] = art_model.predict(train['content_y'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [171]:
final_train = train[['model_prediction', 'first_unit', 'unit']].copy()

In [172]:
final_train_X = final_train[['model_prediction', 'first_unit']]

In [174]:
final_train_X = pd.get_dummies(final_train_X)

In [175]:
searchCV.fit(final_train_X, final_train['unit'])

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [176]:
test.head(1)

Unnamed: 0,content_y,data,term,unit,excerpt,lang,reports,first_unit,first_term,top_report,model_prediction_unit,model_prediction_unit_2
1263,"Share the news ""At the moment the emergency si...","{'excerpt': 'At present, the water was withdra...",Evacuated,Person,"At present, the water was withdrawn from 421 h...",en,"[Locations:Akmola,Aktobe,East Kazakhstan,Kosta...",Person,Evacuated,"((969, None), Evacuated, Person, [Akmola, Akto...",Person,Person


In [178]:
final_test = test[['model_prediction_unit', 'first_unit']].copy()

In [179]:
final_test.columns = ['model_prediction', 'first_unit']

In [180]:
final_test_X = pd.get_dummies(final_test)

In [182]:
test['combined_unit'] = searchCV.predict(final_test_X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [183]:
(test['combined_unit'] == test['unit']).sum() / len(test)

0.86173633440514474

### Naive Combination

In [185]:
def combine_predictions(classifier, rules):
    if classifier == rules:
        return classifier
    elif not rules or rules == '':
        return classifier
    else:
        return rules

In [187]:
test['naive'] = test[['model_prediction_unit', 'first_unit']].apply(lambda x: combine_predictions(x['model_prediction_unit'], x['first_unit']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [188]:
(test['naive'] == test['unit']).sum() / len(test)

0.84565916398713825

In [109]:
df['top_report'] = df['reports'].apply(lambda x: get_report(x))

In [112]:
df['first_unit'] = df['top_report'].apply(lambda x: x[2])
df['first_term'] = df['top_report'].apply(lambda x: x[1])

In [113]:
(df['first_unit'] == df['unit']).sum() / len(df)

0.44292604501607719

In [87]:
interpreter.process_article_new(s)[0]

Locations: Unit:Person Term:Destroyed Housing Quantity:(29, None)

In [74]:
df[df['first_unit'] == 'People']

Unnamed: 0,content_y,data,term,unit,excerpt,lang,reports,first_unit,first_term
10,6 月 20 日からの西日本の大雨に係る被害状況等について ※これは速報であり、数値等は今後...,{'excerpt': 'Households Totally destroyed = 1...,Uninhabitable Housing,Household,Households\r\nTotally destroyed = 14\r\nPartia...,en,[Locations: Unit:People Term:Destroyed Housing...,People,Destroyed Housing
137,,{'excerpt': '25 march - 281 evacuated 31 march...,Evacuated,Person,25 march - 281 evacuated\n31 march - 344 evacu...,en,[Locations: Unit:People Term:Evacuated Quantit...,People,Evacuated
415,"China As of 23 June, 9 million people have bee...",{'excerpt': '100 more reported displaced compa...,Displaced,Person,100 more reported displaced compared to BNPB d...,en,[Locations: Unit:People Term:Displaced Quantit...,People,Displaced
459,"MAOMING, May 21, 2016 (Xinhua) -- Aerial phot...",{'excerpt': 'References to two cities: - Xiny...,Evacuated,Person,"References to two cities:\n\n- Xinyi, a small,...",en,[Locations:Xinyi Unit:People Term:Displaced Qu...,People,Displaced
548,Progress to Date Recovery and Reconstruction f...,{'excerpt': 'As of 14 July 2016 approximately ...,Evacuated,Person,As of 14 July 2016 approximately 148000 people...,en,[Locations: Unit:People Term:Displaced Quantit...,People,Displaced
554,Highlights  Torrential monsoon rains and floo...,{'excerpt': 'The Government confirmed that sin...,Affected,Household,The Government confirmed that since the heavy ...,en,[Locations:Gelephu Unit:People Term:Relocated ...,People,Relocated
560,Emergency Response Coordination Centre (ERCC) ...,{'excerpt': 'Heavy rain has been affecting sev...,Evacuated,Person,Heavy rain has been affecting several areas of...,en,[Locations:Aceh province Unit:People Term:Evac...,People,Evacuated
577,"On Friday morning, a day after tens of thousan...","{'excerpt': 'On Friday morning, a day after te...",Evacuated,Person,"On Friday morning, a day after tens of thousan...",en,"[Locations:Pakistan,Punjab Unit:People Term:Ev...",People,Evacuated
616,"Each day, ERCC publishes a map of the most imp...",{'excerpt': 'Media reported more than 8800 eva...,Evacuated,Person,Media reported more than 8800 evacuated in wes...,en,[Locations: Unit:People Term:Evacuated Quantit...,People,Evacuated
618,state ALABAMA ALASKA ARIZONA ARKANSAS CALIFORN...,{'excerpt': 'All evacuations have been lifted....,Evacuated,Person,"All evacuations have been lifted. 1,000 evacua...",en,[Locations: Unit:People Term:Evacuated Quantit...,People,Evacuated


In [65]:
df['reports']

0       [Locations:China,Liaoning province Unit:Person...
1       [Locations:Benxi City Unit:Person Term:Evacuat...
2       [Locations:Tieling City Unit:Person Term:Reloc...
3       [Locations:Henan,Hebei Unit:Person Term:Displa...
4                                                      []
5       [Locations: Unit:Household Term:Destroyed Hous...
7       [Locations: Unit:Person Term:Evacuated Quantit...
8                                                      []
9                                                      []
10      [Locations: Unit:People Term:Destroyed Housing...
11                                                     []
12      [Locations: Unit:Person Term:Displaced Quantit...
13                                                     []
14                                                     []
15                                                     []
16                                                     []
17                                                     []
18            

In [17]:
facts = interpreter.process_article_new(s)

In [18]:
facts

[Locations:Maungdaw,Buthidaung Unit:Person Term:Displaced Quantity:(None, '')]

In [16]:
s = "Madeleine Kingston/MSF Rohingya refugees set up makeshift shelters, having arrived in Bangladesh. This massive influx, coming on top of 75,000 people who have arrived since violence began in October 2016, represents one of the largest influxes ever of Rohingyas into Bangladesh. Amsterdam - International humanitarian organisations must immediately be granted independent and unfettered access, including for international staff, to alleviate massive humanitarian needs in Rakhine State, Myanmar MSFs call for urgent access comes amid the ongoing military operations in Rakhine, which started on 25 August after a new spate of attacks against police stations and a military base claimed by the Arakan Rohingya Salvation Army (ARSA). As a consequence, more than 400,000 Rohingya have fled to Bangladesh and are living in extremely precarious conditions with limited access to health care, drinking water, latrines and food. The remaining population in Northern Rakhine, thought to be hundreds of thousands of people, is without any meaningful form of humanitarian assistance. Our teams in Bangladesh are hearing alarming stories of severe violence against civilians in Northern Rakhine, says Karline Kleijer, MSF emergency desk manager. Reports says there is significant internal displacement of Rohingya, ethnic Rakhine populations and other minorities. Villages and houses have been burned down, including at least two out of four of MSFs clinics. MSF was providing healthcare services in Maungdaw and Buthidaung townships in Northern Rakhine before they were put on hold due to a lack of travel authorisation and a ban on international staff in mid-August, Ms Kleijer said. We fear that the people remaining there are unable to access the help they may need. Injured, sick or chronically ill people in Northern Rakhine must be accessed without further delay, while emergency healthcare and other humanitarian assistance should be provided. In Central Rakhine, approximately 120,000 internally displaced people remain in camps where they are entirely dependent on humanitarian assistance for their survival, due to severe movement restrictions. MSF used to provide mobile clinics in several camps and villages for displaced people, but international staff have not been granted travel authorisations to visit the health facilities since August, whilst national staff have been too afraid to go to work following remarks by Myanmar officials accusing NGOs of colluding with ARSA. The government-formulated and disseminated accusations against the UN and international NGOs, denial of required travel and activity authorisations, and threatening statements and actions by hardline groups, are all preventing independent humanitarian workers from providing much-needed assistance. Moreover Northern Rakhine has been declared a military zone by the government of Myanmar, resulting in even more severe administrative and access constraints. The government of Myanmar says it wants to exclusively implement the humanitarian response to those affected in Rakhine, sparking fears that aid might not reach those who most need it. Madeleine Kingston/MSF Rohingya who crossed into Bangladesh, fleeing violence in Rakhine state, Myanmar that started on 25 August. This suggests Myanmar is moving towards a new modus operandi putting the delivery of humanitarian aid under the governments exclusive control, which is likely to result in even more severe administrative and access constraints than ever, says Benoit De Gryse, MSFs operations manager for Myanmar. The only way to ensure aid is provided based on needs and is trusted by all populations, is for it to be provided by independent neutral humanitarian actors. MSF is alarmed by the current lack of access to healthcare for those remaining in Rakhine. When it has full access to its clinics, MSF provides over 11,000 primary and reproductive healthcare consultations per month, as well as emergency transport and assistance for patients requiring hospitalisation. All these services are currently on hold, and other agencies have also reported being unable to carry out their activities in Rakhine due to lack of access. To ensure access to medical care and to be able to provide assistance to conflict-affected people, MSF and other international humanitarian agencies must be allowed immediate and unhindered access to all areas of Rakhine State. Without this, there is a very real risk that patients will die unnecessarily, says De Gryse. *MSFs medical projects in other areas of Myanmar; namely Shan and Kachin states and Yangon and Thanintharyi, continue to operate as usual. MSFs regular projects include HIV, tuberculosis, primary healthcare, sexual and reproductive healthcare, emergency referrals to public hospitals and malaria treatment. **MSF has worked in Myanmar for 25 years, working with the Ministry of Health and Sports to provide care for HIV and TB patients, primary healthcare, and vaccinations. In Rakhine State, Myanmar, MSF usually operates mobile clinics providing primary healthcare consultations in a number of villages and displaced population camps, and organising emergency medical referrals to MoHS hospitals. MSF also usually supports the provision of HIV treatment in MoHS hospitals in Northern Rakhine. In Central Rakhine, MSF also usually treats TB patients in cooperation with the National TB programme. Until mid-August, MSF was providing healthcare services in Pauktaw, Sittwe, Ponnagyun, Maungdaw and Buthidaung townships."

In [6]:
from itertools import groupby
from sqlalchemy.orm import object_session
from idetect.geotagger import get_geo_info

Loaded cities_to_countries dictionary.


In [7]:
def save_facts(analysis, facts, session):
    '''Loop through extracted facts and save them to database
    :params article: instance of Article
    :params facts: list of extracted facts
    :params session: session object corresponding to the article
    :return: None
    '''
    for f in facts:
        # First geolocate locations; split into countries and create one fact per country
        country_locations = []
        for location in f.locations:
            country_locations.extend((process_location(location, session)))

        country_locations.sort(key=lambda x: x.country.iso3)
        for key, group in groupby(country_locations, lambda x: x.country.iso3):

            fact = Fact(unit=f.reporting_unit, term=f.reporting_term,
                    excerpt_start=f.sentence_start, excerpt_end=f.sentence_end,
                    specific_reported_figure=f.quantity[0],
                    vague_reported_figure=f.quantity[1], iso3=key,
                    tag_locations=json.dumps(f.tag_spans))
            session.add(fact)
            session.commit()
            print(fact.id)
            analysis.facts.append(fact)
            fact.locations.extend([location for location in group])
            session.commit()


def process_location(location_name, session):
    '''Get geo info for a given location and add the location to database
    :params fact: instance of Fact
    :params location: location name, a String
    :params session: session object corresponding to location
    :return: None
    '''
    locations = []
    location = session.query(Location).filter_by(
        location_name=location_name).one_or_none()
    if location:
        locations.append(location)
    else:
        loc_info = get_geo_info(location_name)
        if loc_info['flag'] != 'no-results':
            country = session.query(Country).filter_by(
                iso3=loc_info['country_code']).one_or_none()
            location = Location(location_name=loc_info['place_name'], location_type=loc_info['type'],
                                country_iso3=country.iso3,
                                country=country, latlong=loc_info['coordinates'])
            session.add(location)
            session.commit()
            locations.append(location)
            session.commit()
    return locations

In [8]:
nlp = spacy.load("en_default")

In [4]:
session = Session()

In [10]:
class Worker:
    
    def __init__(self, filter_function):
        self.filter_function = filter_function
        
    def work(self):
        return self.filter_function(session.query(Analysis)).with_for_update().order_by(Analysis.updated).first()

In [73]:
w = Worker(scraping_filter)

In [74]:
a = w.work()

In [75]:
a

In [None]:
Status.SCRAPING_FAILED

In [72]:
def scraping_filter(query):
    return query.filter((Analysis.status == Status.NEW) | 
                       ((Analysis.status == Status.SCRAPING_FAILED) & 
                        (Analysis.retrieval_attempts < 3) &
                        (func.now() > Analysis.retrieval_date + timedelta(hours=12))))

In [71]:
a.retrieval_attempts

1

In [9]:
scraping_filter(session.query(Analysis)).with_for_update().order_by(Analysis.updated).first()

<idetect.model.Analysis at 0x7fbbbb863a58>

In [None]:
session.query(Analysis) \
                .with_for_update() \
                .filter(Analysis.status == self.status) \
                .order_by(Analysis.updated) \
                .first()

In [45]:
analysis.retrieval_date.

psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)

In [34]:
func.date_subtract(func.now(), analysis.retrieval_date)

<sqlalchemy.sql.functions.Function at 0x7fbbbb67bf60; date_subtract>

In [56]:
(datetime.datetime.now(psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)) - analysis.retrieval_date).total_seconds()

104071.959805

In [62]:
analysis.retrieval_date + timedelta(hours=12)

datetime.datetime(2017, 9, 7, 6, 48, 44, 308395, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None))

In [29]:
f.between?

In [23]:
doc = session.query(Document).filter(Document.url == 'http://reliefweb.int/report/south-sudan/humanitarian-aid-reaches-desperate-internally-displaced-people-bangasu').all()[0]

In [24]:
analysis = session.query(Analysis).get(doc.id)

In [14]:
session = object_session(analysis)

In [15]:
interpreter = Interpreter(session, nlp)

In [16]:
content = analysis.content.content

In [17]:
facts = interpreter.process_article_new(content)

In [20]:
save_facts(analysis, facts, session)

In [23]:
f = facts[0]

In [24]:
country_locations = []
for location in f.locations:
    country_locations.extend((process_location(location, session)))

In [25]:
country_locations

[]

In [26]:
country_locations.sort(key=lambda x: x.country.iso3)

In [27]:
country_locations

[]

In [28]:
groupby(country_locations, lambda x: x.country.iso3)

<itertools.groupby at 0x7f8562ebc048>

In [21]:
a.facts

[]

In [9]:
a.document_id

3

In [10]:
extract_facts(a)

Loaded Spacy English Language NLP Models.
2


In [5]:
c_m = CategoryModel()

In [21]:
r_m = RelevanceModel()

In [24]:
test_content = """PHILLIP MBUGO Hundreds of people sheltering from violence at Bangasu camp in the Western Equatorian region of South Sudan have received humanitarian relief for the first time since they fled their homes in June. A peacekeeping patrol serving with the United Nations Mission in South Sudan provided safe passage for the delivery of aid by humanitarian agencies to more than 800 households at two camps, who received cooking utensils, buckets and tarpaulins to provide much-needed shelter from the heavy rains. The journey to these camps involves the convoy navigating waterlogged roads damaged by heavy rain and the war that has ravaged South Sudan for almost four years. The convoy passed by villages abandoned by residents who fled after coming under attack earlier this year. No one knows exactly how many people died in the raids but many of the homes have been looted or burnt to the ground. Those sheltering at Bangasu camp say both sides involved in the ongoing conflict are responsible for their predicament. “It is difficult because as the incident happened the government was patrolling these places searching for the rebels and the rebels were also active here moving through the villages,” said Bangasu camp chief, Moses Ruzino. The patrol was also able to reach Rimenze camp further to the north where hundreds of people have sought sanctuary in a makeshift camp next to the Catholic Church. Despite the presence of the church, the families here still suffer from regular threats, harassment, beatings and looting by armed groups. “Of course we are scared because there is no protection, you know, anyone can just come and enter the camp, as has happened several times back, that the rebels can come in the camp beginning to beat people and to loot from people,” said Moses Ruzino. “That is why we are putting our request to the UN come here to protect the civilians here.” The peacekeeping mission has two purposes. Firstly, to provide a protective presence in the area, even if it is temporary, and, secondly, to facilitate the delivery of aid by humanitarian agencies who have been struggling to safely reach the displaced civilians. Life is difficult in these camps without access to clean drinking water, food, or adequate shelter. “It is really very difficult for us to survive in this place because even when we run, all our food items were looted, and all our food items are now spoiled. We are just staying, waiting, to see who can help us by providing us a little food so we can help our children and women,” said Bangasu camp chief, James Atoroba. Bangasu camp resident, Atonita Daniel, is particularly grateful for the assistance. Her husband was killed during violent clashes in the area in 2015. Since then, she has been raising her nine children on her own. The simple gift of a plastic sheet will make a huge difference in protecting them from the heavy rain. “I have experienced hardship for the first time in my life with my nine children whose father was killed on his way to sell charcoal in the town and left me alone. The children and I are telling those who are fighting to stop senseless war and I also urge UN to bring peace to South Sudan at any cost. We are tired,” she said. The people in these camps want to go home to grow their crops, raise their children and live a peaceful and prosperous life. However, it is simply not safe enough to return. They say there are armed groups in the bush who continue to loot the deserted homes and military forces often follow them, harassing and beating them, in an attempt to find out where armed opposition groups are located. “My message to the government of South Sudan and also the international community is that they have to bring peace so that we can go back to our own places so that our children can go to school. We need peace,” said Moses Ruzino in Rimenze. That simple plea echoed by those back down the long road to Bangusa. “We don’t like to stay just the way we are, squeezing ourselves here in the camp. We need protection because we want to go back to our local area,” said James Atoroba. “We don’t like war. We want all the gunshots to stop in our area. We do not want to see any rebels moving with their guns anywhere or looting people’s property. We need peace.” Peace so that the people of Bangusa and Rimenze can live their lives safely, with dignity, and hope for a brighter future."""

In [25]:
c_m.predict(test_content)

  nnz = np.nonzero(abs(vec) > eps)[0]


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [11]:
content = analysis.content.content

In [12]:
c_m.model

Pipeline(memory=None,
     steps=[('tokenizer', <idetect.nlp_models.category.Tokenizer object at 0x7f30db9c6b70>), ('lsi', <idetect.nlp_models.category.LsiTransformer object at 0x7f30db95c978>), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=15, m...n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [13]:
c_m.model.steps[0]

('tokenizer', <idetect.nlp_models.category.Tokenizer at 0x7f30db9c6b70>)

In [14]:
tokenizer = c_m.model.steps[0][1]

In [15]:
pd.Series(content)

0    PHILLIP MBUGO Hundreds of people sheltering fr...
dtype: object

In [16]:
X = tokenizer.transform(pd.Series(content))

In [17]:
X

0    [phillip, mbugo, hundreds, people, sheltering,...
dtype: object

In [18]:
lsi = c_m.model.steps[1][1]

In [21]:
X2 = lsi.transform(X)

  nnz = np.nonzero(abs(vec) > eps)[0]


In [20]:
model = c_m.model.steps[2][1]

In [23]:
analysis.content.content

'PHILLIP MBUGO Hundreds of people sheltering from violence at Bangasu camp in the Western Equatorian region of South Sudan have received humanitarian relief for the first time since they fled their homes in June. A peacekeeping patrol serving with the United Nations Mission in South Sudan provided safe passage for the delivery of aid by humanitarian agencies to more than 800 households at two camps, who received cooking utensils, buckets and tarpaulins to provide much-needed shelter from the heavy rains. The journey to these camps involves the convoy navigating waterlogged roads damaged by heavy rain and the war that has ravaged South Sudan for almost four years. The convoy passed by villages abandoned by residents who fled after coming under attack earlier this year. No one knows exactly how many people died in the raids but many of the homes have been looted or burnt to the ground. Those sheltering at Bangasu camp say both sides involved in the ongoing conflict are responsible for th

In [26]:
X2

array([[  3.98533153e-002,  -6.78877588e-003,   1.23977107e+167,
          4.47658617e-003,  -2.15503716e-003,  -6.31532784e-003,
         -8.88269341e-004,  -1.92956093e-003,  -2.79438626e-003,
         -1.56233996e-003,   2.96268319e-003,  -1.83742972e-002,
          5.21991089e-003,  -4.02705539e-003,   1.36487319e+086,
         -2.70596899e-003,  -5.95390594e-003,  -7.01046361e-003,
          1.31113378e-003,  -4.49902612e-003,  -9.96014576e+096,
         -1.25009136e-002,  -1.20032794e-003,   9.05436818e-003,
          7.23823293e-003,  -6.87350144e-003,  -1.60927478e+172,
          4.19116802e-003,   6.02482894e-003,  -5.07727521e-003,
         -3.82248431e-003,  -1.01225580e-002,   2.66312815e-003,
          8.92615397e-004,   9.88045987e-003,   8.48650050e-004,
          3.68830439e-003,   3.35289453e-003,  -7.61944075e+064,
         -1.75556433e-004,   7.67359716e-004,   3.88216782e-003,
          2.71741342e-003,   4.01165608e-003,   9.60059148e+089,
         -3.11644623e-003

In [22]:
model.predict(X2)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [30]:
c_m.predict(test_content)

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [25]:
r_m.predict(content)

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [24]:
c_m.predict(content)

ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.

In [27]:
type(content)

str

In [28]:
len(content)

4386

In [19]:
a.content.content

'PHILLIP MBUGO Hundreds of people sheltering from violence at Bangasu camp in the Western Equatorian region of South Sudan have received humanitarian relief for the first time since they fled their homes in June. A peacekeeping patrol serving with the United Nations Mission in South Sudan provided safe passage for the delivery of aid by humanitarian agencies to more than 800 households at two camps, who received cooking utensils, buckets and tarpaulins to provide much-needed shelter from the heavy rains. The journey to these camps involves the convoy navigating waterlogged roads damaged by heavy rain and the war that has ravaged South Sudan for almost four years. The convoy passed by villages abandoned by residents who fled after coming under attack earlier this year. No one knows exactly how many people died in the raids but many of the homes have been looted or burnt to the ground. Those sheltering at Bangasu camp say both sides involved in the ongoing conflict are responsible for th

In [2]:
from sqlalchemy import create_engine
from idetect.model import db_url, Base, Document, Session, Status, Analysis, \
    DocumentContent, Location, Country, Fact, KeywordType, FactKeyword    

# connect to the DB specified in the docker.env file
engine = create_engine(db_url())
Session.configure(bind=engine)

In [3]:
from idetect.fact_extractor import extract_facts

Loaded cities_to_countries dictionary.
Loaded Spacy English Language NLP Models.


In [4]:
from idetect.load_data import load_countries, load_terms

In [5]:
# create the DB schema, if it doesn't already exist
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
session=Session()

In [6]:
load_countries(session)

In [7]:
load_terms(session)

In [20]:
session.rollback()

In [8]:
c = 'Situation Overview  Monsoon rains and increased water levels in major rivers have caused seasonal floods in Myanmar, primarily affecting Magway, Sagaing, Bago and Ayeyarwady regions and Mon State. Mandalay Region, Chin, Kayin, Rakhine and Shan states, as well as the Union Territory, are also affected by floods but with smaller-scale or no displacements reported at this stage. As of 25 July, the cumulative number of people who had been temporarily displaced/evacuated since the beginning of July in flood-affected states and regions was reported to be almost 140,000, according to the Government of Myanmar’s Relief and Resettlement Department (RRD) in Nay Pyi Taw. At least three people are reported to have died due to flooding. In some areas flood waters have receded and people have returned to their homes. As of 25 July, out of a total of 269 evacuation sites, 107 had been closed again, mainly in Sagaing, Mon, Rakhine and the Union Territory. In addition to those displaced/evacuated between 2 and 25 July, thousands more have been affected by flooding. People who are displaced are staying in monasteries, in temporary shelters or with host families or relatives.  Most severely affected states/regions (displacement figures as of 25 July)  Magway remains the most severely affected region with over 92,000 people reported to have been temporarily displaced across 12 townships, with Pokkoku, Yesagyo and Yenanchaung townships the worst affected, according to RRD. Flood waters in some of these townships are reported to be receding and people returning to their homes. However, the Magway regional RRD reports that many of those most severely affected live along the banks of the Ayayerwady River and may be displaced multiple times as flooding is likely to reoccur in these areas over the course of the monsoon season. Some families, who live in low lying areas, remain displaced as flooding continues to affect their homes. As of 25 July, a total of 55 evacuation sites in Magway remained open. In Sagaing Region, more than 23,000 people were temporarily displaced. However, all temporary evacuation sites in the region are now reported to have been closed and most of those displaced are reported to have returned to their homes after flood waters receded, according to RRD in Sagaing.  In Bago Region, over 10,000 people from eight townships have been temporarily displaced, while more than 6,700 people in nine townships in the Ayeyarwady Region are being accommodated in evacuation centres or staying with host families. In Mon State, some 5,000 people were displaced by floods over the past week but many have since been able to return to their homes. Floods also temporarily displaced over 1,100 people in Rakhine State and strong winds, heavy rain and swollen rivers have destroyed 75 houses and damaged another 120 homes in Mrauk-U, Minbya, Kyauktaw, Toungup and Ann townships since 24 July. Some of those displaced have since been able to return to their homes. In Mandalay Region, thousands of people affected by rising water levels have reportedly moved to safer locations on their own accord as a preventive measure.  Response  The Government of Myanmar is leading the flood response. RRD, in coordination with local authorities and different government departments, is responding to the immediate needs of displaced families by providing cash to purchase rice rations and distributing drinking water and relief items. Assistance is being provided in areas recently affected by floods, including Rakhine, Ayayerwady and Mon. The Union Minister of the Department of Social Welfare, Relief and Resettlement Department visited Kyaikhto Township in Mon State on 23 July and provided immediate assistance to flood affected/displaced households, as well fiber boats and life jackets to the township authorities. RRD is also providing cash to repair and rebuild houses that were damaged or washed away. Meanwhile, RRD has pre-positioned emergency tarpaulins and water purification tablets in Ayeyarwady, Bago and Magway regions. The Myanmar Military is also providing assistance, including food, drinking water, warm clothes and blankets, mobile health clinics, as well as supporting flood preparedness measures, evacuations and clean up in a number of flood-affected areas. In collaboration with the Township Health Department, local authorities are organizing chlorination of water sources and pond cleaning/renovation in areas where flood waters have subsided.  The Myanmar Red Cross Society (MRCS) is providing evacuation services in affected townships, assisting with water and sanitation in evacuation sites. MRCS is also providing food, drinking water, emergency tarpaulins and shelter tool kits, hygiene kits and mosquito nets, as well as first aid and emergency healthcare services. INGOs have provided water and sanitation support, including chlorination of water sources and distributing water purification tablets in Sagaing Region. UN agencies continue to closely monitor the situation and remain in contact with local and national authorities and stand ready to support the Government’s Response, if required.  Needs/Gaps: In Magway Region, as some people from low-lying areas remain displaced in temporary evacuation sites and some people may be temporarily displaced again during the coming weeks as the monsoon season continues, there may be a need for additional assistance, particularly for food, tarpaulins and water and sanitation support, according to the regional RRD. Local authorities and RRD have provided initial assistance and are closely monitoring the situation. The UN and MRCS are also following the situation and liaising closely with the regional authorities to respond to further humanitarian needs, if required.  Weather Outlook  The Government of Myanmar’s Department of Meteorology and Hydrology (DMH) has issued flood warnings for a few townships in Ayeyarwady and Bago regions as the water levels of rivers have reached above danger levels and advised people who live in low lying areas/close to rivers to take precautionary measures. DMH reported on 27 July that rains or thundershowers will be widespread in all states and regions with increased rains in Tanintharyi Region, as well as in Kayin and Mon states.'

In [9]:
content = DocumentContent(content=c)
session.add(content)
session.commit()

In [10]:
document = Document(name="Name", type="WEB")
session.add(document)
session.commit()

In [11]:
analysis = Analysis(status="Status", document_id=document.id, content_id=content.id)
session.add(analysis)
session.commit()

In [12]:
extract_facts(analysis)

In [19]:
[(f.iso3, f.term, f.unit, f.specific_reported_figure, f.vague_reported_figure) for f in analysis.facts]

[('MMR', 'Destroyed Housing', 'Household', 75, None),
 ('MMR', 'Displaced', 'Person', 5000, None),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Partially Destroyed Housing', 'Household', 75, None),
 ('IND', 'Displaced', 'People', None, 'thousands'),
 ('MMR', 'Displaced', 'People', None, 'thousands'),
 ('MMR', 'Displaced', 'Person', None, ''),
 ('MMR', 'Displaced', 'Person', 23000, None),
 ('MMR', 'Sheltered', 'Person', 6700, None),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Household', None, ''),
 ('MMR', 'Displaced', 'Person', None, '')]

In [8]:
from idetect.interpreter import Interpreter

In [9]:
nlp = spacy.load("en_default")

In [14]:
interpreter2 = Interpreter(session, nlp)

In [15]:
interpreter2.person_unit_lemmas

['family',
 'person',
 'people',
 'individual',
 'local',
 'villager',
 'resident',
 'occupant',
 'citizen',
 'household']

In [10]:
[t.description for t in session.query(ReportKeyword).filter_by(term_type=TermType.PERSON_TERM).all()]

['displaced',
 'evacuated',
 'forced',
 'flee',
 'homeless',
 'relief camp',
 'sheltered',
 'relocated',
 'stranded',
 'stuck',
 'accommodated']

In [4]:
#import datetime
#from idetect.scraper import scrape_article
from idetect.fact_extractor import extract_reports, save_reports, process_location
#from idetect.geotagger import get_geo_info
#import newspaper

Loaded cities_to_countries dictionary.
Loaded Spacy English Language NLP Models.


In [4]:
from idetect.geotagger import get_geo_info

In [5]:
session=Session()

In [6]:
session.execute("DROP TABLE article CASCADE;")

<sqlalchemy.engine.result.ResultProxy at 0x7f541b087550>

In [None]:
# create the DB schema, if it doesn't already exist
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
session=Session()

In [2]:
db_url

NameError: name 'db_url' is not defined

In [5]:
session = Session()

In [6]:
article = Article(url='http://www.jupiter.fl.us/documentcenter/view/3573')
session.add(article)
session.commit()

In [8]:
content = Content(content=c, article_id=article.id)
session.add(content)
session.commit()

In [10]:
load_countries(session)

In [9]:
%lprun extract_reports(article)

ERROR:root:Line magic function `%lprun` not found.


In [10]:
%lprun?

Object `%lprun` not found.


In [17]:
%prun get_geo_info('Sagaing')

 

In [20]:
def strip_accents(s):
    '''Strip out accents from text'''
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [22]:
%prun strip_accents("Sagaing")

 

In [15]:
article.reports[0].locations[0].description

'Sagaing'

In [None]:
extract_reportsn extract_geo

In [12]:
report = session.query(Report).all()[0]

In [32]:
location = reports[0].locations[1]

In [33]:
loc = session.query(Location).filter_by(description=location).one_or_none()

In [34]:
loc

<idetect.model.Location at 0x7fb002c02c18>

In [27]:
loc_info = get_geo_info(location)

In [29]:
if loc_info['flag'] != 'no-results':
    country = session.query(Country).filter_by(code=loc_info['country_code']).one_or_none()
    location = Location(description=loc_info['place_name'], location_type=loc_info['type'], country_code=country.code,
            country=country, latlong=loc_info['coordinates'])
    session.add(location)
    session.commit()
    report.locations.append(location)

In [30]:
report.locations

[<idetect.model.Location at 0x7fb002c02c18>]

In [4]:
from idetect.fact_extractor import extract_reports

Loaded cities_to_countries dictionary.
Loaded Spacy English Language NLP Models.


In [5]:
article = Article(url="some_url")
session.add(article)
session.commit()

In [6]:
article = session.query(Article).first()

In [7]:
text = """Evacuations prompted by wildfires in central B.C. are approaching record levels for the province, officials said Tuesday.

The total number of people displaced by the fires is now estimated at 45,806, according to Robert Turner, deputy minister for Emergency Management B.C.

"This number has changed fairly significantly in the past 48 hours and that has to do primarily with getting better information from local governments," Turner said.

"It's becoming one of the largest displacement events in the history of the province."""

In [8]:
content = Content(content=text)
session.add(content)

In [9]:
article.content.append(content)
session.commit()

In [10]:
article.content

[<idetect.model.Content at 0x7f353487ada0>]

In [11]:
extract_reports(article)

NameError: name 'session' is not defined