In [6]:
from datetime import datetime,timedelta
from nltk.corpus import stopwords
import pandas as pd
import spacy

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from collections import defaultdict
import itertools

In [13]:
list_stopWords=list(set(stopwords.words('english')))

In [14]:
nlp = spacy.load('en_core_web_md')

In [15]:
df_demo = pd.read_csv('demo.csv',index_col=0)

In [16]:
df_demo.head()

Unnamed: 0,class,date,description,title
0,Disasters and accidents,2017/9/1,"Hurricane Irma, now a Category 2 hurricane wit...",2017 Atlantic hurricane season
1,Health and medicine,2017/9/1,"Researchers report, in the Environmental Scien...",Great Lakes
2,International relations,2017/9/1,South Korean President Moon Jae-in and U.S. Pr...,South Korea鈥揢nited States relations
3,Law and crime,2017/9/1,United States federal judge Richard Posner has...,United States Court of Appeals for the Seventh...
4,Politics and elections,2017/9/1,"The Labour Party, led by Jacinda Ardern, surge...","New Zealand general election, 2017"


In [17]:
def class_code(type_str):
    type_str = type_str.lower()
    if 'armed' in type_str or 'attack' in type_str or 'conflict' in type_str:
        return 1
    elif 'disaster' in type_str or 'accident' in type_str:
        return 2
    elif 'law' in type_str or 'crime' in type_str:
        return 3
    elif 'politic' in type_str or 'election' in type_str:
        return 4
    elif 'international' in type_str or 'relation' in type_str:
        return 5
    elif 'science' in type_str or 'technology' in type_str:
        return 6
    elif 'business' in type_str or 'econom' in type_str:
        return 7
    elif 'art' in type_str or 'culture' in type_str:
        return 8
    elif 'sport' in type_str:
        return 9
    elif 'health' in type_str or 'environment' in type_str:
        return 10
    else:
        return 0

In [18]:
df_demo['class_code'] = df_demo['class'].apply(class_code)

In [19]:
df_demo.head()

Unnamed: 0,class,date,description,title,class_code
0,Disasters and accidents,2017/9/1,"Hurricane Irma, now a Category 2 hurricane wit...",2017 Atlantic hurricane season,2
1,Health and medicine,2017/9/1,"Researchers report, in the Environmental Scien...",Great Lakes,10
2,International relations,2017/9/1,South Korean President Moon Jae-in and U.S. Pr...,South Korea鈥揢nited States relations,5
3,Law and crime,2017/9/1,United States federal judge Richard Posner has...,United States Court of Appeals for the Seventh...,3
4,Politics and elections,2017/9/1,"The Labour Party, led by Jacinda Ardern, surge...","New Zealand general election, 2017",4


In [20]:
def description_clean(description):
    description = description.split('. (')[0]+'.'
    return description

In [21]:
df_demo['des_clean'] = df_demo['description'].apply(description_clean)

In [22]:
def efitf(X):
    count = CountVectorizer(stop_words='english')
    X_train_count = count.fit_transform(X)
    tfidf = TfidfTransformer(use_idf=True,smooth_idf=True,sublinear_tf=True)
    X_train_tfidf = tfidf.fit_transform(X_train_count)
    tf_feature_names = count.get_feature_names()
    X_train_tfidf = [list(i) for i in list(X_train_tfidf.toarray())]
    EFITF = defaultdict(dict)
    for Type,values in enumerate(X_train_tfidf):
        for index,value in enumerate(values):
            if value > 0.0:
                EFITF[Type].update({tf_feature_names[index]:value}) 
    return EFITF

In [23]:
X = []

In [24]:
X = df_demo['des_clean'].tolist()

In [25]:
EFITF = efitf(X)

In [26]:
def class_similarity(class_text,span):
    return nlp(class_text).similarity(nlp(span))

In [33]:
def get_query(doc,class_text,doc_index,doc_date):
    date_1 = (datetime.strptime(doc_date,'%Y/%m/%d')+timedelta(days=1)).strftime('%Y-%m-%d')
    date_0 = datetime.strptime(doc_date,'%Y/%m/%d').strftime('%Y-%m-%d')
    doc = nlp(doc)
    kws = []
    for i in doc.ents:
        kws.append(i.text)
    triggers = []
    for token in doc:
        if not token.is_stop and token.tag_.startswith('V'):
            if token.text.lower() in EFITF[doc_index].keys():
                triggers.append((token.text,token.tag_,str(class_similarity(class_text,token.text)+EFITF[doc_index][token.text.lower()])))
    triggers = sorted(triggers,key=lambda x:x[2],reverse=True)[:3]
    for i in triggers:
        kws.append(i[0])
    noun_chunks = []
    for i in doc.noun_chunks:
        noun_chunks.append((i.text,str(class_similarity(class_text,i.text))))
    kws.append(sorted(noun_chunks,key=lambda x:x[1],reverse=True)[0][0].split(' ')[-1])
    kws = [w for w in kws if not w in list_stopWords]
    kws = list(set(kws))
    query = [i for i in itertools.combinations(kws,2)]
    query = ['"'+i[0]+'"'+' '+'"'+i[1]+'"'+' '+'until:'+date_0 for i in query]+['"'+i[0]+'"'+' '+'"'+i[1]+'"'+' '+'until:'+date_1 for i in query]
    return query

In [36]:
for event in df_demo.iterrows():
    doc_index = event[0]
    doc_date = event[1]['date']
    doc_class = event[1]['class']
    doc_title = event[1]['title']
    doc = event[1]['des_clean']
    class_text = doc_class.replace('and','')
    query = get_query(doc,class_text,doc_index,doc_date)
    print(len(query),'\n',query)

132 
 ['"this weekend" "strengthening" until:2017-09-01', '"this weekend" "the Leeward Islands" until:2017-09-01', '"this weekend" "175 kilometers per hour" until:2017-09-01', '"this weekend" "expected" until:2017-09-01', '"this weekend" "danger" until:2017-09-01', '"this weekend" "Hurricane Irma" until:2017-09-01', '"this weekend" "Category 2" until:2017-09-01', '"this weekend" "Caribbean" until:2017-09-01', '"this weekend" "next Thursday" until:2017-09-01', '"this weekend" "increasing" until:2017-09-01', '"this weekend" "110 miles per hour" until:2017-09-01', '"strengthening" "the Leeward Islands" until:2017-09-01', '"strengthening" "175 kilometers per hour" until:2017-09-01', '"strengthening" "expected" until:2017-09-01', '"strengthening" "danger" until:2017-09-01', '"strengthening" "Hurricane Irma" until:2017-09-01', '"strengthening" "Category 2" until:2017-09-01', '"strengthening" "Caribbean" until:2017-09-01', '"strengthening" "next Thursday" until:2017-09-01', '"strengthening" "