# Model Preparation 1

In [2]:
import gensim
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer
import sklearn
import operator
#import matplotlib
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_colwidth', -1)

In [4]:
#import seaborn as sns
#import matplotlib.pyplot as plt
#%matplotlib inline

In [5]:
with open('../data/stop_words_en_long.txt', 'r') as f:
    stop_words = f.read()
stop_words = stop_words.split('\n')

In [6]:
df = pd.read_csv('../data/classification_training.csv')
df.drop(['url', 'Unnamed: 0', 'Unnamed: 0.1', 'langs'], axis=1, inplace=True)

In [7]:
def prepare_stems(text, stop_words):
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    stemmer = nltk.stem.PorterStemmer()
    
    tokens = tokenizer.tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [t for t in tokens if t not in stop_words] 
    stems = [stemmer.stem(t) for t in tokens]
    stems = [s.lower() for s in stems]
    #stems = [s for s in stems if not s.isdigit()]
    
    return stems

In [8]:
def prepare_tokens(text, stop_words):
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [t for t in tokens if t not in stop_words] 
    tokens = [t.lower() for t in tokens]
    tokens = [t for t in tokens if not t.isdigit()]
    return tokens

In [9]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(df['text'], 
                                                                            df['category'], 
                                                                            test_size=0.2)
# X_train_stems, X_test_stems, y_train_stems, y_test_stems = sklearn.model_selection.train_test_split(df['stems'], 
#                                                                                                     df['category'], 
#                                                                                                     test_size=0.2)

## Classification

## Key Word Estimator

The keyword method is wrapped in an `scikit-learn` estimator. Although it does not need to be fitted, this allows us to call `predict`.

In [10]:
class Stemmer(TransformerMixin):
    def __init__(self, stop_words):
        self.stop_words = stop_words
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = PorterStemmer()
        
    def prepare_stems(self, text, stop_words):
        tokens = self.tokenizer.tokenize(text)
        tokens = [t for t in tokens if len(t) > 2]
        tokens = [t for t in tokens if t not in stop_words] 
        stems = [self.stemmer.stem(t) for t in tokens]
        stems = [s.lower() for s in stems]
        stems = [s for s in stems if not s.isdigit()]
        return stems
    
    def fit(self, X, *args):
        return self
    
    def transform(self, X, *args):
        X = X.map(lambda x: self.prepare_stems(x, self.stop_words))
        return X

In [11]:
class Tokenizer(TransformerMixin):
    def __init__(self, stop_words=None):
        self.stop_words = stop_words
        self.tokenizer = WordPunctTokenizer()
        
    def prepare_tokens(self, text, stop_words):
        tokens = self.tokenizer.tokenize(text)
        tokens = [t for t in tokens if len(t) > 2]
        tokens = [t for t in tokens if t not in stop_words]
        tokens = [t.lower() for t in tokens]
        tokens = [t for t in tokens if not t.isdigit()]
        return tokens
    
    def fit(self, X, *args):
        return self
    
    def transform(self, X, *args):
        X = X.map(lambda x: self.prepare_tokens(x, self.stop_words))
        return X

In [12]:
class KeyWordClassifier(BaseEstimator):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.create_keywords()
    
    def create_keywords(self):
        conflict_tokens = ['war', 'conflict', 'military', 'ceasefire', 'terrorism', 'fighting', 'militia', 'rebels', 
                  'violence', 'violent', 'clash', 'insurgent', 'besiege', 'bomb', 'gun', 'combat', 'siege',
                  'battle', 'battleground', 'explode', 'explosive', 'peace', 'truce', 'airstrike', 'armed', 'weapon']
        self.conflict_stems = [self.stemmer.stem(token) for token in conflict_tokens]
        disaster_tokens = ['flood', 'wildfire', 'fire', 'earthquake', 'mudslide', 'landslide', 'washed', 'hurricane',
                          'storm', 'rain', 'rainfall', 'river', 'sea', 'disaster', 'volcano', 'typhoon', 'blaze',
                         'tremor', 'drought', 'disease', 'malnutrition', 'virus', 'health', 'tornado', 'forest', 'snow']
        self.disaster_stems = [self.stemmer.stem(token) for token in disaster_tokens]
        return self
    
    def tag_by_stem(self, texts, conflict_stems, disaster_stems):
        equals = []
        categories = []
        tag_dicts = []
        for text in texts:
            tag_dictionary = {'conflict': 0, 'disaster': 0}
            for stem in conflict_stems:
                tag_dictionary['conflict'] = tag_dictionary['conflict'] + text.count(stem)
            for stem in disaster_stems:
                tag_dictionary['disaster'] = tag_dictionary['disaster'] + text.count(stem)

            if tag_dictionary['conflict'] == 0 and tag_dictionary['disaster'] == 0:
                category = 'other'
                e = True
            elif tag_dictionary['conflict'] == tag_dictionary['disaster']:
                category = 'other'
                e = True
            else:
                category = max(tag_dictionary, key=tag_dictionary.get)
                e = False
            categories.append(category)
            tag_dicts.append(tag_dictionary)
            equals.append(e)
        return categories
    
    def fit(self, *args):
        return self
    
    def transform(self, X, *args):
        y = self.tag_by_stem(X, self.conflict_stems, self.disaster_stems)
        return y
    
    def predict(self, X, y=None):
        y = self.tag_by_stem(X, self.conflict_stems, self.disaster_stems)
        return y

In [13]:
kw_pipeline = Pipeline([
        ('stemmer', Stemmer(stop_words=stop_words)),
        ('kw_clf', KeyWordClassifier())
    ])

In [14]:
pred = kw_pipeline.predict(X_test)

In [15]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

   conflict       0.80      0.68      0.74       149
   disaster       0.91      0.91      0.91       357
      other       0.60      0.68      0.64       144

avg / total       0.81      0.81      0.81       650



## Tf-Idf Transformer

The first step in our machine learning process is tf-idf. This transformer wraps the `gensim` implementation in `sklearn`'s API.

`fit` creates a dictionary, transforms the corpus using a bag of words approach, and then builds the tfidf model based on this. `transform` then applies this model to any collection of documents.

In [16]:
class TfidfTransformer(TransformerMixin):
    def __init__(self, no_below=5, no_above=0.5, tfidf_model=None, dictionary=None):
        self.dictionary = dictionary
        self.tfidf_model = tfidf_model
        self.no_below = no_below
        self.no_above = no_above
    
    def make_dictionary(self, texts):
        if not self.dictionary:
            self.dictionary = gensim.corpora.Dictionary(texts)
            if self.no_below or self.no_above:
                self.dictionary.filter_extremes(no_below=self.no_below, no_above=self.no_above)
        return self
    
    def tfidf_transform(texts, dictionary=None, tfidf_model=None):
        if not dictionary:
            dictionary = gensim.corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        if not tfidf_model:
            tfidf_model = gensim.models.TfidfModel(corpus)
        corpus_tfidf = tfidf_model[corpus]
        return corpus_tfidf, dictionary, tfidf_model
    
    def make_corpus(self, texts):
        corpus = [self.dictionary.doc2bow(text) for text in texts]
        return corpus
    
    def make_tfidf_model(self, corpus):
        if not self.tfidf_model:
            self.tfidf_model = gensim.models.TfidfModel(corpus)
        return self
    
    def fit(self, texts, y=None):
        self.make_dictionary(texts)
        self.corpus = self.make_corpus(texts)
        self.make_tfidf_model(self.corpus)
        
    def transform(self, texts):
        corpus = self.make_corpus(texts)
        return self.tfidf_model[corpus]

## LSI Transformer

In [17]:
class LsiTransformer(TransformerMixin):
    def __init__(self, n_dimensions=100, no_below=5, no_above=0.5, lsi_model=None):
        self.lsi_model = lsi_model
        self.n_dimensions = n_dimensions
        self.no_below = no_below
        self.no_above = no_above
    
    def build_tfidf(self, texts):
        self.tfidf_transformer = TfidfTransformer(no_below=self.no_below, no_above=self.no_above)
        self.tfidf_transformer.fit(texts)
        corpus_tfidf = self.tfidf_transformer.transform(texts)
        dictionary = self.tfidf_transformer.dictionary
        return corpus_tfidf, dictionary
    
    def lsi_to_vecs(self, corpus_lsi):
        lsi_vecs = []
        for c in corpus_lsi:
            vec = [x[1] for x in c]
            lsi_vecs.append(vec)
        return np.array(lsi_vecs)
    
    def make_lsi_model(self, texts):    
        self.corpus_tfidf, self.dictionary = self.build_tfidf(texts)
        if not self.lsi_model:
            self.lsi_model = gensim.models.LsiModel(self.corpus_tfidf, 
                                                    id2word=self.dictionary, 
                                                    num_topics=self.n_dimensions)
        return self
    
    def make_corpus(self, corpus_tfidf):
        lsi_corpus = self.lsi_model[corpus_tfidf]
        return lsi_corpus
    
    def fit(self, texts, *args, **kwargs):
        self.make_lsi_model(texts)
        self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
        return self
    
    def transform(self, texts):
        corpus_tfidf = self.tfidf_transformer.transform(texts)
        corpus_lsi = self.make_corpus(corpus_tfidf)
        #return sparse2full(corpus_lsi, self.n_dimensions)
        return self.lsi_to_vecs(corpus_lsi)
        

## Tf-Idf + LSI Pipeline

In [18]:
lsi_pipe = Pipeline([
        ('tokenizer', Tokenizer(stop_words=stop_words)),
        ('lsi', LsiTransformer(no_below=2, no_above=0.1)),
        ('model', RandomForestClassifier(max_features=15, n_estimators=1000, n_jobs=3))
    ])

In [19]:
lsi_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tokenizer', <__main__.Tokenizer object at 0x7f4d778cfd68>), ('lsi', <__main__.LsiTransformer object at 0x7f4d778cfdd8>), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=15, max_leaf_nodes=None,
            min_im...n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [20]:
# try:
#     lsi_pipe.predict(pd.Series(['']))
# except ValueError as e:
#     raise

In [21]:
print(classification_report(lsi_pipe.predict(X_test), y_test))

             precision    recall  f1-score   support

   conflict       0.79      0.80      0.79       148
   disaster       0.95      0.88      0.91       386
      other       0.60      0.74      0.66       116

avg / total       0.85      0.84      0.84       650



In [22]:
#df['text'].str.len().nlargest()

In [23]:
#df.drop([183, 151, 86, 260, 265], inplace=True)

In [None]:
lsi_pipe.fit(df['text'], df['category'])

In [22]:
joblib.load('../idetect/source/python/relevance.pkl')

KeyError: 60

In [16]:
joblib.dump(lsi_pipe, 'lsi_pipe.pkl')

['lsi_pipe.pkl']

In [None]:
a = pd.Series('evacuated from the flood by the military')
lsi_pipe.predict(a)[0]

## Combining Approaches with Custom Rules Transformer

In [55]:
class Combiner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.kw = KeyWordClassifier()
        self.lsi = lsi_pipe

    def combine_predictions(self, classified, keyword_tagged):
        predictions = []
        for classifier, keyword in zip(classified, keyword_tagged):
            if classifier == keyword:
                predictions.append(keyword)
            elif keyword == 'unknown':
                predictions.append(classifier)
            else:
                predictions.append(classifier)
        return predictions
    
    def fit(self, X, y=None):
        self.lsi.fit(X, y)
        return self
        
    def transform(self, X, *args):
        self.classified = self.lsi.predict(X)
        self.keyword_tagged = self.kw.transform(X)
        return pd.DataFrame({'classified': self.classified, 'key_word': self.keyword_tagged})

In [56]:
from collections import defaultdict

In [57]:
class MultiLabelEncoder(TransformerMixin):
    def fit(self, X, y=None):
        self.le = LabelEncoder()
        self.le.fit(X.iloc[:,0])
        return self
    
    def transform(self, X, y=None):
        columns = X.columns
        for column in columns:
            X[column] = self.le.transform(X[column])
        return X

In [58]:
m = MultiLabelEncoder()
m.fit(predictions)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [120]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

In [59]:
c = Combiner()

In [60]:
c.fit(X_train, y_train)

Combiner()

In [64]:
predictions = c.transform(X_test)

In [69]:
print(classification_report(predictions['key_word'], y_test))

             precision    recall  f1-score   support

   conflict       0.34      0.76      0.47        75
   disaster       0.64      0.86      0.73       256
      other       0.73      0.32      0.44       319

avg / total       0.65      0.58      0.56       650



In [65]:
predictions.head()

Unnamed: 0,classified,key_word
0,conflict,conflict
1,conflict,other
2,other,other
3,disaster,other
4,other,other


In [68]:
print(sklearn.metrics.classification_report(y_test, predictions))

             precision    recall  f1-score   support

   conflict       0.87      0.83      0.85       181
   disaster       0.89      0.95      0.92       337
      other       0.70      0.61      0.65       132

avg / total       0.85      0.85      0.85       650



In [69]:
print(sklearn.metrics.classification_report(y_test, c.classified))

             precision    recall  f1-score   support

   conflict       0.87      0.83      0.85       181
   disaster       0.89      0.95      0.92       337
      other       0.70      0.61      0.65       132

avg / total       0.85      0.85      0.85       650



In [70]:
print(sklearn.metrics.classification_report(y_test, c.keyword_tagged))

             precision    recall  f1-score   support

   conflict       0.86      0.72      0.79       181
   disaster       0.89      0.89      0.89       337
      other       0.57      0.65      0.61       132
    unknown       0.00      0.00      0.00         0

avg / total       0.82      0.80      0.80       650



  'recall', 'true', average, warn_for)


In [87]:
class T1(TransformerMixin):
    
    def fit(self, Xy=None):
        return self
    
    def transform(self, X, y=None):
        X1 = X
        X2 = X*2
        return X1, X2
    
class T2(TransformerMixin):
    
    def fit(self, X1, X2, y=None):
        return self
    
    def transform(self, X1, X2, y=None):
        y = X1 + X2
        return y
    
    def fit_transform(self, X1, X2, y=None):
        return self.fit(X1, X2).transform(X1, X2)
    
    

In [88]:
x = np.array([1,2,3,4,5])

In [89]:
t1 = T1()
x1, x2 = t1.fit_transform(x)

In [90]:
t2 = T2()
t2.fit_transform(x1, x2)

array([ 3,  6,  9, 12, 15])

In [91]:
pipe = Pipeline([
        ('t1', T1()),
        ('t2', T2())
    ])

In [92]:
pipe.fit(x)

Pipeline(steps=[('t1', <__main__.T1 object at 0x11c747dd8>), ('t2', <__main__.T2 object at 0x11c747da0>)])

## Combining Approaches with a Voting Classifier

In [135]:
from sklearn.ensemble import VotingClassifier

In [144]:
vtg = VotingClassifier(estimators=[('kw', kw_clf), ('lsi', lsi_pipe)])

In [175]:
vtg.fit(X_train, y_train)

VotingClassifier(estimators=[('kw', KeyWordClassifier()), ('lsi', Pipeline(steps=[('tfidf', <__main__.LsiTransformer object at 0x12a28c208>), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_imp...ors=1000, n_jobs=3, oob_score=False, random_state=123,
            verbose=0, warm_start=False))]))],
         n_jobs=1, voting='hard', weights=None)

In [178]:
predictions = vtg.transform(X_test)

In [33]:
def tfidf_transform(texts, dictionary=None, tfidf_model=None):
    if not dictionary:
        dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    if not tfidf_model:
        tfidf_model = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    return corpus_tfidf, dictionary, tfidf_model

def lsi_transform(corpus_tfidf, dictionary, lsi_model=None, dimensions=2):
    if not lsi_model:
        lsi_model = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=dimensions)
    corpus_lsi = lsi_model[corpus_tfidf]
    lsi_vecs = lsi_to_vecs(corpus_lsi)
    return lsi_vecs, corpus_lsi, lsi_model

# Relevance

In [7]:
df_relevance = pd.read_csv('../../../data/relevance_training.csv')
df_relevance.drop(['Unnamed: 0'], axis=1, inplace=True)

## Keyword Approach

In [26]:
class RelevanceKeyWordClassifier(BaseEstimator):
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.create_keywords()
    
    def create_keywords(self):
        displacement_tokens = ['evacuated', 'evacuee', 'displaced', 'displacement', 'fled', 'stranded', 'homeless', 
                  'flee', 'rescued', 'trapped', 'shelter', 'camp', 'escape', 'forced', 'migrant', 'run', 'ran']
        self.displacement_stems = [self.stemmer.stem(token) for token in displacement_tokens]
        return self
    
    def tag_by_stem(self, texts, displacement_stems):
        is_displacement = []
        for text in texts:
            mentions = 0
            for stem in self.displacement_stems:
                mentions += text.count(stem)
            if mentions > 0:
                is_displacement.append('yes')
            else:
                is_displacement.append('no')
        return is_displacement
    
    def fit(self, *args):
        return self
    
    def transform(self, X, *args):
        y = self.tag_by_stem(X, self.displacement_stems)
        return y
    
    def predict(self, X, y=None):
        y = self.tag_by_stem(X, self.displacement_stems)

In [27]:
rel_kw_pipeline = Pipeline([
        ('stemmer', Stemmer(stop_words=stop_words)),
        ('tagger', RelevanceKeyWordClassifier())
    ])

In [28]:
predictions = rel_kw_pipeline.fit_transform(df_relevance['text'])

In [29]:
print(classification_report(df_relevance['is_displacement'], predictions))

             precision    recall  f1-score   support

         no       0.92      0.75      0.82      1927
        yes       0.71      0.90      0.79      1313

avg / total       0.83      0.81      0.81      3240



### Machine Learning Approach

In [30]:
rel_ml_pipeline = Pipeline([
        ('tokenizer', Tokenizer(stop_words=stop_words)),
        ('lsi', LsiTransformer(no_below=2, no_above=0.1)),
        ('model', RandomForestClassifier(max_features=12, n_estimators=1000, n_jobs=3))
    ])

In [31]:
train_rel, test_rel = sklearn.model_selection.train_test_split(df_relevance, test_size=0.2)

In [32]:
train_labels_rel = train_rel['is_displacement']
test_labels_rel = test_rel['is_displacement']

In [34]:
rel_ml_pipeline.fit(train_rel['text'], train_labels_rel)

Pipeline(steps=[('tokenizer', <__main__.Tokenizer object at 0x119c71940>), ('lsi', <__main__.LsiTransformer object at 0x119c71d68>), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=12, max_leaf_nodes=None,
            min_impurity...ators=1000, n_jobs=3, oob_score=False,
            random_state=None, verbose=0, warm_start=False))])

In [37]:
predictions = rel_ml_pipeline.predict(test_rel['text'])

In [38]:
print(classification_report(test_labels_rel, predictions))

             precision    recall  f1-score   support

         no       0.73      0.90      0.80       388
        yes       0.77      0.49      0.60       260

avg / total       0.74      0.74      0.72       648



In [363]:
param_grid = [{'max_features': [7, 8, 9, 10, 11, 12],
               'n_jobs': [3],
              }]

rf = RandomForestClassifier(n_estimators=1000, n_jobs=3)
clf = model_selection.GridSearchCV(rf, param_grid=param_grid, cv=5)

#### Keyword and Random Forest Combined

In [43]:
class Combiner(BaseEstimator, TransformerMixin):
    def __init__(self, ml_model, kw_model):
        self.ml_model = ml_model
        self.kw_model = kw_model
        
    def combine_relevance_tags(self, classified, keyword_tagged):
        combined = []
        for classifier, keyword in zip(classified, keyword_tagged):
            if keyword == 'no' and classifier == 'no':
                tag = 'no'
            elif keyword == 'yes' and classifier == 'yes':
                tag = 'yes'
            elif keyword == 'no' and classifier == 'yes':
                tag = 'yes'
            elif keyword == 'yes' and classifier == 'no':
                tag = 'yes'
            combined.append(tag)
        return combined
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, *args):
        ml_tagged = self.ml_model.predict(X)
        kw_tagged = self.kw_model.transform(X)
        combined = self.combine_relevance_tags(ml_tagged, kw_tagged)
        return combined

In [44]:
c = Combiner(rel_ml_pipeline, rel_kw_pipeline)

In [53]:
joblib.dump(c, 'relevance_classifier.pkl')

['relevance_classifier.pkl']

In [49]:
combined_predictions = c.transform(test_rel['text'])

In [50]:
print(classification_report(combined_predictions, test_labels_rel))

             precision    recall  f1-score   support

         no       0.68      0.95      0.80       278
        yes       0.95      0.67      0.78       370

avg / total       0.84      0.79      0.79       648



## Predictions on IDMC Test Set

In [460]:
df_idmc_test = pd.read_csv('../../data/test/all_scraped_test_urls.csv')

## Experiments

In [36]:
class CategoryModel(object):
    def __init__(self, model_path=None):
        self.model = self.load_model(model_path=model_path)

    def load_model(self, model_path=None):
        if model_path and os.path.isfile(model_path):
            clf = joblib.load(model_path)
        else:
            default_model_path = 'category.pkl'
            if os.path.isfile(default_model_path):
                clf = joblib.load(default_model_path)
            else:
                url = 'https://s3-us-west-2.amazonaws.com/idmc-idetect/category_models/category.pkl'
                r = requests.get(url, stream=True)
                if not os.path.isfile(default_model_path):
                    try:
                        os.makedirs(os.path.dirname(default_model_path))
                    except OSError as exc: # Guard against race condition
                        if exc.errno != errno.EEXIST:
                            raise
                with open(default_model_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
                clf = joblib.load(default_model_path)
        return clf

    def predict(self, text):
        try:
            category = self.model.predict(pd.Series(text))[0]
        except:
            # if error occurs, classify as most likely category
            category = 'disaster'

        if category == 'disaster':
            return 'disaster'
            #return Category.DISASTER
        elif category == 'conflict':
            return 'conflict'
            #return Category.CONFLICT
        else:
            return 'other'
            #return Category.OTHER