In [11]:
import pandas as pd
import spacy
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

from spacy.lang.en import English
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from sklearn.base import TransformerMixin,BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import string
from spacy.matcher import Matcher


### Spacy to preprocess text into lemmatised tokens
### Sklearn pipeline models:
    1) Countvectoriser
    2) Tfidf Vectoriser
    3) Random Forest
    4) Naive Bayes

In [12]:
%pip install spacy && python -m spacy download en

In [13]:
df1= pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

boot = False # resample the data to 10000 samples

In [14]:
# concat all the words into the same columns
# not used here
def concat_words(data):
    keyword_filled = data.keyword.fillna('')
    location_filled = data.location.fillna('')
    data['all_words'] = data.text + ' ' + keyword_filled + ' ' + location_filled
    return data

In [15]:
# make a copy
df_train = df1
df_test = df2

In [16]:
# Extract unique keywords
keywords = df_train.keyword.fillna('')
keywords = keywords.unique().tolist()
result =''
for ele in keywords:
    result += ele
    result += ' '
result = result.replace('%20', ' ')
result

' ablaze accident aftershock airplane accident ambulance annihilated annihilation apocalypse armageddon army arson arsonist attack attacked avalanche battle bioterror bioterrorism blaze blazing bleeding blew up blight blizzard blood bloody blown up body bag body bagging body bags bomb bombed bombing bridge collapse buildings burning buildings on fire burned burning burning buildings bush fires casualties casualty catastrophe catastrophic chemical emergency cliff fall collapse collapsed collide collided collision crash crashed crush crushed curfew cyclone damage danger dead death deaths debris deluge deluged demolish demolished demolition derail derailed derailment desolate desolation destroy destroyed destruction detonate detonation devastated devastation disaster displaced drought drown drowned drowning dust storm earthquake electrocute electrocuted emergency emergency plan emergency services engulfed epicentre evacuate evacuated evacuation explode exploded explosion eyewitness famine

In [17]:
doc = lemma_tokenizer(result)
tokens = [cleanup(word) for word in doc if isNoise(word)==False ]

NameError: name 'lemma_tokenizer' is not defined

In [18]:
train, test = train_test_split(df_train, test_size=0.1, random_state=42)
if boot == True: train = train.sample(n=10000, replace = True)

In [19]:
print(len(df1), len(train), len(test))

7613 6851 762


In [20]:
# creating NLP pipeline
# working progress
lang = "en_core_web_sm"
pipeline = ["tagger"]
cls = spacy.util.get_lang_class(lang)   # 1. Get Language instance, e.g. English()
nlp = cls()                             # 2. Initialize it
for name in pipeline:
    component = nlp.create_pipe(tokenizer)   # 3. Create the pipeline components
    nlp.add_pipe(component)             # 4. Add the component to the pipeline
nlp.from_disk(model_data_path)  

ImportError: [E048] Can't import language en_core_web_sm from spacy.lang: No module named 'spacy.lang.en_core_web_sm'

### function to clean words

In [None]:
# initiate nlp object
nlp = spacy.load('en_core_web_sm')
# define which pos to filter out
noisy_pos_tags = ['PROP', 'NUM', 'SYM']
pattern1 = [{"LEMMA": {"IN": keywords}},
            {"POS": "NOUN"}]
disable_list = ["ner", "parser"]
# functions to filter out noises and clean words
def isNoise(token):     
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True 
        #print('pos')
    elif token.is_stop == True:
        is_noise = True
        #print('stop')
    elif token.is_punct == True:
        is_noise = True
        #print('punct')
    elif token.lemma_ == '...':
        is_noise = True
        #print('...')
    elif token.like_url == True:
        is_noise = True
        #print('url')
    else:
        is_noise = False
    return is_noise 
    

def cleanup(token, lower = True):
    if lower:
        token = token.lemma_.lower()
    return token.strip()


In [None]:
# Simple tokenizer with lemma words (and lowercase, rm stopword, symbols, number)
def lemma_tokenizer(sentence):
    mytokens = nlp(sentence, disable_list)
    # Lemmatizing each token and converting each token into lowercase
    tokens = [cleanup(word) for word in mytokens if isNoise(word)==False ]
    # return preprocessed list of tokens
    return mytokens

In [None]:
# more complicated tokenizer with tagging:  POS + Entity + Special case

def pos_tokenizer (sentence):
    mytokens = nlp(sentence, disable_list)
    lookups = Lookups()
    lemmatizer = Lemmatizer(lookups)
    word =[]
    pos=[]
    for token in sent:
        word.append(token.text)
        pos.append(token.pos_)
    mytokens = [ word for word in mytokens if word not in stop_words and word not in SYMBOLS ]
    return mytokens

### These functions are for the pipeline for ML models

In [None]:
# create dense transformer
class ToDenseTransformer(BaseEstimator,TransformerMixin):
    # define the transform operation
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # no paramter to learn this case
    # fit just returns an unchanged object
    def fit(self, X, y=None, **fit_params):
        return self


In [None]:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Not Disaster Best Words: ")
    for feat in topClass1:
        print(feat)
    print("Disaster Best words: ")
    for feat in topClass2:
        print(feat)

In [None]:
# Prepare Training and Verification Data
X_train = train['all_words'].tolist()
Y_train = train['target'].tolist()
X_test = test['all_words'].tolist()
Y_test = test['target'].tolist()

In [None]:
# Bag of words vectorizer
vectorizer = CountVectorizer(tokenizer=lemma_tokenizer)
clf = LogisticRegression(solver='liblinear')
pipe = Pipeline(
    [("vect", vectorizer),
    ('normal', ToDenseTransformer()),
    ("clf", clf)
    ])

In [None]:
# train
pipe.fit(X_train, Y_train)
# test
preds = pipe.predict(X_test)
accu = preds == Y_test
print("accuracy: for countvectorizer model is:", accu.mean())
printNMostInformative(vectorizer, clf, 10)

In [None]:
# Tfidf vectorizer
pipe_Tfidf = Pipeline(
    [("vect", vectorizer),
    ('tfid', TfidfTransformer()),
    ("clf", clf)
    ])
pipe_Tfidf.fit(X_train, Y_train)
preds = pipe_Tfidf.predict(X_test)
accu = preds == Y_test
print("accuracy: for Tfidf model is:", accu.mean())
# print most informative words with highest coeff for Not Diaster and Diaster
printNMostInformative(vectorizer, clf, 10)

### Other non-vectorised Classifiers
1) Random Forest

2) Naive Bayes

In [None]:
# Random Forest Classifier
RF_clf = RandomForestClassifier(n_estimators=10)
pipe_RF = Pipeline(
    [("vect", vectorizer),
    ("clf", RF_clf)
    ])
pipe_RF.fit(X_train, Y_train)
preds = pipe_RF.predict(X_test)
accu = preds == Y_test
print("accuracy: for Random Forest model is:", accu.mean())

In [None]:
# Naive Bayes Classifier
pipe_bayes = Pipeline(
    [("vect", vectorizer),
    ('bayes', MultinomialNB())
    ])
pipe_bayes.fit(X_train, Y_train)
preds = pipe_bayes.predict(X_test)
accu = preds == Y_test
print("accuracy: for Bayes model is:", accu.mean())


### Spacy CNN

In [None]:
a = list(range(0,100,2))
print(type(*l) for l in a)