In [3]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk, pydash, math, os, itertools
from pydash import flatten, flatten_deep
from collections import Counter, OrderedDict
from frozendict import frozendict
from humanize import intcomma
from operator import itemgetter
from typing import *
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from itertools import product, combinations
from joblib import Parallel, delayed

In [4]:
# Load the train and test splits, using the first CSV column as the index.
# Paths are relative to the notebook's working directory.
df_train = pd.read_csv('dataset/train.csv', index_col=0)
df_test  = pd.read_csv('dataset/test.csv', index_col=0)
# Trailing expression: display the training frame as the cell output.
df_train

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [5]:
# Distinct values of the `keyword` column, in order of first appearance.
# The printed list below starts with nan and contains URL-encoded spaces ('%20').
keywords = df_train['keyword'].unique().tolist()
print(len(keywords))
print(keywords)

222
[nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident', 'ambulance', 'annihilated', 'annihilation', 'apocalypse', 'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked', 'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze', 'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood', 'bloody', 'blown%20up', 'body%20bag', 'body%20bagging', 'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse', 'buildings%20burning', 'buildings%20on%20fire', 'burned', 'burning', 'burning%20buildings', 'bush%20fires', 'casualties', 'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency', 'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided', 'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge', 'deluged', 'demolish', 'demolished', 'demolition', 'derail', 'derailed', 'derailment', 'desolate', 'desolation', 'destroy', 'destroyed', 'destruction', 'detonate', '

In [6]:
def tokenize_df(
    dfs: Union[pd.DataFrame, List[pd.DataFrame]],
    keys          = ('text', 'keyword', 'location'),
    stemmer       = True,
    preserve_case = True,
    reduce_len    = False,
    strip_handles = True,
    use_stopwords = True,
    **kwargs,
) -> List[List[str]]:
    """
    Tokenize every row of one or more DataFrames.

    For each row, the columns named in `keys` are tokenized with
    nltk.TweetTokenizer and concatenated (in `keys` order) into one token list.

    Args:
        dfs:           a single DataFrame or a list of DataFrames.
        keys:          column names whose contents are tokenized.
        stemmer:       if truthy, run every token through nltk.PorterStemmer.
        preserve_case: forwarded to TweetTokenizer.
        reduce_len:    forwarded to TweetTokenizer.
        strip_handles: forwarded to TweetTokenizer.
        use_stopwords: if truthy, drop English stopwords, the literal token
                       'nan', and tokens shorter than two characters.
        **kwargs:      accepted and ignored, so callers can pass one shared
                       options dict to several functions.

    Returns:
        One list of tokens per input row, in row order across all frames.
    """
    tokenizer = nltk.TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles)
    porter    = nltk.PorterStemmer()
    stopwords = set(nltk.corpus.stopwords.words('english') + [ 'nan' ])
    output    = []
    for df in flatten([ dfs ]):
        for index, row in df.iterrows():
            tokens = []
            for key in keys:
                value = row[key]
                # NaN is truthy, so `value or ""` alone would stringify a missing
                # cell to 'nan' and emit it as a token whenever use_stopwords is off.
                text = "" if pd.isna(value) else str(value or "")
                tokens.extend(tokenizer.tokenize(text))
            if use_stopwords:
                tokens = [ token for token in tokens if token.lower() not in stopwords and len(token) >= 2 ]
            if stemmer:
                tokens = [ porter.stem(token) for token in tokens ]
            output.append(tokens)
    return output

In [7]:
# Sanity check: token lists for the first two training rows, using default options.
tokenize_df(df_train)[:2]

[['deed', 'reason', '#earthquak', 'may', 'allah', 'forgiv', 'us'],
 ['forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada']]

In [8]:
def word_frequencies(df, **kwargs) -> Dict[int, Counter]:
    """
    Count token occurrences separately for each value of the `target` column.

    Rows with target 0 and target 1 are tokenized via tokenize_df (all
    keyword options are forwarded to it) and their tokens are pooled.

    Returns:
        {0: Counter, 1: Counter}, each Counter ordered from most to least
        frequent token.
    """
    freqs = {}
    for target in (0, 1):
        class_rows   = df[df['target'] == target]
        class_tokens = flatten(tokenize_df( class_rows, **kwargs ))
        by_count     = Counter(class_tokens).most_common()
        # Rebuild the Counter from the sorted pairs so iteration order is by frequency.
        freqs[target] = Counter(dict(by_count))
    return freqs

In [9]:
# Per-class token counts over the whole training set (default tokenizer options).
# `freqs` is a notebook-level global used by later cells.
freqs = word_frequencies(df_train)
# For each class: vocabulary size, then the ten most frequent tokens.
print('freqs[0]', len(freqs[0]), freqs[0].most_common(10))
print('freqs[1]', len(freqs[1]), freqs[1].most_common(10))

freqs[0] 12811 [('...', 421), ('new', 320), ('like', 309), ('get', 224), ('bodi', 216), ("i'm", 207), ('scream', 194), ('û_', 171), ('burn', 159), ('obliter', 157)]
freqs[1] 10795 [('...', 637), ('fire', 303), ('bomb', 242), ('new', 207), ('suicid', 204), ('evacu', 185), ('flood', 176), ('û_', 171), ('derail', 170), ('kill', 160)]


In [16]:
def inverse_document_frequency( tokens: List[str] ) -> Counter:
    """
    Weight each distinct token by log(total token count / that token's count).

    The input is flattened completely first, so nested lists of tokens are
    accepted. Note that the ratio is over token occurrences in the flattened
    input, not over the number of documents containing the token.

    Returns:
        Counter mapping token -> weight, ordered from highest weight (rarest
        token) to lowest.
    """
    flat_tokens = flatten_deep(tokens)
    total       = len(flat_tokens)
    weights     = Counter()
    for token, count in Counter(flat_tokens).items():
        weights[token] = math.log( total / count )
    # Re-create the Counter from the sorted pairs so iteration order is by weight.
    return Counter(dict(weights.most_common()))

In [17]:
def inverse_document_frequency_df( dfs ) -> Counter:
    """
    Compute token weights over one DataFrame or a list of DataFrames.

    Each frame is tokenized with tokenize_df using its default options, all
    tokens are pooled, and the pool is passed to inverse_document_frequency.
    """
    frames           = flatten([ dfs ])
    per_frame_tokens = [ tokenize_df(frame) for frame in frames ]
    pooled_tokens    = flatten_deep(per_frame_tokens)
    return inverse_document_frequency(pooled_tokens)

In [18]:
# Token weights over train + test combined; `idf` is a notebook-level global
# that later cells read.
idf = inverse_document_frequency_df([ df_train, df_test ])
# most_common() is ordered highest weight first, so walk it backwards from the
# end to show the 20 lowest-weighted tokens in ascending order.
idf.most_common()[:-21:-1]

[('...', 4.467633783633229),
 ('new', 5.142574999696602),
 ('fire', 5.360577151510393),
 ('like', 5.413220884995814),
 ('û_', 5.568216516288637),
 ('bomb', 5.654690114292464),
 ('get', 5.667677309819275),
 ('burn', 5.792840452773281),
 ('usa', 5.833148176261374),
 ('emerg', 5.8539281447531195),
 ('flood', 5.89136567182525),
 ("i'm", 5.918991738100181),
 ('bodi', 5.935941296413954),
 ('attack', 5.967781902269613),
 ('via', 5.97072741249937),
 ('fatal', 6.000669769114448),
 ('crash', 6.000669769114448),
 ('suicid', 6.015984004087491),
 ('build', 6.025286396749804),
 ('evacu', 6.034676137099644)]

In [19]:
def extract_features(df, freqs, use_idf=True, use_log=True, idf_weights=None, **kwargs) -> np.ndarray:
    """
    Build a two-column feature matrix with one row per row of `df`.

    Column 0 is 1 + the sum, over the row's tokens, of each token's count in
    freqs[0]; column 1 is the same using freqs[1]. Tokens missing from a
    Counter contribute 0.

    Args:
        df:          DataFrame to featurize; tokenized via tokenize_df.
        freqs:       mapping {0: Counter, 1: Counter} of per-class token counts.
        use_idf:     if truthy, multiply each token's count by its weight
                     (tokens without a weight use 1).
        use_log:     if truthy, return the natural log of the sums.
        idf_weights: optional token -> weight mapping. When None, the
                     notebook-level `idf` global is used, as before.
        **kwargs:    forwarded to tokenize_df.

    Returns:
        Array of shape (number of rows, 2).
    """
    weights = None
    if use_idf:
        # Only touch the global when weighting is actually requested.
        weights = idf if idf_weights is None else idf_weights

    features = []
    for row_tokens in tokenize_df(df, **kwargs):
        # Start at 1 so np.log stays finite for rows with no known tokens.
        target0_score = 1
        target1_score = 1
        for token in row_tokens:
            weight = weights.get(token, 1) if use_idf else 1
            target0_score += freqs[0].get(token, 0) * weight
            target1_score += freqs[1].get(token, 0) * weight
        features.append([ target0_score, target1_score ])

    # reshape keeps the (n, 2) shape even when there are no rows.
    features = np.array(features).reshape(-1, 2)
    if use_log:
        features = np.log(features)
    return features

In [20]:
# Labels and feature matrices for the full train / test sets.
# Both matrices use `freqs`, which was counted on the training set only.
Y_train = df_train['target'].to_numpy()
X_train = extract_features(df_train, freqs)
X_test  = extract_features(df_test,  freqs)
# Shape checks: row counts of X_* should match their source frames.
print('df_train', df_train.shape)
print('df_test ', df_test.shape)
print('Y_train ', Y_train.shape)
print('X_train ', X_train.shape)
print('X_test  ', X_test.shape)
# First five test feature rows.
print(X_test[:5])

df_train (7613, 4)
df_test  (3263, 3)
Y_train  (7613,)
X_train  (7613, 2)
X_test   (3263, 2)
[[6.92293033 7.38327619]
 [7.14708523 7.00546676]
 [7.29343584 8.00157928]
 [6.36825736 5.77734926]
 [5.70946644 7.56250014]]


In [21]:
def predict_df(df_train, df_test, **kwargs):
    """
    Fit a linear regression on `df_train` and predict 0/1 labels for `df_test`.

    Token frequencies are counted on `df_train` only; both frames are then
    featurized with extract_features. All keyword options are forwarded to
    word_frequencies and extract_features.

    Returns:
        Integer array with one label (0 or 1) per row of `df_test`.
    """
    freqs   = word_frequencies(df_train, **kwargs)
    Y_train = df_train['target'].to_numpy()
    X_train = extract_features(df_train, freqs, **kwargs)
    # Reuse the training features when predicting on the very same frame object.
    X_test  = X_train if df_train is df_test else extract_features(df_test, freqs, **kwargs)
    model      = LinearRegression().fit(X_train, Y_train)
    prediction = model.predict(X_test)
    # A regression output is unbounded, so plain rounding can yield labels such
    # as -1 or 2; clip to the valid label range before casting.
    prediction = np.clip(np.round(prediction), 0, 1).astype(int)
    return prediction

In [22]:
def get_train_f1_score(splits=3, random_state=None, **kwargs):
    """
    Estimate the weighted F1 score on the notebook-level `df_train` by repeated
    random hold-out.

    `splits` independent train/test splits are drawn, each holding out a
    1/splits fraction of the rows; the returned value is the mean F1 over them.

    Args:
        splits:       number of random splits to average over.
        random_state: optional seed for the splits. With the default None the
                      splits are unseeded, as before; with a seed, repeated
                      calls draw the same sequence of splits.
        **kwargs:     forwarded to predict_df.

    Returns:
        Mean weighted F1 score as a float.
    """
    # One generator shared by all iterations: each split differs, yet the whole
    # sequence is reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)
    f1 = 0.0
    for _ in range(splits):
        train, test = train_test_split(df_train, test_size=1/splits, random_state=rng)
        prediction  = predict_df(train, test, **kwargs)
        Y_true      = test['target'].to_numpy()
        f1         += f1_score(Y_true, prediction, average='weighted') / splits
    return f1

In [23]:
def train_f1_score_hyperparameter_search():
    """
    Grid-search the on/off preprocessing and feature options in parallel.

    Every combination of stemmer, preserve_case, reduce_len, use_stopwords,
    use_idf and use_log (each 1 or 0) is scored with get_train_f1_score.

    Returns:
        Counter mapping a frozendict of the six option values to the F1 score,
        ordered from best score to worst.
    """
    def fn(keys, stemmer, preserve_case, reduce_len, strip_handles, use_stopwords, use_idf, use_log):
        kwargs = {
            "stemmer":        stemmer,
            "preserve_case":  preserve_case,
            "reduce_len":     reduce_len,
            "use_stopwords":  use_stopwords,
            "use_idf":        use_idf,
            "use_log":        use_log,
        }
        # The label holds only the six searched options; keys and strip_handles
        # are fixed for the whole search and would only add noise to it.
        label = frozendict({**kwargs})
        # Forward keys and strip_handles too, so they are not silently ignored.
        f1 = get_train_f1_score(keys=keys, strip_handles=strip_handles, **kwargs)
        return (label, f1)

    jobs = []
    for keys in [('text', 'keyword', 'location')]:
        strip_handles = 1
        for stemmer, preserve_case, reduce_len, use_stopwords, use_idf, use_log in product([1, 0], repeat=6):
            jobs.append(delayed(fn)(keys, stemmer, preserve_case, reduce_len, strip_handles, use_stopwords, use_idf, use_log))
    # n_jobs=-1: let joblib use all available cores.
    results = Counter(dict(Parallel(n_jobs=-1)(jobs)))
    # Rebuild from the sorted pairs so iteration order is best score first.
    results = Counter(dict(results.most_common()))
    return results

In [24]:
# Run the grid search and print one row per configuration: score, then options.
results = train_f1_score_hyperparameter_search()
for label, value in results.items():
    # Show only the last word of each option name (e.g. 'use_idf' -> 'idf').
    print(f'{value:.5f} |', "  ".join(f"{k.split('_')[-1]} = {v}" for k,v in label.items()))
# Score of the default configuration, for comparison with the rows above.
print('train_f1_score = ', get_train_f1_score())
# Fit on the full training set with default options and predict the test set.
df_submission = pd.DataFrame({
    "id":     df_test.index,
    "target": predict_df(df_train, df_test)
})
# Write the id/target pairs without the DataFrame's own index column.
df_submission.to_csv('submission.csv', index=False)

0.76096 | stemmer = 1  case = 1  len = 1  stopwords = 1  idf = 1  log = 1
0.76036 | stemmer = 1  case = 1  len = 0  stopwords = 1  idf = 1  log = 1
0.75823 | stemmer = 1  case = 0  len = 1  stopwords = 1  idf = 1  log = 1
0.75786 | stemmer = 1  case = 1  len = 0  stopwords = 1  idf = 1  log = 0
0.75376 | stemmer = 1  case = 1  len = 1  stopwords = 1  idf = 1  log = 0
0.75364 | stemmer = 0  case = 0  len = 0  stopwords = 1  idf = 0  log = 1
0.75110 | stemmer = 1  case = 0  len = 0  stopwords = 1  idf = 1  log = 1
0.75097 | stemmer = 0  case = 1  len = 0  stopwords = 1  idf = 0  log = 1
0.75070 | stemmer = 1  case = 0  len = 0  stopwords = 1  idf = 0  log = 1
0.74989 | stemmer = 0  case = 0  len = 1  stopwords = 1  idf = 0  log = 1
0.74751 | stemmer = 1  case = 0  len = 1  stopwords = 1  idf = 1  log = 0
0.74703 | stemmer = 1  case = 0  len = 0  stopwords = 1  idf = 1  log = 0
0.74650 | stemmer = 0  case = 0  len = 0  stopwords = 1  idf = 1  log = 1
0.74470 | stemmer = 0  case = 1  len =