In [1]:
# Imports
import pandas as pd
import re
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Load the competition data, keep the test ids for the submission file,
# and remove the id column from both frames.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv").drop(columns=["id"])
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
ids = test_df["id"].values
test_df = test_df.drop(columns=["id"])


In [2]:
# Frequency tables for the keyword and location columns.
kwds, locs = (train_df[col].value_counts() for col in ("keyword", "location"))

# One print call, newline-separated, with a 50-star divider between the tables.
print(kwds, "*" * 50, locs, sep="\n")

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64
**************************************************
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64


In [3]:
# Locations are inconsistent in scope (countries, cities, GPS coords),
# have encoding issues with diacritics, and include some plain weird values.
#
# The patterns are written as raw strings so that regex escapes such as \s, \d
# and \W are not parsed as (invalid) Python string escapes.

# NOTE(review): inside the character class, `&-\*` is a RANGE from '&' to '*',
# so this class also matches "'", "(" and ")". Kept as-is to preserve behavior.
special_chars = re.compile(r"[!@#$%^&-\*\.\-\|_?/\[\]:\-\+]")
# One or more whitespace characters.
spaces = re.compile(r"\s+")
# A single whitespace character at the very start or very end of the string.
lead_trail_space = re.compile(r"(^\s|\s$)")
# Greedy: `.*` runs as far as it can, so a match extends through the LAST digit
# it can reach (e.g. "forest%20fire" -> match is "forest%20").
nums = re.compile(r".*\d")
# Any character that is not a word character.
non_letters = re.compile(r"\W")
# The whole remainder of a value that starts with 'û' or 'å'.
diacritics = re.compile(r"^(û|å).*")
# The literal text "http" (only those four characters, not the rest of a URL).
links = re.compile(r"http")


def process_location(df):
    """Normalise the ``location`` column of ``df`` and blank out one-off values.

    Steps, in order: replace the mis-encoded sequence "Ì©" with "e", lowercase,
    apply the module-level ``nums``, ``special_chars`` and ``spaces`` patterns,
    strip commas, then apply the ``diacritics`` and ``links`` patterns. Finally,
    every cleaned value that occurs exactly once in this frame is set to missing.

    Args:
        df: DataFrame with a string ``location`` column. The column is
            overwritten on this same object.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    loc = df["location"]
    loc = loc.str.replace("Ì©", "e", regex=False)
    loc = loc.str.lower()

    # Compiled patterns must be passed with regex=True: pandas >= 2.0 defaults
    # to regex=False and raises ValueError for a compiled pattern otherwise.
    # NOTE(review): `nums` is greedy, so it deletes everything up to and
    # including the last digit of a value, not just the digits themselves.
    loc = loc.str.replace(nums, "", regex=True)
    loc = loc.str.replace(special_chars, "", regex=True)
    loc = loc.str.replace(spaces, " ", regex=True)
    loc = loc.str.replace(",", "", regex=False)
    loc = loc.str.replace(diacritics, "", regex=True)
    loc = loc.str.replace(links, "", regex=True)

    # Keep only values that appear more than once; singletons become missing.
    # Assigning the result back (instead of a chained in-place replace on the
    # column) guarantees the frame is actually updated.
    df["location"] = loc.where(loc.duplicated(keep=False))

    return df

# Clean the location column of both frames (train first, then test).
train_df, test_df = (process_location(frame) for frame in (train_df, test_df))
train_df["location"].value_counts()

                              164
usa                           108
new york                       75
london                         50
united states                  50
                             ... 
birmingham uk                   2
lisbon portugal                 2
washington dc charlotte nc      2
alexandria va                   2
kashmir                         2
Name: location, Length: 519, dtype: int64

In [4]:
def process_keyword(df):
    """Normalise the ``keyword`` column of ``df`` and blank out one-off values.

    Applies the same cleaning sequence as the location column: replace "Ì©"
    with "e", lowercase, apply the module-level ``nums``, ``special_chars`` and
    ``spaces`` patterns, strip commas, then apply ``diacritics`` and ``links``.
    Every cleaned value that occurs exactly once in this frame is set to missing.

    Args:
        df: DataFrame with a string ``keyword`` column. The column is
            overwritten on this same object.

    Returns:
        The same ``df`` object, for convenient reassignment.
    """
    kw = df["keyword"]
    kw = kw.str.replace("Ì©", "e", regex=False)
    kw = kw.str.lower()

    # Compiled patterns must be passed with regex=True: pandas >= 2.0 defaults
    # to regex=False and raises ValueError for a compiled pattern otherwise.
    # NOTE(review): `nums` is greedy, so a URL-encoded keyword such as
    # "forest%20fire" is reduced to "fire" (everything through the last digit
    # is removed). Confirm this merging of keywords is intended.
    kw = kw.str.replace(nums, "", regex=True)
    kw = kw.str.replace(special_chars, "", regex=True)
    kw = kw.str.replace(spaces, " ", regex=True)
    kw = kw.str.replace(",", "", regex=False)
    kw = kw.str.replace(diacritics, "", regex=True)
    kw = kw.str.replace(links, "", regex=True)

    # Keep only values that appear more than once; singletons become missing.
    # Assigning the result back (instead of a chained in-place replace on the
    # column) guarantees the frame is actually updated.
    df["keyword"] = kw.where(kw.duplicated(keep=False))

    return df


# Clean the keyword column of both frames (train first, then test).
train_df, test_df = [process_keyword(frame) for frame in (train_df, test_df)]
train_df["keyword"].value_counts()

storm         104
disaster      103
fire           90
fires          88
emergency      79
             ... 
zone           24
rescue         22
epicentre      12
threat         11
inundation     10
Name: keyword, Length: 205, dtype: int64

In [5]:
"""Process the text in much the same way as the location. This time we'll keep the 
hashtags as they could prove valuable"""
# Same character class as used for locations but without '#', so hashtags
# survive. Raw string so the regex escapes are not parsed as Python escapes.
# NOTE(review): `&-\*` is a range from '&' to '*', so "'", "(" and ")" are
# stripped as well. This rebinds the module-level name `special_chars`.
special_chars = re.compile(r"[!@$%^&-\*\.\-\|_?/\[\]:\-\+]")

def process_text(df):
    """Clean the ``text`` column of ``df``.

    Removes every character matched by ``special_chars``, lowercases the
    result, then removes matches of the module-level ``links`` pattern.

    Args:
        df: DataFrame with a string ``text`` column. The column is overwritten
            on this same object.

    Returns:
        The same ``df`` object.
    """
    # regex=True is required for compiled patterns on pandas >= 2.0, where the
    # default regex=False raises ValueError for a compiled pattern.
    text = df["text"].str.replace(special_chars, "", regex=True)
    text = text.str.lower()
    # NOTE(review): punctuation is stripped before this step, so a URL has
    # already been collapsed into one token and only its "http" prefix is
    # removed here; the rest of the URL remains in the text.
    df["text"] = text.str.replace(links, "", regex=True)

    return df
    
# Clean the tweet text of both frames (train first, then test).
train_df, test_df = map(process_text, (train_df, test_df))

train_df["text"]

0       our deeds are the reason of this #earthquake m...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       13,000 people receive #wildfires evacuation or...
4       just got sent this photo from ruby #alaska as ...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    ariaahrary thetawniest the out of control wild...
7610    m194 0104 utc5km s of volcano hawaii tcozdtoyd...
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

In [6]:
# One-hot encode the categorical variables
cat_vars = ["location", "keyword"]

def one_hot_encode(colname, df):
    """Return a copy of ``df`` with ``colname`` replaced by one-hot columns.

    The indicator columns are produced by ``pd.get_dummies`` and are named
    ``"<colname>_<value>"``. Missing values in ``colname`` get no indicator
    column of their own.

    Args:
        colname: name of the categorical column to encode.
        df: source DataFrame. It is NOT modified; the previous implementation
            dropped ``colname`` from the caller's frame in place as a side
            effect while also returning a new frame.

    Returns:
        A new DataFrame without ``colname`` and with the indicator columns
        joined on the index.
    """
    one_hot = pd.get_dummies(df[colname], prefix=colname)
    return df.drop(columns=colname).join(one_hot)

# Encode each categorical column in train, then in test.
for var in cat_vars:
    train_df, test_df = one_hot_encode(var, train_df), one_hot_encode(var, test_df)


In [7]:
"""Let's implement term frequency inverse document frequency from scratch because why not
Term frequency - whether the term appears in the current document (a binary 0/1 indicator)
Document frequency - positive docs containing this term / all positive docs (this is the plain document frequency, not its inverse)
"""
import nltk
import json
from collections import Counter
nltk.download('stopwords')
from nltk.corpus import stopwords

# Rebind the name to the English stopword set (shadows the imported corpus reader).
stopwords = set(stopwords.words("english"))

# Each tweet becomes the set of its whitespace-separated tokens minus stopwords.
all_docs = pd.Series(set(tweet.split()) - stopwords for tweet in train_df["text"])
pos_docs = pd.Series(
    set(tweet.split()) - stopwords
    for tweet in train_df.loc[train_df["target"] == 1, "text"]
)
neg_docs = pd.Series(
    set(tweet.split()) - stopwords
    for tweet in train_df.loc[train_df["target"] == 0, "text"]
)

# Documents are sets, so each count is the number of target == 1 tweets that
# contain the token; keep tokens found in at least 6 of them.
pos_terms = {
    token
    for token, n_docs in Counter(tok for doc in pos_docs for tok in doc).items()
    if n_docs >= 6
}


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
class Term:
    """A term together with the fraction of documents that contain it.

    Attributes:
        term: the term exactly as passed in.
        doc_freq: number of documents containing ``term`` divided by the
            total number of documents; 0.0 when ``docs`` is empty.
    """

    def __init__(self, term, docs):
        """Compute the document frequency of ``term`` over ``docs``.

        Args:
            term: value tested against each document with the ``in`` operator.
            docs: collection of documents supporting ``len()`` and iteration
                (e.g. a list or Series of token sets).
        """
        self.term = term
        total_docs = len(docs)
        # Count matches directly instead of materialising a throwaway Series.
        docs_with_term = sum(1 for doc in docs if term in doc)
        # Guard the empty collection, which previously raised ZeroDivisionError.
        self.doc_freq = docs_with_term / total_docs if total_docs else 0.0

In [9]:
# Build one Term (with its document frequency over the positive docs) per kept term.
term_freqs = list(map(lambda positive_term: Term(term=positive_term, docs=pos_docs), pos_terms))

In [10]:
# Add each term as a 0/1 column in the dataframe

def _add_term_columns(df, terms):
    """Return ``df`` with one ``term_<term>`` indicator column per term.

    Each indicator is 1 where ``df.text`` contains the term as a substring and
    0 otherwise. Terms are matched literally (``regex=False``) so a token
    containing regex metacharacters cannot be misread as a pattern or raise.
    All indicators are built first and attached with a single concat, which
    avoids fragmenting the frame through hundreds of single-column inserts.
    """
    indicators = pd.DataFrame(
        {
            f"term_{term}": df.text.str.contains(term, regex=False).astype(int)
            for term in terms
        },
        index=df.index,
    )
    # Drop any previous copy of these columns so re-running overwrites rather
    # than duplicates them, matching the old column-assignment behavior.
    base = df.drop(columns=indicators.columns, errors="ignore")
    return pd.concat([base, indicators], axis=1)


term_names = [t.term for t in term_freqs]
train_df = _add_term_columns(train_df, term_names)
test_df = _add_term_columns(test_df, term_names)


  after removing the cwd from sys.path.
  """


In [11]:
# The raw text is no longer needed once the term indicators exist. Fill any
# remaining missing values with 0.
# `axis=1` is intentionally not passed to fillna: with a scalar fill value it
# does not change which cells are filled, but it routes pandas through a
# transpose round-trip that is slow on a wide frame and can upcast
# mixed-dtype columns to object.
train_df = train_df.drop(columns="text").fillna(0)
test_df = test_df.drop(columns="text").fillna(0)

In [12]:
"""The one-hot encoding process has added some columns unique to each dataset. We'll substitute 0s
for these unseen features in both train and test. This is necessary because without it, the predictor
will encounter feature vectors that it hasn't been trained with. 
"""


train_feats = set(train_df.columns) - {"target"}
test_feats = set(test_df.columns)

# Columns present in only one of the two frames (kept for inspection).
feats_not_in_train = sorted(test_feats - train_feats)
feats_not_in_test = sorted(train_feats - test_feats)

# Give both frames the same alphabetically ordered feature space. reindex adds
# every missing column filled with 0 in a single step and can never change the
# number of rows. The previous approach concatenated a helper frame holding one
# row per missing feature, which could append spurious rows and relied on a
# follow-up fillna to repair the NaNs it introduced.
feature_cols = sorted(train_feats | test_feats)

# Make sure the feature columns are ordered the same in train/test.
# Otherwise, they lose their meaning.
train_df = train_df.reindex(columns=sorted(feature_cols + ["target"]), fill_value=0).fillna(0)
test_df = test_df.reindex(columns=feature_cols, fill_value=0).fillna(0)

In [13]:
"""Create train/test split for eval"""
# Features are every column except the label.
X = train_df.drop(columns="target")
y = train_df["target"]

# Hold out 5% of the rows for evaluation. The seed is fixed so the split, and
# therefore the reported metrics, are reproducible across kernel restarts.
# NOTE(review): the term vocabulary was built from all training rows, including
# the ones held out here, so this score may be optimistic.
train_data, test_data, train_labels, test_labels = train_test_split(
    X, y, test_size=0.05, random_state=42
)

Penalties (regularization norms) supported by each logistic regression solver

‘lbfgs’ - [‘l2’, None]

‘liblinear’ - [‘l1’, ‘l2’]

‘newton-cg’ - [‘l2’, None]

‘newton-cholesky’ - [‘l2’, None]

‘sag’ - [‘l2’, None]

‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, None]



In [14]:
# Fit an L1-penalised logistic regression and score it on the held-out split.
params = dict(
    random_state=42,
    penalty="l1",
    solver="liblinear",
    max_iter=1000,
    l1_ratio=None,
)

log_reg = LogisticRegression(**params)
log_reg.fit(train_data, train_labels)
preds = log_reg.predict(test_data)

# Rows of the 2x2 confusion matrix are the true classes (0, 1); columns are
# the predicted classes (0, 1).
(tn, fp), (fn, tp) = confusion_matrix(test_labels, preds)
acc = accuracy_score(test_labels, preds)
scores = dict(acc=acc, tp=tp, tn=tn, fp=fp, fn=fn)
print(scores)

{'acc': 0.7847769028871391, 'tp': 109, 'tn': 190, 'fp': 26, 'fn': 56}


In [15]:
# Predict on the competition test set and write the id/target pairs to disk.
answer_preds = log_reg.predict(test_df)
d = dict(id=ids, target=answer_preds)
answer_df = pd.DataFrame(data=d)
answer_df.to_csv("submission.csv", index=False)