In [265]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import re     
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV

In [266]:
SEED = 42

In [267]:
# nltk.download('stopwords')
# nltk.download('punkt')

In [268]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords

from nltk.stem.porter import *
import string

***1*** if the tweet is describing a real disaster, and ***0*** otherwise

In [269]:
train = pd.read_csv('../../data/real_train.csv', encoding='utf-8').set_index('id')
test = pd.read_csv('../../data/real_test.csv',encoding='utf-8').set_index('id')

In [270]:
y = train['target']
train = train.drop(['location', 'keyword', 'target'], axis=1)
test = test.drop(['location', 'keyword'], axis=1)

In [271]:
idx = train.shape[0]
df = pd.concat([train, test])

In [272]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_emoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def decontracted(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    #also delete @ and #
    text = re.sub(r"[@+#+]", " ", text)
    return text

def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

cachedStopWords = stopwords.words("english")

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return text

stemmer = SnowballStemmer('english')

In [273]:
#The function takes as input dataset and columns to process
def clean_data(df, *variables):
    for variable in variables:
        df[variable] = df[variable].apply(lambda x : remove_URL(x))
        df[variable] = df[variable].apply(lambda x : remove_emoji(x))
        df[variable] = df[variable].apply(lambda x : decontracted(x))
        df[variable] = df[variable].apply(lambda x : remove_punct(x))
        df[variable] = df[variable].apply(lambda x : remove_stopwords(x))
        df[variable] = df[variable].apply(lambda x : stemmer.stem(x))

    return df

In [274]:
clean_data(df, 'text')

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
1,our deeds reason earthquake may allah forgive us
4,forest fire near la ronge sask canada
5,all residents asked ishelter place notified of...
6,13000 people receive wildfires evacuation orde...
7,just got sent photo ruby alaska smoke wildfire...
...,...
10861,earthquake safety los angeles ûò safety faste...
10865,storm ri worse last hurricane my cityamp3other...
10868,green line derailment chicago
10874,meg issues hazardous weather outlook hwo


In [275]:
train = df[:idx]
test = df[idx:]

`max_df` is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

>`max_df` = 0.50 means "ignore terms that appear in more than 50% of the documents".
>`max_df` = 25 means "ignore terms that appear in more than 25 documents".

The default `max_df` is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

`min_df` is used for removing terms that appear too infrequently. For example:

>`min_df` = 0.01 means "ignore terms that appear in less than 1% of the documents".
>`min_df` = 5 means "ignore terms that appear in less than 5 documents".

The default `min_df` is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms

In [276]:
MAX_DF = 0.9
MIN_COUNT = 5
NGRAMS = (1, 2)

In [277]:
TOKEN_RE = re.compile(r'[a-z]+|-?\d*[-.,]?\d+|\S')

def tokenize_text_simple_regex(txt, min_token_size=1):
    txt = txt.lower()
    all_tokens = TOKEN_RE.findall(txt)

    return [token for token in all_tokens if len(token) >= min_token_size]


In [278]:
vector = TfidfVectorizer(tokenizer=tokenize_text_simple_regex,
                            min_df=MIN_COUNT, max_df=MAX_DF,
                            ngram_range = NGRAMS)
vector.fit(train['text'])

TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2),
                tokenizer=<function tokenize_text_simple_regex at 0x7f3d1134b0e0>)

In [279]:
#convert dataset into sparse matrix
def vectorize_data(df, vectorizer):
    return vectorizer.transform(df)

In [280]:
train_vect = vectorize_data(train['text'], vector)
X_train, X_holdout, y_train, y_holdout = train_test_split(train_vect, y, 
                                            train_size=0.7, random_state=SEED)

In [281]:
clf1 = LogisticRegression(random_state=SEED).fit(X_train, y_train)

In [282]:
print('F1 score (train) %.3f' % f1_score(y_train, clf1.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf1.predict(X_holdout)))

F1 score (train) 0.837
F1 score (holdout) 0.748


In [283]:
def submission(df, vectorizer, clf, name_submisson):
    X_test = vectorize_data(df, vectorizer)
    pred = clf.predict(X_test)
    return pd.DataFrame(pred, index=test.index, columns=['target']).to_csv(name_submisson)

In [284]:
submission(test['text'], vector, clf1, 'submission.csv')
#KAGGLE 0.79037

In [285]:
train = pd.read_csv('../../data/real_train.csv', encoding='utf-8').set_index('id')
test = pd.read_csv('../../data/real_test.csv',encoding='utf-8').set_index('id')

In [286]:
train['location'].isna().sum()/len(train['location']), test['location'].isna().sum()/len(test['location'])

(0.33272034677525286, 0.3386454183266932)

In [287]:
train = train.drop(['keyword', 'target'], axis=1)
test = test.drop(['keyword'], axis=1)

In [288]:
train['location'] = train['location'].fillna(' ')
test['location'] = test['location'].fillna(' ')
idx = train.shape[0]
df = pd.concat([train, test])

In [289]:
clean_data(df, 'location', 'text')

Unnamed: 0_level_0,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,our deeds reason earthquake may allah forgive us
4,,forest fire near la ronge sask canada
5,,all residents asked ishelter place notified of...
6,,13000 people receive wildfires evacuation orde...
7,,just got sent photo ruby alaska smoke wildfire...
...,...,...
10861,,earthquake safety los angeles ûò safety faste...
10865,,storm ri worse last hurricane my cityamp3other...
10868,,green line derailment chicago
10874,,meg issues hazardous weather outlook hwo


In [290]:
df["text"] = df["location"] + " " + df["text"]
df = df.drop(['location'], axis=1)
train = df[:idx]
test = df[idx:]

In [291]:
vector = TfidfVectorizer(tokenizer=tokenize_text_simple_regex,
                            min_df=MIN_COUNT, max_df=MAX_DF,
                            ngram_range = NGRAMS)
vector.fit(train['text'])
train_vect = vectorize_data(train['text'], vector)
X_train, X_holdout, y_train, y_holdout = train_test_split(train_vect, y, 
                                            train_size=0.7, random_state=SEED)

In [292]:
clf2 = LogisticRegression(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf2.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf2.predict(X_holdout)))

F1 score (train) 0.839
F1 score (holdout) 0.747


Looks like additioanl feature with location does not impact on result


F1 score (train) 0.840
F1 score (holdout) 0.744

In [293]:
clf3 = RidgeClassifier(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf3.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf3.predict(X_holdout)))

F1 score (train) 0.898
F1 score (holdout) 0.734


In [294]:
params = {'alpha' : [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 5, 7, 10]}
grid = GridSearchCV(RidgeClassifier(random_state=SEED), param_grid=params, 
                    scoring='f1', cv=5).fit(X_train, y_train)
best_clf = grid.best_estimator_
grid.best_params_

{'alpha': 2}

In [295]:
print('F1 score (train) %.3f' % f1_score(y_train, best_clf.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, best_clf.predict(X_holdout)))

F1 score (train) 0.871
F1 score (holdout) 0.741


In [296]:
submission(test['text'], vector, best_clf, 'submission_2.csv')