In [973]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import re     
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import coo_matrix, hstack

In [974]:
SEED = 42

In [975]:
# nltk.download('stopwords')
# nltk.download('punkt')
#nltk.download('wordnet')

In [976]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

from nltk.corpus import stopwords

from nltk.stem.porter import *
import string

***1*** if the tweet is describing a real disaster, and ***0*** otherwise

In [977]:
train = pd.read_csv('../../data/real_train.csv', encoding='utf-8').set_index('id')
test = pd.read_csv('../../data/real_test.csv',encoding='utf-8').set_index('id')

In [978]:
train.isnull().sum(), test.isnull().sum()

(keyword       61
 location    2533
 text           0
 target         0
 dtype: int64,
 keyword       26
 location    1105
 text           0
 dtype: int64)

In [979]:
y = input_train['target']
train['location'] = train['location'].fillna('No_location')
test['location'] = test['location'].fillna('No_location')
train['keyword'] = train['keyword'].fillna('No_keyword')
test['keyword'] = test['keyword'].fillna('No_keyword')
train = train.drop(['target'], axis=1)

In [980]:
idx = train.shape[0]
df = pd.concat([train, test])

In [981]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_emoji(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def decontracted(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    #also delete @ and #
    text = re.sub(r"[@+#+]", " ", text)
    return text

def remove_punct(text):
    table = str.maketrans('','',string.punctuation)
    return text.translate(table)

cachedStopWords = stopwords.words("english")

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return text

#stemmer = SnowballStemmer('english')
#stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [982]:
#The function takes as input dataset and columns to process
def clean_data(df, *variables):
    for variable in variables:
        df[variable] = df[variable].apply(lambda x : remove_URL(x))
        df[variable] = df[variable].apply(lambda x : remove_emoji(x))
        df[variable] = df[variable].apply(lambda x : decontracted(x))
        df[variable] = df[variable].apply(lambda x : remove_punct(x))
        df[variable] = df[variable].apply(lambda x : remove_stopwords(x))
        #df[variable] = df[variable].apply(lambda x : stemmer.stem(x))
        df[variable] = df[variable].apply(lambda x : lemmatizer.lemmatize(x))

    return df

In [983]:
clean_data(df, 'keyword', 'location', 'text')

Unnamed: 0_level_0,keyword,location,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Nokeyword,Nolocation,Our Deeds Reason earthquake May ALLAH Forgive us
4,Nokeyword,Nolocation,Forest fire near La Ronge Sask Canada
5,Nokeyword,Nolocation,All residents asked ishelter place notified of...
6,Nokeyword,Nolocation,13000 people receive wildfires evacuation orde...
7,Nokeyword,Nolocation,Just got sent photo Ruby Alaska smoke wildfire...
...,...,...,...
10861,Nokeyword,Nolocation,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10865,Nokeyword,Nolocation,Storm RI worse last hurricane My cityamp3other...
10868,Nokeyword,Nolocation,Green Line derailment Chicago
10874,Nokeyword,Nolocation,MEG issues Hazardous Weather Outlook HWO


In [984]:
df['text'] = df['location'] + " " + df['text']
df_keyword = df.drop(['location', 'text'], axis=1)
df = df.drop(['location', 'keyword'], axis=1)
train = df[:idx]
test = df[idx:]

`max_df` is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

>`max_df` = 0.50 means "ignore terms that appear in more than 50% of the documents".
>`max_df` = 25 means "ignore terms that appear in more than 25 documents".

The default `max_df` is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

`min_df` is used for removing terms that appear too infrequently. For example:

>`min_df` = 0.01 means "ignore terms that appear in less than 1% of the documents".
>`min_df` = 5 means "ignore terms that appear in less than 5 documents".

The default `min_df` is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms

In [985]:
MAX_DF = 0.8
MIN_COUNT = 5
NGRAMS = (3, 5)

In [986]:
TOKEN_RE = re.compile(r'[a-z]+|-?\d*[-.,]?\d+|\S')

def tokenize_text_simple_regex(txt, min_token_size=2):
    txt = txt.lower()
    all_tokens = TOKEN_RE.findall(txt)

    return [token for token in all_tokens if len(token) >= min_token_size]


In [987]:
vector = TfidfVectorizer(#tokenizer=tokenize_text_simple_regex,
                            analyzer='char',
                            min_df=MIN_COUNT, max_df=MAX_DF,
                            ngram_range = NGRAMS)
vector.fit(train['text'])

TfidfVectorizer(analyzer='char', max_df=0.8, min_df=5, ngram_range=(3, 5))

In [988]:
#convert dataset into sparse matrix
def vectorize_data(df, vectorizer):
    return vectorizer.transform(df)

In [989]:
train_vect = vectorize_data(train['text'], vector)
test_vect = vectorize_data(test['text'], vector)

In [990]:
train_vect, test_vect

(<7613x45139 sparse matrix of type '<class 'numpy.float64'>'
 	with 1465915 stored elements in Compressed Sparse Row format>,
 <3263x45139 sparse matrix of type '<class 'numpy.float64'>'
 	with 622198 stored elements in Compressed Sparse Row format>)

In [991]:
df_keyword = df_keyword.to_dict(orient='records')
dv_X = DictVectorizer(sparse=True)
df_encoded = dv_X.fit_transform(df_keyword)

In [992]:
df_encoded

<10876x214 sparse matrix of type '<class 'numpy.float64'>'
	with 10876 stored elements in Compressed Sparse Row format>

In [993]:
train_encoded = df_encoded[:idx]
test_encoded = df_encoded[idx:]

In [994]:
X = hstack([train_vect, train_encoded])
X_test = hstack([test_vect, test_encoded])

In [995]:
X, X_test

(<7613x45353 sparse matrix of type '<class 'numpy.float64'>'
 	with 1473528 stored elements in COOrdinate format>,
 <3263x45353 sparse matrix of type '<class 'numpy.float64'>'
 	with 625461 stored elements in COOrdinate format>)

In [996]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, 
                                            train_size=0.7, random_state=SEED)

In [997]:
clf1 = LogisticRegression(random_state=SEED).fit(X_train, y_train)

In [998]:
print('F1 score (train) %.3f' % f1_score(y_train, clf1.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf1.predict(X_holdout)))

F1 score (train) 0.845
F1 score (holdout) 0.731


In [999]:
pred = clf1.predict(X_test)
pd.DataFrame(pred, index=test.index, columns=['target']).to_csv('submisson.csv')

In [1000]:
#KAGGLE 0.79037

In [1001]:
clf2 = RidgeClassifier(random_state=SEED).fit(X_train, y_train)
print('F1 score (train) %.3f' % f1_score(y_train, clf2.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf2.predict(X_holdout)))

F1 score (train) 0.950
F1 score (holdout) 0.741


In [1005]:
params = {'alpha' : [0.001, 0.01, 0.1, 0.5, 1, 2, 3, 5, 7, 10]}
grid = GridSearchCV(RidgeClassifier(random_state=SEED), param_grid=params, 
                    scoring='f1', cv=5).fit(X_train, y_train)
best_clf = grid.best_estimator_
grid.best_params_

{'alpha': 2}

In [1006]:
print('F1 score (train) %.3f' % f1_score(y_train, best_clf.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, best_clf.predict(X_holdout)))

F1 score (train) 0.900
F1 score (holdout) 0.739


**F1 score (train) 0.858 and F1 score (holdout) 0.743**


In [1004]:
pred = best_clf.predict(X_test)
pd.DataFrame(pred, index=test.index, columns=['target']).to_csv('submission_new.csv')