In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt
import re                  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import f1_score
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/vlad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string

***1*** if the tweet is describing a real disaster, and ***0*** otherwise

In [4]:
train = pd.read_csv('../../data/real_train.csv').set_index('id')
test = pd.read_csv('../../data/real_test.csv').set_index('id')

In [5]:
y = train['target']
train = train.drop(['target'], axis=1)

In [6]:
df = pd.concat([train, test])
df = df.drop(['location', 'keyword'], axis=1)
split = train.shape[0]

In [7]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)


df['text']=df['text'].apply(lambda x : remove_URL(x))

In [8]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

df['text']=df['text'].apply(lambda x : remove_html(x))

In [9]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['text']=df['text'].apply(lambda x: remove_emoji(x))

In [10]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

df['text']=df['text'].apply(lambda x : remove_punct(x))

In [11]:
stemmer = SnowballStemmer("english")
df['text']=df['text'].apply(lambda x : stemmer.stem(x))

`max_df` is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

>`max_df` = 0.50 means "ignore terms that appear in more than 50% of the documents".
>`max_df` = 25 means "ignore terms that appear in more than 25 documents".

The default `max_df` is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

`min_df` is used for removing terms that appear too infrequently. For example:

>`min_df` = 0.01 means "ignore terms that appear in less than 1% of the documents".
>`min_df` = 5 means "ignore terms that appear in less than 5 documents".

The default `min_df` is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms

In [12]:
stop_words = set(stopwords.words("english"))


In [13]:
vectorizer = TfidfVectorizer(analyzer='char',
                            stop_words='english',
                            ngram_range=(1, 6),
                            min_df=10, max_df=0.9)

In [14]:
df_title = vectorizer.fit_transform(df['text'])

In [15]:
X, X_test = df_title[:split], df_title[split:]
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, train_size=0.7)

In [16]:
clf = RidgeClassifier(alpha=1, random_state=666).fit(X_train, y_train)

In [17]:
print('F1 score (train) %.3f' % f1_score(y_train, clf.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, clf.predict(X_holdout)))

F1 score (train) 0.943
F1 score (holdout) 0.736


- Cleaned data
- F1 score (train) 0.917
- F1 score (holdout) 0.751
- TfidfVectorizer(analyzer='char',stop_words='english', ngram_range=(1, 4), min_df=3, max_df=0.3)
- Ridge, alpha=1.5
- Results:
  1. F1 score (train) 0.917
  2. F1 score (holdout) 0.751
  3. KAggle 0.79957

In [18]:
# n_range = [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
# min_df = [5, 10, 20]
# max_df = [0.1, 0.2, 0.3, 0.4]
# f1_results = {}
# for i in tqdm(n_range):
#     for j in min_df:
#         for k in max_df:
#             test_df = df
#             vectorizer = TfidfVectorizer(analyzer='char',
#                             stop_words='english',
#                             ngram_range=i,
#                             min_df=j,
#                             max_df=k)
#             df_title  = vectorizer.fit_transform(test_df['text'])
#             X, X_test = df_title[:split], df_title[split:]
#             X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, train_size=0.7, random_state=666)
#             clf = RidgeClassifier(random_state=666).fit(X_train, y_train)
#             f1_results['n_range=' + str(i) + 
#                        ' min_df=' + str(j) + 
#                        ' max_df=' + str(k)] = f1_score(y_holdout, clf.predict(X_holdout))


In [19]:
# pd.DataFrame.from_dict(f1_results, orient='index').rename(columns={0: 'F1'}).sort_values(by='F1', ascending=False)

In [20]:
vectorizer_mod = TfidfVectorizer(analyzer='char',
                            stop_words='english',
                            ngram_range=(3, 4),
                            min_df=5, max_df=0.4)

In [21]:
params = {'alpha': [0.1, 0.5, 1, 2, 5, 10, 15, 20, 30]}

In [22]:
df_title = vectorizer_mod.fit_transform(df['text'])

In [23]:
X, X_test = df_title[:split], df_title[split:]
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, train_size=0.7, random_state=666)

In [24]:
grid = GridSearchCV(RidgeClassifier(random_state=666), param_grid=params, scoring='f1', cv=5).fit(X_train, y_train)
best_clf = grid.best_estimator_

In [25]:
grid.best_params_

{'alpha': 2}

In [26]:
print('F1 score (train) %.3f' % f1_score(y_train, best_clf.predict(X_train)))
print('F1 score (holdout) %.3f' % f1_score(y_holdout, best_clf.predict(X_holdout)))

F1 score (train) 0.896
F1 score (holdout) 0.765


In [27]:
pred = best_clf.predict(X_test)

In [28]:
pd.DataFrame(pred, index=test.index, columns=['target']).to_csv('submission.csv')

- TfidfVectorizer(analyzer='char',stop_words='english', ngram_range=(3, 4),minn_df=5, max_df=0.4)
- RidgeClassifier
- {'alpha': 2}
- F1 score (train) 0.896
- F1 score (holdout) 0.765
- Kaggle 0.80570