In [3]:
import pandas as pd
import numpy as np

import spacy
nlp = spacy.load('en_core_web_lg')
import re
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier,XGBRFClassifier

In [4]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [5]:
train_data.dropna().head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [6]:
null_values = train_data.isnull().sum()
for index in range(len(train_data.columns)):
    if null_values[index] > 0:
        print('{:.2f}% ({}) Null Values Present in "{}" Feature'.format(null_values[index]/len(train_data)*100,
                                                              null_values[index], train_data.columns[index]))

0.80% (61) Null Values Present in "keyword" Feature
33.27% (2533) Null Values Present in "location" Feature


In [7]:
train_data['keyword'].fillna(method = 'backfill', inplace = True)
train_data['keyword'].fillna(method = 'ffill', inplace = True)
test_data['keyword'].fillna(method = 'backfill', inplace = True)
test_data['keyword'].fillna(method = 'ffill', inplace = True)

In [8]:
def decontration(text):
    text = re.sub(r"aren't", 'are not', text)
    text = re.sub(r"won't", 'will not', text)
    text = re.sub(r"doesn't", 'does not', text)
    
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text.lower()

In [10]:
def cleaning_text(text):
    text = re.sub(r'http\S+', '', text)
    text = decontration(text)
    text  = re.sub('[^A-Za-z\s]+', '', text)
    
    stop_words = set(stopwords.words('english'))
    processed_list = [word for word in text.split() if word not in stop_words and len(word) > 2]
    res = nlp(" ".join(processed_list))
    q = []
    for token in res:
        q.append(token.lemma_.lower())
    return ' '.join(q)

In [11]:
preprocessed_text = []
for text in tqdm(train_data['text']):
    preprocessed_text.append(cleaning_text(text))
train_data['text'] = preprocessed_text

preprocessed_text = []
for text in tqdm(test_data['text']):
    preprocessed_text.append(cleaning_text(text))
test_data['text'] = preprocessed_text

100%|█████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:58<00:00, 131.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:25<00:00, 126.90it/s]


In [12]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,ablaze,,deed reason earthquake may allah forgive,1
1,4,ablaze,,forest fire near ronge sask canada,1
2,5,ablaze,,resident ask ishelter place notify officer eva...,1
3,6,ablaze,,people receive wildfire evacuation order calif...,1
4,7,ablaze,,got send photo ruby alaska smoke wildfire pour...,1
...,...,...,...,...,...
7608,10869,wrecked,,two giant crane hold bridge collapse nearby home,1
7609,10870,wrecked,,ariaahrary thetawniest control wild fire calif...,1
7610,10871,wrecked,,utckm volcano hawaii,1
7611,10872,wrecked,,police investigate ebike collide car little po...,1


In [13]:
X = train_data[['keyword','text']]
y = train_data['target']
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [186]:
# vectorizer = CountVectorizer()
# X_train_text = vectorizer.fit_transform(X_train)
# X_test_text = vectorizer.transform(X_test)
# test_text = vectorizer.transform(test_data['text']) 

# print("Vectorized Training Text Data Shape    : ", X_train_text.shape)
# print("Vectorized Testing Text Data Shape     : ", X_test_text.shape)
# print("Vectorized Real Testing Text Shape     : ", test_text.shape)

In [187]:
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_text)
# X_train_tfidf.shape

In [188]:
# X_test_tfidf = tfidf_transformer.transform(test_text)
# X_test_tfidf.shape

In [14]:
vector = TfidfVectorizer()
X_train_text = vector.fit_transform(X_train['text'])
X_train_text.shape

(6090, 12444)

In [15]:
print(X_train_text)

  (0, 8667)	0.39464058355127507
  (0, 1293)	0.25445639210153004
  (0, 715)	0.3089955241472506
  (0, 12278)	0.4078339249993441
  (0, 5694)	0.31132912387559564
  (0, 8449)	0.33894761715159555
  (0, 3959)	0.28847607882002063
  (0, 3103)	0.4078339249993441
  (0, 7525)	0.2373725785172763
  (1, 3337)	0.2856517806432486
  (1, 11636)	0.19865501840724725
  (1, 3795)	0.2606608021498507
  (1, 10417)	0.21381598473746977
  (1, 2019)	0.24048421265617012
  (1, 1177)	0.2922252678224647
  (1, 3282)	0.3106427591366465
  (1, 1196)	0.2637979417242657
  (1, 871)	0.3106427591366465
  (1, 6706)	0.3106427591366465
  (1, 1056)	0.3106427591366465
  (1, 3004)	0.17824853848067457
  (1, 4044)	0.36700416936999625
  (2, 4511)	0.2305319690345914
  (2, 8563)	0.30209053677758224
  (2, 4816)	0.31676619785168864
  :	:
  (6087, 4422)	0.5176816948017814
  (6087, 4400)	0.24406621799983536
  (6087, 1293)	0.2912294961863607
  (6088, 5700)	0.3966909527250946
  (6088, 8462)	0.3966909527250946
  (6088, 3303)	0.3788617935206132
 

In [16]:
X_test_text = vector.transform(X_test['text'])
X_test_text.shape

(1523, 12444)

In [17]:
X_testdata_text = vector.transform(test_data['text'])
X_testdata_text.shape

(3263, 12444)

In [18]:
X_train_kw = vector.fit_transform(X_train['keyword'])
X_train_kw.shape

(6090, 239)

In [19]:
X_test_kw = vector.transform(X_test['keyword'])
X_test_kw.shape

(1523, 239)

In [20]:
X_testdata_kw = vector.transform(test_data['keyword'])
X_testdata_kw.shape

(3263, 239)

In [21]:
X_train_final = np.hstack((X_train_text.toarray(), X_train_kw.toarray()))
X_test_final = np.hstack((X_test_text.toarray(),X_test_kw.toarray()))
testing_data = np.hstack((X_testdata_text.toarray(),X_testdata_kw.toarray()))

In [22]:
X_train_final

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
# X_train_final

In [23]:
parameters = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
sgd_clf = SGDClassifier(class_weight='balanced', penalty='l2', loss='log', random_state=910)
clf = GridSearchCV(sgd_clf, parameters, n_jobs = -1, cv = 5, scoring = make_scorer(f1_score))
clf.fit(X_train_final, y_train)
clf.best_params_



{'alpha': 0.0001}

In [24]:
sgd_clf = SGDClassifier(alpha = 0.001, class_weight='balanced', penalty='l2', loss='log', random_state=910)
sgd_clf.fit(X_train_final, y_train)



In [25]:
# train_preds = sgd_clf.predict(X_train_tfidf)
test_preds = sgd_clf.predict(X_test_final)

# print("Train Score ", f1_score(y_train, train_preds))
print('Test Score ', f1_score(y_test, test_preds))

Test Score  0.7626004382761139


In [26]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':sgd_clf.predict(testing_data)})
submission_file.to_csv("submission_file.csv", index=False)

In [215]:
Gnb_clf = GaussianNB()
Gnb_clf.fit(X_train_final, y_train)
test_preds_gnb = Gnb_clf.predict(X_test_final)

In [216]:
print('Test Score ', f1_score(y_test, test_preds_gnb))

Test Score  0.6165413533834586


In [217]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':Gnb_clf.predict(testing_data)})
submission_file.to_csv("submission_filegnb.csv", index=False)

In [27]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_final, y_train)
test_preds_lr = lr_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_lr))

Test Score  0.7664


In [219]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':lr_clf.predict(testing_data)})
submission_file.to_csv("submission_filelr.csv", index=False)

In [28]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_final, y_train)
test_preds_dt = dt_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_dt))

Test Score  0.6988505747126438


In [29]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train_final, y_train)
test_preds_rf = rf_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_rf))

Test Score  0.7283236994219653


In [223]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':rf_clf.predict(testing_data)})
submission_file.to_csv("submission_filerf.csv", index=False)

In [30]:
gd_clf = GradientBoostingClassifier()
gd_clf.fit(X_train_final, y_train)
test_preds_gd = gd_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_gd))

Test Score  0.611764705882353


In [31]:
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train_final, y_train)
test_preds_ada = ada_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_ada))

Test Score  0.6625441696113074


In [32]:
xg_clf = XGBClassifier()
xg_clf.fit(X_train_final, y_train)
test_preds_xg = xg_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_xg))

Test Score  0.6901408450704226


In [33]:
xgrf_clf = XGBRFClassifier()
xgrf_clf.fit(X_train_final, y_train)
test_preds_xgrf = xgrf_clf.predict(X_test_final)
print('Test Score ', f1_score(y_test, test_preds_xgrf))

Test Score  0.33414634146341465


In [34]:
parameters = {
    'penalty' : ['l1','l2'], 
    'C'       : np.logspace(-3,3,7)
}

In [36]:
logreg = LogisticRegression()
clflr = GridSearchCV(logreg,                    # model
                   param_grid = parameters,   # hyperparameters
                   scoring='f1',        # metric for scoring
                   cv=10)            

In [37]:
clflr.fit(X_train_final, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [38]:
print("Tuned Hyperparameters :", clflr.best_params_)
print("f1_score :",clflr.best_score_)

Tuned Hyperparameters : {'C': 1.0, 'penalty': 'l2'}
f1_score : 0.7374336728789721


In [39]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':clflr.predict(testing_data)})
submission_file.to_csv("submission_filegslr.csv", index=False)

In [265]:
param_grid = {
              'n_estimators':[90,100],
              'criterion':['gini','entropy'],
              'max_depth': [3,5,7],
              'min_samples_leaf': [3,5,7],
              'min_samples_split': [3,5,7],
              'max_features':['auto']
}

In [266]:
random = RandomForestClassifier()
clfrf = GridSearchCV(random,                    # model
                   param_grid = param_grid,   # hyperparameters
                           # metric for scoring
                   cv=10,
                    n_jobs=-1,
                    verbose = 2)            

In [267]:
clfrf.fit(X_train_final, y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


  warn(


In [268]:
print("Tuned Hyperparameters :", clfrf.best_params_)
print("f1_score :",clfrf.best_score_)

Tuned Hyperparameters : {'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 90}
f1_score : 0.22114941551970607
