In [1]:
import pandas as pd
import numpy as np

import re
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [8]:
train_data['keyword'].fillna(method = 'backfill').unique()

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'derailed

In [5]:
train_data['keyword'].

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [28]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [29]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [30]:
train_data.dropna().head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [31]:
train_data['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [32]:
null_values = train_data.isnull().sum()
for index in range(len(train_data.columns)):
    if null_values[index] > 0:
        print('{:.2f}% ({}) Null Values Present in "{}" Feature'.format(null_values[index]/len(train_data)*100,
                                                              null_values[index], train_data.columns[index]))

0.80% (61) Null Values Present in "keyword" Feature
33.27% (2533) Null Values Present in "location" Feature


In [33]:
train_data['keyword'].fillna(method = 'backfill', inplace = True)
train_data['keyword'].fillna(method = 'ffill', inplace = True)
test_data['keyword'].fillna(method = 'backfill', inplace = True)
test_data['keyword'].fillna(method = 'ffill', inplace = True)

In [34]:
test_data['keyword'].value_counts()

ablaze                   29
deluged                  23
rubble                   22
demolished               22
wrecked                  22
                         ..
fatalities                5
radiation%20emergency     5
threat                    5
inundation                4
epicentre                 1
Name: keyword, Length: 221, dtype: int64

In [35]:
def decontration(text):
    text = re.sub(r"aren't", 'are not', text)
    text = re.sub(r"won't", 'will not', text)
    text = re.sub(r"doesn't", 'does not', text)
    
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text.lower()

In [36]:
def cleaning_text(text):
    text = re.sub(r'http\S+', ' ', text)
    text = decontration(text)
    text  = re.sub('[^A-Za-z,0123]+', ' ', text)
    
    stop_words = set(stopwords.words('english'))
    processed_list = [word for word in text.split() if word not in stop_words and len(word) > 2]
    return " ".join(processed_list)

In [37]:
preprocessed_text = []
for text in tqdm(train_data['text']):
    preprocessed_text.append(cleaning_text(text))
train_data['text'] = preprocessed_text

preprocessed_text = []
for text in tqdm(test_data['text']):
    preprocessed_text.append(cleaning_text(text))
test_data['text'] = preprocessed_text

100%|████████████████████████████████████████████████████████████████████████████| 7613/7613 [00:02<00:00, 3324.34it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3263/3263 [00:00<00:00, 3379.57it/s]


In [38]:
train_data['keyword']

0        ablaze
1        ablaze
2        ablaze
3        ablaze
4        ablaze
         ...   
7608    wrecked
7609    wrecked
7610    wrecked
7611    wrecked
7612    wrecked
Name: keyword, Length: 7613, dtype: object

In [39]:
X = train_data[['keyword', 'text']]
y = train_data['target']
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [40]:
X_train

Unnamed: 0,keyword,text
3750,fire,put flood fire top list
3627,fatalities,rejected mortal kombat fatalities mortal komba...
2271,demolish,enugu government demolish illegal structures i...
5717,rescuers,woman gps app guides rescuers injured biker ma...
1781,crash,bought another meinlcymbals medium crash hey m...
...,...,...
6490,sunk,lip sunk bed arms crossed behind head watched ...
7220,weapons,sorry resort weapons settle something pussy wi...
5586,razed,news latest homes razed northern california wi...
7383,windstorm,like weird ones like rain mystical windstorm o...


In [41]:
vectorizer = CountVectorizer(min_df=3)
X_train_text = vectorizer.fit_transform(X_train['text'])
X_test_text = vectorizer.transform(X_test['text'])
test_text = vectorizer.transform(test_data['text']) 

print("Vectorized Training Text Data Shape    : ", X_train_text.shape)
print("Vectorized Testing Text Data Shape     : ", X_test_text.shape)
print("Vectorized Real Testing Text Shape     : ", test_text.shape)

Vectorized Training Text Data Shape    :  (6090, 3463)
Vectorized Testing Text Data Shape     :  (1523, 3463)
Vectorized Real Testing Text Shape     :  (3263, 3463)


In [42]:
vectorizer = CountVectorizer()
X_train_keyword = vectorizer.fit_transform(X_train['keyword'])
X_test_keyword = vectorizer.transform(X_test['keyword']) 
test_keyword = vectorizer.transform(test_data['keyword']) 

print("Vectorized Training Data Shape    : ", X_train_keyword.shape)
print("Vectorized Testing Data Shape     : ", X_test_keyword.shape)
print("Vectorized Testing Data Shape     : ", test_keyword.shape)

Vectorized Training Data Shape    :  (6090, 239)
Vectorized Testing Data Shape     :  (1523, 239)
Vectorized Testing Data Shape     :  (3263, 239)


In [43]:
print(X_train_keyword)

  (0, 131)	1
  (1, 128)	1
  (2, 91)	1
  (3, 186)	1
  (4, 77)	1
  (5, 111)	1
  (6, 210)	1
  (7, 151)	1
  (8, 124)	1
  (9, 191)	1
  (10, 100)	1
  (11, 30)	1
  (12, 233)	1
  (13, 105)	1
  (14, 228)	1
  (15, 195)	1
  (16, 233)	1
  (17, 233)	1
  (18, 168)	1
  (18, 21)	1
  (19, 143)	1
  (20, 147)	1
  (21, 79)	1
  (22, 121)	1
  (23, 161)	1
  :	:
  (6068, 57)	1
  (6068, 2)	1
  (6069, 169)	1
  (6070, 111)	1
  (6071, 64)	1
  (6072, 100)	1
  (6073, 89)	1
  (6074, 114)	1
  (6075, 189)	1
  (6076, 225)	1
  (6076, 25)	1
  (6077, 67)	1
  (6078, 234)	1
  (6079, 168)	1
  (6079, 11)	1
  (6080, 183)	1
  (6081, 169)	1
  (6082, 34)	1
  (6083, 178)	1
  (6084, 197)	1
  (6085, 206)	1
  (6086, 229)	1
  (6087, 182)	1
  (6088, 233)	1
  (6089, 136)	1


In [44]:
X_train_final = np.hstack((X_train_text.toarray(), X_train_keyword.toarray()))
X_test_final = np.hstack((X_test_text.toarray(), X_test_keyword.toarray()))
testing_data = np.hstack((test_text.toarray(), test_keyword.toarray()))

In [50]:
X_test_final.shape

(1523, 3702)

In [74]:
parameters = {'alpha' : [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}
sgd_clf = SGDClassifier(class_weight='balanced', penalty='l2', loss='log', random_state=910)
clf = GridSearchCV(sgd_clf, parameters, n_jobs = -1, cv = 5, scoring = make_scorer(f1_score))
clf.fit(X_train_final, y_train)
clf.best_params_



{'alpha': 0.001}

In [75]:
sgd_clf = SGDClassifier(alpha = 0.001, class_weight='balanced', penalty='l2', loss='log', random_state=910)
sgd_clf.fit(X_train_final, y_train)



In [76]:
train_preds = sgd_clf.predict(X_train_final)
test_preds = sgd_clf.predict(X_test_final)

print("Train Score ", f1_score(y_train, train_preds))
print('Test Score ', f1_score(y_test, test_preds))

Train Score  0.8423722200546235
Test Score  0.7824109173616376


In [77]:
submission_file = pd.DataFrame({'id':test_data['id'], 'target':sgd_clf.predict(testing_data)})
submission_file.to_csv("submission_file.csv", index=False)