# NLP with Disaster tweets


## 1. Import libraries

In [None]:
import re
import string
import nltk
import pandas as pd
import numpy as np

from typing import List
from collections import Counter
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

assert nltk.download("stopwords")

## 2. Exploration analysis


In [2]:
data = pd.read_csv('train.csv')

print(data.shape)
data.head()

(7613, 5)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
data["len"] = data["text"].apply(len)
data.head()

Unnamed: 0,id,keyword,location,text,target,len
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,69
1,4,,,Forest fire near La Ronge Sask. Canada,1,38
2,5,,,All residents asked to 'shelter in place' are ...,1,133
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88


## 3. Change our datatype to numpy

In [4]:
text = data.text.values

assert len(text) == data.shape[0]
text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

## 4. Clean data from trash symbols

In [5]:
def text_clean(text: str) -> List[str]:  # https://docs.python.org/3/library/typing.html
    text_cleaned = re.sub("http:www.*.", "", text)
    text_cleaned = re.sub(f"[{string.punctuation}]","",text_cleaned)
    text_cleaned = re.sub('  ',"",text_cleaned)
    text_cleaned = re.sub('Im',"",text_cleaned)
    
    return text_cleaned

In [6]:
text_cleaned = list(map(text_clean, text))
text[0], text_cleaned[0]

('Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Our Deeds are the Reason of this earthquake May ALLAH Forgive us all')

## 5. Tokenization and removing stop words of the data

In [7]:
text_tok = list(map(nltk.word_tokenize, text_cleaned))

assert " ".join(text_tok[0]) == text_cleaned[0]
text_tok[0]

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 'earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all']

In [8]:
def remove_stopwords(text: str) -> List[str]:
    return [w.lower() for w in text if w.lower() not in nltk.corpus.stopwords.words("english")]

In [9]:
X = list(map(remove_stopwords, text_tok))
text[0], text_cleaned[0], X[0]

('Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
 'Our Deeds are the Reason of this earthquake May ALLAH Forgive us all',
 ['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us'])

In [10]:
y = data.target.values
y[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [11]:
 sum(len(sent) for sent in X)

74777

## 6. Count values

In [12]:
all_words = sum(X, [])
frequent_counter = Counter(all_words)
frequent_dict = dict(sorted(frequent_counter.items(), key=lambda item: -item[1]))

pd.DataFrame(frequent_dict, index=[0]).T.head(10)

Unnamed: 0,0
like,342
amp,297
fire,242
get,227
new,217
via,215
people,193
one,189
dont,184
news,175


## 7. Join words and make a ready dataset

In [13]:
clean_texts = list(map(lambda x: " ".join(x), X))
clean_texts[0]

'deeds reason earthquake may allah forgive us'

In [14]:
clean_df = pd.DataFrame({"text": clean_texts,
                         "target": y})
assert clean_df.shape[0] == data.shape[0]
clean_df.head(5)

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,13000 people receive wildfires evacuation orde...,1
4,got sent photo ruby alaska smoke wildfires pou...,1


## 8. Split the data

In [15]:
train, test = train_test_split(
    clean_df,
    stratify=clean_df.target,
    test_size=.2,
    random_state=7,
    shuffle=True,
)
train.shape[0] / clean_df.shape[0]

0.7999474582950217

## 9. Vectorizing and modeling data

In [16]:
word_v = TfidfVectorizer()
word_v.fit(clean_df.text)
train_word_features = word_v.transform(train.text)
test_word_features = word_v.transform(test.text)


In [17]:
classifier = LogisticRegression()
classifier.fit(train_word_features, train.target)

submission = classifier.predict(test_word_features)

In [18]:
submission

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

## 10. First cross-validation

In [19]:
cross_validate(classifier,test_word_features , test.target)['test_score'].mean()

0.7183218291630716

## 11. Parameters selection

In [113]:
parameters = {'C' : [1.0, 2.0, 3.0, 4.0],
             'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
logreg_cv=GridSearchCV(classifier, param_grid=parameters, cv=10) 



In [114]:
logreg_cv.fit(train_word_features, train.target)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 2.0, 3.0, 4.0],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']})

In [115]:
logreg_cv.best_params_

{'C': 2.0, 'solver': 'lbfgs'}

## 12. New validation score of the train data

In [116]:
logreg_cv.best_score_

0.7949096880131362

## 13. Prediction and cross_validation of the end data

In [118]:
cross_validate(logreg_cv, test_word_features , test.target)['test_score'].mean()

0.7537683347713546

### As we see, the model was improved my parameters selection: from 71,8% to 75,4% of cross-validation. Here are the results of prediction:

In [122]:
clf=logreg_cv.predict(test_word_features)

### Thank you for watching!