# Natural Language Processing with Disaster Tweets

## 1. 라이브러리 임포트

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
stop=set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## 2. 데이터 로드

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
tweet= pd.read_csv('/content/drive/My Drive/Colab Notebooks/train.csv')
test=pd.read_csv('/content/drive/My Drive/Colab Notebooks/test.csv')
tweet.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [7]:
print('Train - Rows : {}, Columns : {}'.format(tweet.shape[0],tweet.shape[1]))
print('Test - Rows : {}, Columns : {}'.format(test.shape[0],test.shape[1]))

Train - Rows : 7613, Columns : 5
Test - Rows : 3263, Columns : 4


## 3. 텍스트 데이터 클렌징

In [8]:
df=pd.concat([tweet,test])
df.shape

(10876, 5)

### 3.1 URL 제거

In [9]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

In [10]:
df['text']=df['text'].apply(lambda x : remove_URL(x))

### 3.2 HTML 태그 제거

In [11]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

In [12]:
df['text']=df['text'].apply(lambda x : remove_html(x))

### 3.3 이모티콘 제거

In [13]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [14]:
df['text']=df['text'].apply(lambda x: remove_emoji(x))

### 3.4 구둣점 제거

In [15]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

In [16]:
df['text']=df['text'].apply(lambda x : remove_punct(x))

## 4. 텍스트 임베딩 : GloVe pretrained corpus model 활용

In [17]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus       

In [18]:
corpus=create_corpus(df)

100%|██████████| 10876/10876 [00:01<00:00, 7416.54it/s]


In [19]:
embedding_dict={}
with open('/content/drive/My Drive/Colab Notebooks/glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [20]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [21]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 20342


In [22]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec
            

100%|██████████| 20342/20342 [00:00<00:00, 482950.96it/s]


## 5. 딥러닝 모델 수행

In [24]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                    input_length=MAX_LEN,trainable=False)
# 임베딩 레이어 추가
model.add(embedding)
# Dropout 정규화 적용
model.add(SpatialDropout1D(0.2))
# 64개의 내부 유닛으로 구성된 LSTM 레이어 추가
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
# 검증된 Sigmoid 활성함수 사용
model.add(Dense(1, activation='sigmoid'))
# 검증된 Adam 최적화함수 사용
optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [25]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2034300   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,076,605
Trainable params: 42,305
Non-trainable params: 2,034,300
_________________________________________________________________


In [26]:
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

In [29]:
# Training 데이터를 분할해 15%의 데이터를 Validation 데이터로 활용
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)
print('Shape of train', X_train.shape)
print("Shape of validation ", X_test.shape)

Shape of train (6471, 50)
Shape of validation  (1142, 50)


In [30]:
# 적절한 epoch 지정
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Epoch 1/15
1618/1618 - 104s - loss: 0.6910 - accuracy: 0.5726 - val_loss: 0.6885 - val_accuracy: 0.5569
Epoch 2/15
1618/1618 - 77s - loss: 0.6195 - accuracy: 0.6645 - val_loss: 0.5604 - val_accuracy: 0.7671
Epoch 3/15
1618/1618 - 76s - loss: 0.5445 - accuracy: 0.7480 - val_loss: 0.5342 - val_accuracy: 0.7688
Epoch 4/15
1618/1618 - 76s - loss: 0.5242 - accuracy: 0.7551 - val_loss: 0.5183 - val_accuracy: 0.7715
Epoch 5/15
1618/1618 - 77s - loss: 0.5142 - accuracy: 0.7622 - val_loss: 0.5074 - val_accuracy: 0.7758
Epoch 6/15
1618/1618 - 78s - loss: 0.5036 - accuracy: 0.7710 - val_loss: 0.4994 - val_accuracy: 0.7793
Epoch 7/15
1618/1618 - 77s - loss: 0.4993 - accuracy: 0.7756 - val_loss: 0.4942 - val_accuracy: 0.7837
Epoch 8/15
1618/1618 - 77s - loss: 0.4928 - accuracy: 0.7772 - val_loss: 0.4925 - val_accuracy: 0.7785
Epoch 9/15
1618/1618 - 75s - loss: 0.4895 - accuracy: 0.7812 - val_loss: 0.4882 - val_accuracy: 0.7828
Epoch 10/15
1618/1618 - 76s - loss: 0.4828 - accuracy: 0.7830 - val_loss

## (참고) Kaggle 제출용 코드

In [31]:
# sample_sub=pd.read_csv('/content/drive/My Drive/Colab Notebooks/sample_submission.csv')

In [32]:
# y_pre=model.predict(test)
# y_pre=np.round(y_pre).astype(int).reshape(3263)
# sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
# sub.to_csv('submission.csv',index=False)

In [33]:
# sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
