# 1. Import models

In [2]:
# for loading and preprocessing the data
import torch
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

# for training the model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model

# for evaluating classification model
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score,f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 2. Load data

In [3]:
import os

for dirname, _, filenames in os.walk('TFIDFInput'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.set_option('display.width',1000000)
pd.set_option('display.max_columns', 500)

score_df = pd.DataFrame(columns={'Model Description','Score'})
# Any results you write to the current directory are saved as output.

test = pd.read_csv("TFIDFInput/test.csv")

TFIDFInput/train_added.csv
TFIDFInput/train.csv
TFIDFInput/sample_submission.csv
TFIDFInput/test.csv


## 2.1 Check Data

In [4]:
print(test.shape)
print(test.isnull().any())

(3263, 4)
id          False
keyword      True
location     True
text        False
dtype: bool


# 3. Training

## 3.3 Processing text

In [5]:
# data prepocessing with regrex

def remove_URL(text): # remove url pattern in text
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)

def remove_html(text): # remove html pattern in text
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return html.sub(r'', text)
    #return re.sub(html, '', text)

def remove_punct(text): # remove punctuation in text: (;, ', ", :, ., , etc..)
  table = str.maketrans('', '', string.punctuation)
  return text.translate(table)

In [6]:
def clean_text(text):
    import re
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"i'll", "i will", text)
    text = re.sub(r"she'll", "she will", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"here's", "here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    return text


In [7]:
def massage_text(text):  # 이건 안하는 게 성능 향상에 좋다.
    import re
    from nltk.corpus import stopwords
    ## remove anything other then characters and put everything in lowercase
    tweet = re.sub("[^a-zA-Z]", ' ', text)
    tweet = tweet.lower()
    tweet = tweet.split()
    
    from nltk.stem import WordNetLemmatizer
    lem = WordNetLemmatizer()
    tweet = [lem.lemmatize(word) for word in tweet
             if word not in set(stopwords.words('english'))]
    tweet = ' '.join(tweet)
    return tweet
    print('--here goes nothing')
    print(text)
    print(tweet)


In [8]:
test['clean_text'] = test['text'].apply(lambda x: remove_URL(x))
test['clean_text'] = test['text'].apply(lambda x: remove_html(x))
test['clean_text'] = test['text'].apply(lambda x: remove_punct(x))
test['clean_text'] = test['text'].apply(clean_text)


## 3.4 Check processed data

In [9]:
test.iloc[-10:-1][['text','clean_text']]

Unnamed: 0,text,clean_text
3253,Malaysian PM confirms debris is from missing f...,malaysian pm confirms debris is from missing f...
3254,Officials: Alabama home quarantined over possi...,officials: alabama home quarantined over possi...
3255,See the 16yr old PKK suicide bomber who detona...,see the 16yr old pkk suicide bomber who detona...
3256,To conference attendees! The blue line from th...,to conference attendees! the blue line from th...
3257,The death toll in a #IS-suicide car bombing on...,the death toll in a #is-suicide car bombing on...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles ûò safety faste...
3259,Storm in RI worse than last hurricane. My city...,storm in ri worse than last hurricane. my city...
3260,Green Line derailment in Chicago http://t.co/U...,green line derailment in chicago http://t.co/u...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issues hazardous weather outlook (hwo) htt...


# 4. Training with processed data

## 4.1 TFIDF  test

In [10]:
# X_train, X_test, y_train, y_test = train_test_split(test['clean_text'], test['target'], test_size=0.1, random_state=20)

# # Apply Tfidf tranformation except message_text() process
# vector = [
#     TfidfVectorizer(), # F1 score: 0.7584059775840598
#     TfidfVectorizer(max_features = 1000), # F1 score: 0.7444514901712113
#     TfidfVectorizer(max_features = 2000), # F1 score: 0.7542857142857142
#     TfidfVectorizer(max_features = 3000), #  F1 score: 0.7531645569620253
#     TfidfVectorizer(max_features = 10000), # F1 score: 0.7626800250469631 --> best
#     TfidfVectorizer(max_features = 20000) # F1 score: 0.7584059775840598
#     TfidfVectorizer(ngram_range =(1,2)), 
#     TfidfVectorizer(ngram_range =(1,3))
# ]

# # Apply Tfidf tranformation except message_text() process + split tune
# # split(test_size=0.2, stratify=train['target'], random_state=20)
# vector = [
#     TfidfVectorizer(), # F1 score: 0.7348912167606769
#     TfidfVectorizer(max_features = 1000), # F1 score: 0.7235099337748343
#     TfidfVectorizer(max_features = 2000), # F1 score: 0.7358024691358025
#     TfidfVectorizer(max_features = 3000), #  F1 score: 0.734860883797054
#     TfidfVectorizer(max_features = 10000), # F1 score: 0.7408013082583811
#     TfidfVectorizer(max_features = 20000) # F1 score: 0.7354838709677419
# ]


# # Apply Tfidf tranformation except message_text() process + split tune
# # split(test_size=0.2, random_state=20)
# vector = [
#     TfidfVectorizer(), # F1 score: F1 score: 0.7630331753554502   --> best
#     TfidfVectorizer(max_features = 20000) # F1 score: 0.7630331753554502 --> best
# ]

# Apply Tfidf tranformation except message_text() process + split tune
# split(test_size=0.1, random_state=20)
# vector = [
#     TfidfVectorizer(), # F1 score: F1 score: 0.7682737169517885  
#     TfidfVectorizer(max_features = 20000), # F1 score: 0.7701863354037266 --> best
#     TfidfVectorizer(max_features = 30000)  # F1 score: 0.7682737169517885
# ]

# # Apply Tfidf tranformation except message_text() process + split tune
# # split(test_size=0.15, random_state=20)
# vector = [
#     TfidfVectorizer(), # F1 score: 0.7643979057591623
#     TfidfVectorizer(max_features = 10000), # F1 score: 0.7660020986358866
#     TfidfVectorizer(max_features = 20000), # F1 score: 0.7635983263598327
#     TfidfVectorizer(max_features = 30000)  # F1 score: 0.7643979057591623
# ]

# # Apply Tfidf tranformation except message_text() process + split tune
# # split(test_size=0.05, random_state=20)
# vector = [
#     TfidfVectorizer(), # F1 score: F1 score: 0.7658227848101266
# ]

# # Apply Tfidf tranformation except message_text(), clean_text() process
# vector = [
#     TfidfVectorizer(), # F1 score: 0.7615433270082226
#     TfidfVectorizer(max_features = 1000), # F1 score: 0.7399872854418309
#     TfidfVectorizer(max_features = 2000), # F1 score: 0.7523629489603024
#     TfidfVectorizer(max_features = 3000), #  F1 score: 0.7504714016341923
#     TfidfVectorizer(max_features = 10000) # F1 score: 0.7574039067422811
# ]

# X_train_vector = vector.transform(X_train)
# X_test_vector  = vector.transform(X_test)


# 4.2 Model test

In [24]:
## pre processed data for all method
# MLA = [
#     linear_model.RidgeClassifierCV(alphas=[1e-3]),     # F1 score: 0.7113702623906706
#     linear_model.RidgeClassifierCV(alphas=[1e-2]),     # F1 score: 0.7109283196239718
#     linear_model.RidgeClassifierCV(alphas=[1e-1]),     # F1 score: 0.7368421052631579
#     linear_model.RidgeClassifierCV(alphas=[1]),        # F1 score: 0.741738066095471
#     linear_model.RidgeClassifierCV(normalize=True),    # F1 score: 0.7209154481881755
#     linear_model.RidgeClassifierCV(cv=5),              # F1 score: 0.7417380660954712
#     linear_model.RidgeClassifierCV(alphas=[1], cv=5),  # F1 score: 0.7417380660954712
#     linear_model.RidgeClassifierCV(alphas=[1], normalize=True),  #F1 score: 0.7209154481881755
#     linear_model.RidgeClassifierCV(alphas=[1], cv=10), # F1 score: 0.7417380660954712
#     linear_model.RidgeClassifierCV(alphas=[10], cv=5) # F1 score: 0.707057256990679
# ]

## pre processed data except message_text()
# MLA = [
#     linear_model.RidgeClassifierCV(alphas=[1e-3]),     # F1 score: 0.7272727272727273
#     linear_model.RidgeClassifierCV(alphas=[1e-2]),     # F1 score: 0.7326615293420273
#     linear_model.RidgeClassifierCV(alphas=[1e-1]),     # F1 score: 0.7485029940119761
#     linear_model.RidgeClassifierCV(alphas=[1])        # F1 score: 0.7584059775840598 -> best
#     linear_model.RidgeClassifierCV(normalize=True),    # F1 score: 0.7189119170984455
#     linear_model.RidgeClassifierCV(alphas=[1], cv=5),  # F1 score: 0.7584059775840598
#     linear_model.RidgeClassifierCV(alphas=[1], normalize=True),  # 0.7189119170984455
#     linear_model.RidgeClassifierCV(alphas=[10], cv=5) # F1 score: 0.7264276228419655
# ]


In [45]:
X_test = test['clean_text']

import joblib
loaded_vector = joblib.load(open("./weights/vector002-2021-11-13-09-F1-077018.pkl", "rb"))

# import pickle
# loaded_model = pickle.load(open("./weights/model002-2021-11-13-08-F1-077018.pkl"))
loaded_model = pickle.load(open("./weights/model002-2021-11-13-09-F1-077018.pkl", "rb"))

vec = loaded_vector
X_test_vector  = vec.transform(X_test)

# print(X_test_vector)

# result = loaded_model.predict(X_test_vector)
res = loaded_model.predict(X_test_vector.todense())

df = pd.DataFrame(test['id'])
df['target'] = res
df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [48]:
df.to_csv('Disaster_tweet_submission.csv', index=False)

check = pd.read_csv('Disaster_tweet_submission.csv')

check.head()

check.target.value_counts()

0    2031
1    1232
Name: target, dtype: int64