In [51]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/linqisheng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/linqisheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/linqisheng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
disaster = pd.read_csv('nlp-getting-started/train.csv')

In [4]:
disaster.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


> ## add features


> ### cleaning

In [5]:
text_data = np.array(disaster[['id','text']])

In [6]:
lemmatizer = WordNetLemmatizer()
analyser = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english')) 

In [7]:
text_data[0:5,:]

array([[1,
        'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
       [4, 'Forest fire near La Ronge Sask. Canada'],
       [5,
        "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
       [6,
        '13,000 people receive #wildfires evacuation orders in California '],
       [7,
        'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ']],
      dtype=object)

In [8]:
# Removes words that are not needed. Lemmaizes words
def clean_text(arr):
    cleaned_texts = []
    for row in arr:
        text = "".join((char for char in row[1] if char not in string.punctuation))
        text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'^http?:\/\/.*[\r\n]*', '', row[1], flags=re.MULTILINE)
        text = word_tokenize(text)
        text = [lemmatizer.lemmatize(w.lower()) for w in text if not w in stop_words] 
        cleaned_texts.append(text)
        
    return np.array(cleaned_texts)

In [9]:
text_clean = clean_text(text_data)

In [10]:
disaster['clean_text'] = text_clean

> ### sentiment analysis

In [11]:
# Obtains sentiment score
def sentiment(arr):
    scores = []
    for row in arr:
        text = re.sub(r'^https?:\/\/.*[\r\n]*', '', row[1], flags=re.MULTILINE)
        text = re.sub(r'^http?:\/\/.*[\r\n]*', '', row[1], flags=re.MULTILINE)
        scores.append(list(analyser.polarity_scores(text).values()))
    return np.array(scores)

In [12]:
text_sentiment = sentiment(text_data)

In [13]:
positive = [t_s[0] for t_s in text_sentiment]
neutral = [t_s[1] for t_s in text_sentiment]
negative = [t_s[2] for t_s in text_sentiment]
compound = [t_s[3] for t_s in text_sentiment]

In [14]:
disaster['sentiment_score_positive'] = positive
disaster['sentiment_score_neutral'] = neutral
disaster['sentiment_score_negative'] = negative
disaster['sentiment_score_compound'] = compound

In [15]:
disaster.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,sentiment_score_positive,sentiment_score_neutral,sentiment_score_negative,sentiment_score_compound
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our, deed, reason, #, earthquake, may, allah,...",0.0,0.851,0.149,0.2732
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, ., canada]",0.286,0.714,0.0,-0.34
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all, resident, asked, 'shelter, place, ', not...",0.095,0.905,0.0,-0.296
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfire, evacuat...",0.0,1.0,0.0,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, photo, ruby, #, alaska, smok...",0.0,1.0,0.0,0.0


In [16]:
# word_count
disaster['word_count'] = disaster['clean_text'].apply(lambda x: len(x))
# hashtag_count
disaster['hashtag_count'] = disaster['clean_text'].apply(lambda x: len([c for c in x if c == '#']))
#df_test['hashtag_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

# mention_count
disaster['mention_count'] = disaster['clean_text'].apply(lambda x: len([c for c in x if c == '@']))
#df_test['mention_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

In [17]:
cleaned_text = text_data

In [18]:
cleaned_text

array([[1,
        'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'],
       [4, 'Forest fire near La Ronge Sask. Canada'],
       [5,
        "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"],
       ...,
       [10871,
        'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ'],
       [10872,
        'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.'],
       [10873,
        'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d']],
      dtype=object)

In [19]:
def clean_text_tfidf(arr):
    cleaned_texts = []
    for row in arr:
        text = "".join((char for char in row[1] if char not in string.punctuation))
        text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'^http?:\/\/.*[\r\n]*', '', row[1], flags=re.MULTILINE)
        text = word_tokenize(text)
        text = [lemmatizer.lemmatize(w.lower()) for w in text if not w in stop_words]
        text = ' '.join(text)
        text = text.replace('# ','#')
        cleaned_texts.append(text)
        
    return np.array(cleaned_texts)

In [20]:
cleaned_text = clean_text_tfidf(cleaned_text)

In [21]:
cleaned_text

array(['our deed reason #earthquake may allah forgive u',
       'forest fire near la ronge sask . canada',
       "all resident asked 'shelter place ' notified officer . no evacuation shelter place order expected",
       ...,
       'm1.94 [ 01:04 utc ] ? 5km s volcano hawaii . http : //t.co/zdtoyd8ebj',
       'police investigating e-bike collided car little portugal . e-bike rider suffered serious non-life threatening injury .',
       'the latest : more home razed northern california wildfire - abc news http : //t.co/ymy4rskq3d'],
      dtype='<U195')

In [22]:
# put it in tfidf vectorizer
tfidf = TfidfVectorizer(min_df = 5, ngram_range = (2,2))
features = tfidf.fit_transform(cleaned_text)

In [23]:
features_df = pd.DataFrame(features.todense(),columns = tfidf.get_feature_names())

In [24]:
features_df.shape

(7613, 1227)

In [25]:
disaster.reset_index(drop=True, inplace=True)
features_df.reset_index(drop=True, inplace=True)

In [26]:
disaster = pd.concat([disaster,features_df],axis = 1)

In [27]:
disaster.shape

(7613, 1240)

In [28]:
disaster.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'clean_text',
       'sentiment_score_positive', 'sentiment_score_neutral',
       'sentiment_score_negative', 'sentiment_score_compound',
       ...
       'youtube video', 'z10 full', 'û_ http', 'ûª http', 'ûª israel',
       'ûªs first', 'ûªs stock', 'ûªt let', 'ûªve home', 'ûïwhen saw'],
      dtype='object', length=1240)

In [29]:
#disaster_drop = disaster.drop(['id','keyword','location','text'],axis = 1)

In [30]:
disaster_drop = disaster.drop(['id','keyword','location','text','clean_text'],axis = 1)

### encode sentence

In [31]:
clean_text_list = []
for t in cleaned_text:
    clean_text_list.extend(t.split(' '))

In [32]:
clean_list = []
for t in cleaned_text:
    lst = t.split(' ')
    clean_list.append(lst)

In [33]:
clean_list[0]

['our', 'deed', 'reason', '#earthquake', 'may', 'allah', 'forgive', 'u']

In [34]:
from collections import Counter
counts = Counter(clean_text_list)

In [35]:
len(counts.keys())

22451

In [36]:
for word in list(counts):
    if counts[word] < 2:
        del counts[word]

In [37]:
len(counts.keys())

6494

In [38]:
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [39]:
def encode_sentence(text, vocab2index, N=20, padding_start=True):
    # text = clean_list[i]
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in text])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [40]:
encode_sentence(clean_list[0], vocab2index, N=40, padding_start=False)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32), 8)

### More on Hashtags

In [28]:
def hashtags(cleaned_texts):
    hashtags = []
    for row in cleaned_text:
        text = re.findall(r"#(\w+)", row)
        hashtags.append(text)
        
    return np.array(hashtags)

In [29]:
## Analyzes Hashtags
hashtags_col = pd.Series(hashtags(cleaned_text))
dank = pd.concat([hashtags_col,disaster.target],axis=1).reset_index()

In [30]:
dank.columns

Index(['index', 0, 'target'], dtype='object')

In [31]:
dank.loc[dank['target']==1][0].value_counts()

[]                       2419
[hot, prebreak, best]      13
[news]                     12
[worldnews]                 8
[hiroshima]                 8
                         ... 
[163]                       1
[wish]                      1
[theneeds, recipe]          1
[socal, realhiphop]         1
[fedex]                     1
Name: 0, Length: 660, dtype: int64

In [32]:
dank.loc[dank['target']==0][0].value_counts()

[]                         3470
[hot, prebreak, best]        17
[gbbo]                       11
[nowplaying]                 10
[beyhive]                     8
                           ... 
[pp15000266818, pdx911]       1
[st]                          1
[unfml, deluge]               1
[kca, votejkt48id]            1
[poster, ergo, cuff]          1
Name: 0, Length: 698, dtype: int64

> # model

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score

> ## baseline

In [42]:
X = np.array(disaster_drop.drop('target', axis = 1))
y = np.array(disaster_drop.target)

In [43]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
x_train

array([[0.207, 0.536, 0.257, ..., 0.   , 0.   , 0.   ],
       [0.181, 0.819, 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 0.857, 0.143, ..., 0.   , 0.   , 0.   ],
       ...,
       [0.155, 0.845, 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 1.   , 0.   , ..., 0.   , 0.   , 0.   ],
       [0.   , 1.   , 0.   , ..., 0.   , 0.   , 0.   ]])

In [45]:
y_train

array([1, 0, 1, ..., 0, 1, 1])

> ### Creates RandomizedSearch/ GridSearch CV objects

In [213]:
## Defines Dictionaries for possible values to choose from
ada_boost_dict = {'n_estimators': np.linspace(50, 500, num=46, dtype = int), 
                  'learning_rate':np.linspace(0.05,1,num=20), 
                  'algorithm' : ['SAMME', 'SAMME.R']}

log_dict = {'penalty': ['l2','none'], 
            'fit_intercept': [True, False],
           'C': np.linspace(0,5,21)}

rf_dict = {'n_estimators': np.linspace(50, 1000, num=96, dtype = int),
          'criterion':['gini','entropy'],
          'min_samples_split':np.linspace(5,50,11, dtype = int),
          'max_features':['auto','sqrt','log2',None],
          'bootstrap':[True, False]}

In [214]:
adaboost = AdaBoostClassifier()
rf = RandomForestClassifier(n_jobs=-1)
log_reg = LogisticRegression(multi_class = 'ovr', max_iter = 1000)

In [215]:
ada_model = RandomizedSearchCV(adaboost, ada_boost_dict, 60, 
                               random_state = 42, cv = 3)
rf_model = RandomizedSearchCV(rf, rf_dict, 300, 
                               random_state = 42, cv = 3)
log_model = GridSearchCV(log_reg, log_dict, cv = 3)

> ### Finds best parameters for each model

In [216]:
ada_model.fit(x_train, y_train)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                                base_estimator=None,
                                                learning_rate=1.0,
                                                n_estimators=50,
                                                random_state=None),
                   iid='deprecated', n_iter=60, n_jobs=None,
                   param_distributions={'algorithm': ['SAMME', 'SAMME.R'],
                                        'learning_rate': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]),
                                        'n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
       180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300,
       310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430,
       440, 450, 46

In [217]:
rf_model.fit(x_train, y_train)

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [218]:
log_model.fit(x_train, y_train)

  args=(X, target, 1. / C, sample_weight),
  out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)
  grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

  args=(X, target, 1. / C, sample_weight),
  out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)
  grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

  args=(X, target, 1. / C, sample_weight),
  out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)
  grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  "Setting penalty='none' will ignore the C and l1_ratio "
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to 

GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='ovr',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  , 2.25, 2.5 ,
       2.75, 3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ]),
                         'fit_intercept': [True, False],
                         'penalty': ['l2', 'none']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=F

In [219]:
best_ada_params = ada_model.best_params_
best_log_params = log_model.best_params_
best_rf_params = rf_model.best_params_

> ### Determines the best model

In [220]:
best_log_reg = LogisticRegression(**best_log_params, multi_class = 'ovr', max_iter = 1000)
best_ada = AdaBoostClassifier(**best_ada_params)
best_rf = RandomForestClassifier(**best_rf_params, n_jobs=-1)

In [221]:
best_rf.fit(x_train, y_train)
best_log_reg.fit(x_train, y_train)
best_ada.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.6,
                   n_estimators=280, random_state=None)

In [222]:
y_pred_rf = best_rf.predict(x_test)
y_pred_ada = best_ada.predict(x_test)
y_pred_log_reg = best_log_reg.predict(x_test)

In [223]:
f1_score(y_pred_rf, y_test)

0.6352313167259785

In [332]:
sum(y_pred_rf==y_test)/len(y_test)

0.7307944845699278

In [224]:
f1_score(y_pred_ada, y_test)

0.6215722120658136

In [333]:
sum(y_pred_ada==y_test)/len(y_test)

0.7281680892974393

In [225]:
f1_score(y_pred_log_reg, y_test)

0.6272401433691756

In [334]:
sum(y_pred_log_reg==y_test)/len(y_test)

0.726854891661195

In [193]:
x1 = [encode_sentence(c_l, vocab2index, N=40, padding_start=False) for c_l in clean_list]

In [194]:
x, s = x1[0]

### dataset & model

In [88]:
class DisasterDataset(Dataset):
    def __init__(self, clean_list, y, padding_start=True, N=40):
        self.X1 = [encode_sentence(c_l, vocab2index, N=40, padding_start=False) for c_l in clean_list]
        #self.X2 = X
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x1,s = self.X1[idx]
        #x2 = self.X2[idx]
        return x1, s, self.y[idx]
    
train_ds = DisasterDataset(clean_list, y_train)
valid_ds = DisasterDataset(clean_list, y_val)

In [89]:
b_size = 100
train_dl = DataLoader(train_ds, shuffle=True, batch_size=b_size)
valid_dl = DataLoader(valid_ds, batch_size=b_size)

In [90]:
x1, s, y = next(iter(train_dl))

In [49]:
x1.shape

torch.Size([100, 40])

In [57]:
s.shape

torch.Size([100])

In [50]:
def update_optimizer(optimizer, lr):
    for i, param_group in enumerate(optimizer.param_groups):
        param_group["lr"] = lr

In [123]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(GRUModel, self).__init__() 
        self.hidden_dim = hidden_dim
        #self.numerical_dim = numerical_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # +numerical_dim
        #input_dim = numerical_dim + embedding_dim
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x1):
        #x1 = x1.view(len(x1),-1)
        #x = torch.cat((x1,x2.long()), dim=1)
        x = self.embeddings(x1)
        x = self.dropout(x)
        out, h = self.gru(x)
        return self.linear(h[-1])
    

class LSTMModel(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super(LSTMModel,self).__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        
    def forward(self, x, s):
        # sorting
        s, sort_index = torch.sort(s, 0,descending=True)
        s = s.numpy().tolist()
        x = x[sort_index]
        x = self.embeddings(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x, s, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = self.linear(ht[-1])
        return torch.zeros_like(out).scatter_(0, sort_index.unsqueeze(1).cuda(), out)

In [124]:
device = torch.device("cuda:6" if torch.cuda.is_available() else "cpu")

In [125]:
def train_epocs_gru(model, optimizer, train_dl, val_dl, epochs=10):
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x1, s, y in train_dl:
            x1 = x1.long().cuda()
            #x2 = x2.float().cuda()
            #s = s.float().cuda()
            y = y.float().cuda()
            y_pred = model(x1)
            optimizer.zero_grad()
            loss = F.binary_cross_entropy_with_logits(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc = val_metrics_gru(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % (sum_loss/total, val_loss, val_acc))

In [126]:
def val_metrics_gru(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    for x1, s, y in train_dl:
        x1 = x1.long().cuda()
        #x2 = x2.float().cuda()
        #s = s.float().cuda()
        y = y.float().unsqueeze(1).cuda()
        y_hat = model(x1)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [130]:
vocab_size = len(words)
print(vocab_size)

gru_model = GRUModel(vocab_size, 50,50).cuda()

parameters = filter(lambda p: p.requires_grad, gru_model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

6496


In [131]:
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.683 val loss 0.673 and val accuracy 0.587
train loss 0.567 val loss 0.455 and val accuracy 0.794
train loss 0.461 val loss 0.328 and val accuracy 0.860
train loss 0.408 val loss 0.292 and val accuracy 0.879
train loss 0.371 val loss 0.249 and val accuracy 0.894
train loss 0.351 val loss 0.234 and val accuracy 0.903


In [132]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.325 val loss 0.217 and val accuracy 0.910
train loss 0.312 val loss 0.209 and val accuracy 0.911
train loss 0.308 val loss 0.204 and val accuracy 0.913
train loss 0.323 val loss 0.200 and val accuracy 0.915
train loss 0.303 val loss 0.196 and val accuracy 0.917
train loss 0.290 val loss 0.193 and val accuracy 0.919


In [133]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.293 val loss 0.190 and val accuracy 0.918
train loss 0.299 val loss 0.189 and val accuracy 0.921
train loss 0.289 val loss 0.186 and val accuracy 0.920
train loss 0.289 val loss 0.184 and val accuracy 0.921
train loss 0.284 val loss 0.180 and val accuracy 0.924
train loss 0.279 val loss 0.179 and val accuracy 0.925


In [134]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.279 val loss 0.177 and val accuracy 0.924
train loss 0.283 val loss 0.175 and val accuracy 0.925
train loss 0.271 val loss 0.173 and val accuracy 0.926
train loss 0.271 val loss 0.170 and val accuracy 0.926
train loss 0.264 val loss 0.168 and val accuracy 0.925
train loss 0.262 val loss 0.165 and val accuracy 0.928


In [135]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.265 val loss 0.164 and val accuracy 0.929
train loss 0.258 val loss 0.162 and val accuracy 0.928
train loss 0.264 val loss 0.162 and val accuracy 0.929
train loss 0.259 val loss 0.160 and val accuracy 0.930
train loss 0.255 val loss 0.159 and val accuracy 0.929
train loss 0.259 val loss 0.158 and val accuracy 0.932


In [136]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.257 val loss 0.157 and val accuracy 0.933
train loss 0.247 val loss 0.155 and val accuracy 0.933
train loss 0.251 val loss 0.154 and val accuracy 0.933
train loss 0.242 val loss 0.152 and val accuracy 0.934
train loss 0.255 val loss 0.151 and val accuracy 0.933
train loss 0.244 val loss 0.149 and val accuracy 0.934


In [137]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.243 val loss 0.147 and val accuracy 0.935
train loss 0.243 val loss 0.145 and val accuracy 0.936
train loss 0.248 val loss 0.144 and val accuracy 0.937
train loss 0.243 val loss 0.142 and val accuracy 0.938
train loss 0.234 val loss 0.142 and val accuracy 0.938
train loss 0.239 val loss 0.141 and val accuracy 0.937


In [138]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.229 val loss 0.139 and val accuracy 0.939
train loss 0.232 val loss 0.138 and val accuracy 0.939
train loss 0.225 val loss 0.137 and val accuracy 0.941
train loss 0.223 val loss 0.137 and val accuracy 0.941
train loss 0.224 val loss 0.135 and val accuracy 0.941
train loss 0.218 val loss 0.134 and val accuracy 0.940


In [139]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.235 val loss 0.134 and val accuracy 0.941
train loss 0.223 val loss 0.132 and val accuracy 0.941
train loss 0.225 val loss 0.131 and val accuracy 0.942
train loss 0.218 val loss 0.130 and val accuracy 0.943
train loss 0.221 val loss 0.130 and val accuracy 0.944
train loss 0.219 val loss 0.129 and val accuracy 0.943


In [140]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.212 val loss 0.128 and val accuracy 0.944
train loss 0.219 val loss 0.128 and val accuracy 0.943
train loss 0.207 val loss 0.127 and val accuracy 0.944
train loss 0.212 val loss 0.127 and val accuracy 0.943
train loss 0.206 val loss 0.125 and val accuracy 0.944
train loss 0.212 val loss 0.125 and val accuracy 0.944


In [141]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.206 val loss 0.124 and val accuracy 0.944
train loss 0.207 val loss 0.123 and val accuracy 0.945
train loss 0.200 val loss 0.122 and val accuracy 0.944
train loss 0.202 val loss 0.123 and val accuracy 0.944
train loss 0.202 val loss 0.120 and val accuracy 0.945
train loss 0.197 val loss 0.119 and val accuracy 0.946


In [142]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.200 val loss 0.118 and val accuracy 0.946
train loss 0.198 val loss 0.118 and val accuracy 0.946
train loss 0.202 val loss 0.117 and val accuracy 0.946
train loss 0.198 val loss 0.117 and val accuracy 0.946
train loss 0.190 val loss 0.116 and val accuracy 0.946
train loss 0.197 val loss 0.115 and val accuracy 0.946


In [143]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.192 val loss 0.115 and val accuracy 0.945
train loss 0.196 val loss 0.115 and val accuracy 0.947
train loss 0.190 val loss 0.114 and val accuracy 0.947
train loss 0.190 val loss 0.113 and val accuracy 0.947
train loss 0.187 val loss 0.112 and val accuracy 0.947
train loss 0.186 val loss 0.112 and val accuracy 0.947


In [144]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.189 val loss 0.111 and val accuracy 0.947
train loss 0.195 val loss 0.111 and val accuracy 0.948
train loss 0.179 val loss 0.110 and val accuracy 0.947
train loss 0.186 val loss 0.110 and val accuracy 0.948
train loss 0.181 val loss 0.109 and val accuracy 0.948
train loss 0.184 val loss 0.110 and val accuracy 0.948


In [145]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.182 val loss 0.110 and val accuracy 0.947
train loss 0.185 val loss 0.110 and val accuracy 0.948
train loss 0.180 val loss 0.109 and val accuracy 0.948
train loss 0.177 val loss 0.109 and val accuracy 0.948
train loss 0.176 val loss 0.108 and val accuracy 0.948
train loss 0.169 val loss 0.107 and val accuracy 0.949


In [146]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.181 val loss 0.107 and val accuracy 0.949
train loss 0.176 val loss 0.106 and val accuracy 0.948
train loss 0.170 val loss 0.107 and val accuracy 0.948
train loss 0.172 val loss 0.107 and val accuracy 0.949
train loss 0.174 val loss 0.105 and val accuracy 0.949
train loss 0.174 val loss 0.104 and val accuracy 0.950


In [147]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.180 val loss 0.104 and val accuracy 0.949
train loss 0.176 val loss 0.104 and val accuracy 0.949
train loss 0.175 val loss 0.104 and val accuracy 0.950
train loss 0.170 val loss 0.104 and val accuracy 0.949
train loss 0.167 val loss 0.104 and val accuracy 0.950
train loss 0.164 val loss 0.104 and val accuracy 0.950


In [148]:
update_optimizer(optimizer, lr=0.001)
train_epocs_gru(gru_model, optimizer, train_dl, valid_dl, epochs=30)

train loss 0.166 val loss 0.104 and val accuracy 0.949
train loss 0.166 val loss 0.103 and val accuracy 0.949
train loss 0.164 val loss 0.102 and val accuracy 0.950
train loss 0.164 val loss 0.102 and val accuracy 0.950
train loss 0.164 val loss 0.101 and val accuracy 0.950
train loss 0.169 val loss 0.101 and val accuracy 0.950
