In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import re
warnings.filterwarnings('ignore')
from pattern.en import spelling

In [2]:
import re

In [3]:
def reduce_lengthening(text):
    pattern = re.compile(r"([a-zA-Z])\1{2,}")
    return pattern.sub(r"\1\1", text)

In [58]:
def remove_extra_spaces(text):
    pattern = re.compile(r"(\s)\1{2,}")
    return pattern.sub(r"\1", text)

In [66]:
text = "       asdrfr    "
print(remove_extra_spaces(text))

 asdrfr 


In [118]:
from pattern.en import suggest
sentence = "The score has gone finalllllll"
word = "loool"
word_wlf = reduce_lengthening(word) #calling function defined above
print(word_wlf) #word lengthening isn't being able to fix it completely

correct_word = suggest(word_wlf) 
print(correct_word)

lool
[('look', 0.7941176470588235), ('fool', 0.07282913165266107), ('wool', 0.058823529411764705), ('pool', 0.036414565826330535), ('cool', 0.015406162464985995), ('tool', 0.00980392156862745), ('loop', 0.0056022408963585435), ('loot', 0.004201680672268907), ('loom', 0.0028011204481792717)]


In [4]:
import spacy

def lemmatize(tweet, nlp):
    doc = nlp(tweet)
    newSentence = " ".join([token.lemma_ for token in doc])
    return(newSentence)

In [5]:
def spellCheck(tweet):
    tokenz = word_tokenize(tweet)
    newSentence = ""
    for token in tokenz:
        correct_word = suggest(token)
        newWord = correct_word[0][0]
        newSentence += newWord + " "
    newSentence = newSentence[:-1]
    return(newSentence)

In [124]:
import wordninja
def splitWords(tweet):
    doc = nlp(tweet)
    newSentence = " ".join([" ".join(wordninja.split(str(token))) for token in doc])
    return(newSentence)

In [125]:
import nltk

In [2]:
dataset = pd.read_csv('./data/Kaggle/train.csv',delimiter=',',\
                           names=['id','keyword','location', 'text','target'])
dataset = dataset.drop(0)
dataset.head()

Unnamed: 0,id,keyword,location,text,target
1,1,,,Our Deeds are the Reason of this #earthquake M...,1
2,4,,,Forest fire near La Ronge Sask. Canada,1
3,5,,,All residents asked to 'shelter in place' are ...,1
4,6,,,"13,000 people receive #wildfires evacuation or...",1
5,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [127]:
# Drop Id, Keyword, Location
dataset = dataset.drop(labels=['id', 'keyword','location'], axis=1)
dataset.head()

Unnamed: 0,text,target
0,text,target
1,Our Deeds are the Reason of this #earthquake M...,1
2,Forest fire near La Ronge Sask. Canada,1
3,All residents asked to 'shelter in place' are ...,1
4,"13,000 people receive #wildfires evacuation or...",1


In [128]:
import string
#from string import maketrans
def clean(tweet,nlp):
    # Special characters
    tweet = re.sub(r"\x89Û_", "", tweet)
    tweet = re.sub(r"\x89ÛÒ", "", tweet)
    tweet = re.sub(r"\x89ÛÓ", "", tweet)
    tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
    tweet = re.sub(r"\x89ÛÏ", "", tweet)
    tweet = re.sub(r"China\x89Ûªs", "China's", tweet)
    tweet = re.sub(r"let\x89Ûªs", "let's", tweet)
    tweet = re.sub(r"\x89Û÷", "", tweet)
    tweet = re.sub(r"\x89Ûª", "", tweet)
    tweet = re.sub(r"\x89Û\x9d", "", tweet)
    tweet = re.sub(r"å_", "", tweet)
    tweet = re.sub(r"\x89Û¢", "", tweet)
    tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
    tweet = re.sub(r"fromåÊwounds", "from wounds", tweet)
    tweet = re.sub(r"åÊ", "", tweet)
    tweet = re.sub(r"åÈ", "", tweet)
    tweet = re.sub(r"JapÌ_n", "Japan", tweet)    
    tweet = re.sub(r"Ì©", "e", tweet)
    tweet = re.sub(r"å¨", "", tweet)
    tweet = re.sub(r"SuruÌ¤", "Suruc", tweet)
    tweet = re.sub(r"åÇ", "", tweet)
    tweet = re.sub(r"å£3million", "3 million", tweet)
    tweet = re.sub(r"åÀ", "", tweet)
    tweet = re.sub(r"amp", "and", tweet)
    tweet = re.sub(r"\n", "", tweet)
    tweet = re.sub(r"\r", "", tweet)
    tweet = tweet.lower()
    tweet = reduce_lengthening(tweet)
    
    tweet = re.sub(r"x\d+", "", tweet) 
    tweet = re.sub(r"\d", "", tweet) 
    tweet = re.sub(r"\u0089ã¢", "", tweet)
    tweet = re.sub(r"\s{2,}", " ", tweet)
    # Remove http
    tweet = re.sub(r"http[^\s]+","", tweet)
    tweet = re.sub(r"http","", tweet)
    tweet = re.sub(r"youtube","", tweet)
    # Remove @abc
    tweet = re.sub(r"@[^\s]+", "", tweet)
    tweet = tweet.translate(str.maketrans('','',string.punctuation))
    tweet = lemmatize(tweet,nlp)
    tweet = remove_extra_spaces(tweet)
    tweet = re.sub(r"^\s+","", tweet)
    tweet = splitWords(tweet)
    
    return tweet

In [129]:
# Drop first row
dataset = dataset.drop(index=0)
# Clean data
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])
dataset['text_cleaned'] = dataset['text'].apply(lambda s : clean(s,nlp))

In [130]:
dataset.head()

Unnamed: 0,text,target,text_cleaned
1,Our Deeds are the Reason of this #earthquake M...,1,PRON deed be the reason of this earthquake may...
2,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong e s ask canada
3,All residents asked to 'shelter in place' are ...,1,all resident ask to shelter in place be be not...
4,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order in ca...
5,Just got sent this photo from Ruby #Alaska as ...,1,just get send this photo from ruby alaska as s...


In [131]:
dataset['text_cleaned'] = dataset['text_cleaned'].drop_duplicates()
dataset['text_cleaned'].replace('', np.nan, inplace=True)
dataset.dropna(subset=['text_cleaned'], inplace=True)

In [132]:
dataset.head()

Unnamed: 0,text,target,text_cleaned
1,Our Deeds are the Reason of this #earthquake M...,1,PRON deed be the reason of this earthquake may...
2,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong e s ask canada
3,All residents asked to 'shelter in place' are ...,1,all resident ask to shelter in place be be not...
4,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order in ca...
5,Just got sent this photo from Ruby #Alaska as ...,1,just get send this photo from ruby alaska as s...


In [133]:
dataset['text_cleaned'].to_csv("dataset_cleaned2.csv")

In [None]:
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
dataset['tokenized'] = dataset['text_cleaned'].apply(tt.tokenize)
dataset.head()

In [None]:
wordList = sum(dataset['tokenized'].values,[])
wordList

In [None]:
freDist = nltk.FreqDist(wordList)

In [None]:
freDist

In [None]:
freDist.keys()

### Try Tokenized BERT Embeding and Create Model

In [3]:
def pad_token_list(sample):
    # Find the sentence with the max length
    max_len = 0
    for token_list in sample:
        if len(token_list) > max_len:
            max_len = len(token_list)
    # Adjust every sentence to the same length
    padded = np.array([token_list + [0]*(max_len-len(token_list)) for token_list in sample])
    return padded, max_len

In [4]:
def get_embeddings_from_sample(sample, model):
    # Pad sample data:
#     sample = pad_token_list(sample)
    # Define mask from data: - 0 token entry     -> padding, set mask entry to 0
    #                        - non-0 token entry -> valid word, set mask entry to 1
    mask = np.where(sample != 0, 1, 0)
    
    # Create tensor objects from numpy arrays
    input_ids = torch.tensor(sample).long()
    attention_mask = torch.tensor(mask).long()

    # Use BERT model to get embeddings
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
    # Extract [CLS] embedding for each sample as numpy array to be used for classification task
    features = last_hidden_states[0][:,0,:].numpy()
    return features, mask

In [5]:
clean_dataset = pd.read_csv('dataset_cleaned2.csv',header=None)

In [6]:
clean_dataset.head()
#clean_dataset['target'] = dataset.loc()

Unnamed: 0,0,1
0,1,PRON deed be the reason of this earthquake may...
1,2,forest fire near la rong e s ask canada
2,3,all resident ask to shelter in place be be not...
3,4,people receive wildfire evacuation order in ca...
4,5,just get send this photo from ruby alaska as s...


In [20]:
dataset.loc[1]

id                                                          1
keyword                                                   NaN
location                                                  NaN
text        Our Deeds are the Reason of this #earthquake M...
target                                                      1
Name: 1, dtype: object

In [7]:
clean_dataset['target'] = dataset['target'].loc[clean_dataset[0]].tolist()

In [8]:
dataset = clean_dataset
dataset.columns = ['ID','text_cleaned','target']
#clean_dataset[0]
dataset.head()

Unnamed: 0,ID,text_cleaned,target
0,1,PRON deed be the reason of this earthquake may...,1
1,2,forest fire near la rong e s ask canada,1
2,3,all resident ask to shelter in place be be not...,1
3,4,people receive wildfire evacuation order in ca...,1
4,5,just get send this photo from ruby alaska as s...,1


In [9]:
sample_size = 4000
random_sample = dataset.sample(n=sample_size)
random_sample.shape

(4000, 3)

In [10]:
val_set = dataset.loc[dataset.index.difference(random_sample.index)]
val_set.shape

(2835, 3)

In [11]:
val_sample_size = 1500
val_random_sample = val_set.sample(n=val_sample_size)
val_random_sample.shape
test_set = val_set.loc[val_set.index.difference(val_random_sample.index)]
test_set.shape

(1335, 3)

In [12]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [13]:
sample_tokenized = random_sample['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
#sample_tokenized2 = random_sample2['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
val_random_sample_tokenized = val_random_sample['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
#test_tokenized = test_set['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [14]:
sample_padded, sample_len = pad_token_list(sample_tokenized.values)
val_padded, val_len = pad_token_list(val_random_sample_tokenized.values)
#sample_padded2, sample_len2 = pad_token_list(sample_tokenized2.values)
#test_padded, test_len = pad_token_list(test_tokenized.values)

In [15]:
sample_features, mask = get_embeddings_from_sample(sample_padded, model)
val_features, mask = get_embeddings_from_sample(val_padded, model)

In [16]:
sample_features.shape

(4000, 768)

In [17]:
# Create cuda device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [46]:
train_features_tensor = torch.tensor(np.asarray(sample_features))
train_features_tensor = train_features_tensor.to(device)
train_labels_tensor =  torch.FloatTensor(np.asarray(random_sample['target']).astype(np.float))
train_labels_tensor = train_labels_tensor.to(device)

val_features_tensor = torch.tensor(np.asarray(val_features))
val_features_tensor = val_features_tensor.to(device)
val_labels_tensor =  torch.FloatTensor(np.asarray(val_random_sample['target']).astype(np.float))
val_labels_tensor = val_labels_tensor.to(device)

In [19]:
# Define neural network class to be trained
# Structure:
# input -> fc1 -> sigmoid -> out -> log_softmax
import torch.nn as nn
import torch.nn.functional as F
class Shallow_Network(nn.Module):
    def __init__(self):
        super(Shallow_Network,self).__init__()
        self.fc1 = nn.Linear(768,1000)
        self.out = nn.Linear(1000,2)
    def forward(self,input):
        # Take input, feed through fc1 layer,
        # then apply activation function to it
        x = F.sigmoid(self.fc1(input))
        # Take output of sigmoid, input into out layer,
        # and apply log_softmax function
        return (F.log_softmax(self.out(x),dim=1))

In [50]:
class Medium_Network(nn.Module):
    def __init__(self):
        super(Medium_Network,self).__init__()
        self.fc1 = nn.Linear(768,1000)
        self.fc2 = nn.Linear(1000,5000)
        self.fc3 = nn.Linear(5000,1000)
        self.out = nn.Linear(1000,1)
    def forward(self,input):
        # Take input, feed through fc1 layer,
        # then apply activation function to it
        x = F.relu(self.fc1(input))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.sigmoid(self.out(x))
        return(x)
        # Take output of sigmoid, input into out layer,
        # and apply log_softmax function
        #return (F.log_softmax(self.out(x),dim=1))

In [84]:
# Create neural network object
#net = Shallow_Network()
net = Medium_Network()
net = net.to(device)

In [85]:
import torch.optim as optim
#Create an stochastic gradient descent optimizer
adam = optim.Adam(net.parameters(), lr=0.001)
#loss_func = nn.NLLLoss()
loss_func = nn.BCELoss()
loss_func = loss_func.to(device)

In [83]:
def accuracy(net,features,labels):
    # Get classification probabilities from hidden state array
    # And apply Softmax
    with torch.no_grad():
        probs = net(features)
        #softprobs = F.softmax(probs)
    # Get most likely class and its index for each sample point
    #values, indices = torch.max(softprobs,1)
    values = torch.round(probs)
    # Calculate number of sample points where prediction failed
    #nums = torch.sum(torch.abs(labels-indices)).detach().cpu().numpy()
    nums = torch.sum(torch.abs(torch.t(torch.round(probs)) - labels))
    #nums = torch.sum(torch.abs(labels-values)).detach().cpu().numpy()
    # Number of correct predictions
    numcorrect = len(labels)-(nums+0)
    # Accuracy of prediction
    accuracy = numcorrect/len(labels)
    return(accuracy)

In [86]:
# Train network
cnt = 0
average_losses = []
average_val_losses = []
acc = []
cur_loss = []
min_validation = 10000.0
min_val_epoch = 0
for epoch in range(400):
    net.train()
    #zero the gradient
    adam.zero_grad()
    #Get output of network
    probs = net(train_features_tensor)
    #compute loss
    loss = loss_func(probs,train_labels_tensor)
    #compute the backward gradient and move network in that direction
    loss.backward()
    adam.step()
    #gather loss
    cur_loss.append(loss.detach().cpu().numpy())
    print("epoch ",epoch)
    print("training loss: ", np.mean(cur_loss))
    net.eval()
    probs_val = net(val_features_tensor)
    loss_val = loss_func(probs_val,val_labels_tensor)
    print("validation loss: ", np.mean(loss_val.detach().cpu().numpy()))
    print("validation accuracy: ", accuracy(net,val_features_tensor,val_labels_tensor))
    #Save model if validation is min
    if min_validation > np.mean(loss_val.detach().cpu().numpy()):
        min_validation = np.mean(loss_val.detach().cpu().numpy())
        min_val_epoch = epoch
        torch.save(net.state_dict(), './net_parameters_kaggle.pth')
    

epoch  0
training loss:  0.69556385
validation loss:  0.79055023
validation accuracy:  tensor(0.5940, device='cuda:0')
epoch  1
training loss:  0.74694085
validation loss:  0.73043406
validation accuracy:  tensor(0.4060, device='cuda:0')
epoch  2
training loss:  0.74040556
validation loss:  0.70251614
validation accuracy:  tensor(0.4060, device='cuda:0')
epoch  3
training loss:  0.73011845
validation loss:  0.66950536
validation accuracy:  tensor(0.4080, device='cuda:0')
epoch  4
training loss:  0.7170331
validation loss:  0.6426097
validation accuracy:  tensor(0.6567, device='cuda:0')
epoch  5
training loss:  0.70324403
validation loss:  0.6124698
validation accuracy:  tensor(0.7280, device='cuda:0')
epoch  6
training loss:  0.68877107
validation loss:  0.58187103
validation accuracy:  tensor(0.7633, device='cuda:0')
epoch  7
training loss:  0.6741551
validation loss:  0.5540282
validation accuracy:  tensor(0.7747, device='cuda:0')
epoch  8
training loss:  0.6596357
validation loss:  

validation loss:  0.49185696
validation accuracy:  tensor(0.7927, device='cuda:0')
epoch  71
training loss:  0.42536628
validation loss:  0.4850141
validation accuracy:  tensor(0.8000, device='cuda:0')
epoch  72
training loss:  0.4230314
validation loss:  0.5238317
validation accuracy:  tensor(0.7787, device='cuda:0')
epoch  73
training loss:  0.42104468
validation loss:  0.5232992
validation accuracy:  tensor(0.7893, device='cuda:0')
epoch  74
training loss:  0.4191619
validation loss:  0.4994506
validation accuracy:  tensor(0.7933, device='cuda:0')
epoch  75
training loss:  0.41676024
validation loss:  0.52637297
validation accuracy:  tensor(0.7860, device='cuda:0')
epoch  76
training loss:  0.41459396
validation loss:  0.5415434
validation accuracy:  tensor(0.7860, device='cuda:0')
epoch  77
training loss:  0.41278946
validation loss:  0.5178068
validation accuracy:  tensor(0.7880, device='cuda:0')
epoch  78
training loss:  0.41039214
validation loss:  0.5357167
validation accuracy:

validation loss:  1.2629489
validation accuracy:  tensor(0.7693, device='cuda:0')
epoch  143
training loss:  0.27405432
validation loss:  1.272558
validation accuracy:  tensor(0.7633, device='cuda:0')
epoch  144
training loss:  0.27223447
validation loss:  1.3146169
validation accuracy:  tensor(0.7540, device='cuda:0')
epoch  145
training loss:  0.2704475
validation loss:  1.3227218
validation accuracy:  tensor(0.7587, device='cuda:0')
epoch  146
training loss:  0.26867032
validation loss:  1.332713
validation accuracy:  tensor(0.7693, device='cuda:0')
epoch  147
training loss:  0.26691562
validation loss:  1.350672
validation accuracy:  tensor(0.7680, device='cuda:0')
epoch  148
training loss:  0.265184
validation loss:  1.3721251
validation accuracy:  tensor(0.7667, device='cuda:0')
epoch  149
training loss:  0.26346436
validation loss:  1.3984272
validation accuracy:  tensor(0.7540, device='cuda:0')
epoch  150
training loss:  0.26176956
validation loss:  1.4105804
validation accurac

validation loss:  2.0003586
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  215
training loss:  0.18357198
validation loss:  2.0206406
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  216
training loss:  0.18273023
validation loss:  2.0259652
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  217
training loss:  0.18189615
validation loss:  2.03091
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  218
training loss:  0.18106967
validation loss:  2.0353503
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  219
training loss:  0.1802507
validation loss:  2.0407476
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  220
training loss:  0.17943911
validation loss:  2.046001
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  221
training loss:  0.1786348
validation loss:  2.0503793
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  222
training loss:  0.17783768
validation loss:  2.0551448
validation accura

epoch  286
training loss:  0.13835719
validation loss:  2.3343344
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  287
training loss:  0.13787939
validation loss:  2.3393922
validation accuracy:  tensor(0.7587, device='cuda:0')
epoch  288
training loss:  0.1374049
validation loss:  2.3465195
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  289
training loss:  0.1369337
validation loss:  2.3441365
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  290
training loss:  0.13646571
validation loss:  2.344211
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  291
training loss:  0.1360009
validation loss:  2.3473094
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  292
training loss:  0.13553928
validation loss:  2.350305
validation accuracy:  tensor(0.7607, device='cuda:0')
epoch  293
training loss:  0.13508078
validation loss:  2.3583944
validation accuracy:  tensor(0.7620, device='cuda:0')
epoch  294
training loss:  0.1346254
validati

validation loss:  2.4821227
validation accuracy:  tensor(0.7607, device='cuda:0')
epoch  356
training loss:  0.111373685
validation loss:  2.4803555
validation accuracy:  tensor(0.7620, device='cuda:0')
epoch  357
training loss:  0.111064665
validation loss:  2.485421
validation accuracy:  tensor(0.7600, device='cuda:0')
epoch  358
training loss:  0.110757336
validation loss:  2.4817173
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  359
training loss:  0.11045169
validation loss:  2.4884164
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  360
training loss:  0.11014772
validation loss:  2.4833505
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  361
training loss:  0.10984544
validation loss:  2.491598
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  362
training loss:  0.10954481
validation loss:  2.4926517
validation accuracy:  tensor(0.7613, device='cuda:0')
epoch  363
training loss:  0.10924584
validation loss:  2.494527
validation a

In [80]:
torch.t(torch.round(probs[0:5]))

tensor([[0., 1., 0., 1., 1.]], device='cuda:0', grad_fn=<TBackward>)

In [81]:
val_labels_tensor[0:5]

tensor([1., 0., 0., 0., 0.], device='cuda:0')

In [82]:
torch.sum(torch.abs(torch.t(torch.round(probs[0:5])) - val_labels_tensor[0:5]))

tensor(4., device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
min_val_epoch

In [None]:
net = Shallow_Network()
checkpoint = torch.load('./net_parameters_kaggle.pth')
net.load_state_dict(checkpoint)
net = net.to(device)
net.eval()

In [None]:
probs_val = net(val_features_tensor)
loss_val = loss_func(probs_val,val_labels_tensor)
print("validation loss: ", np.mean(loss_val.detach().cpu().numpy()))

In [None]:
print(accuracy(net,val_features_tensor,val_labels_tensor))

In [None]:
test_set = val_set.loc[val_set.index.difference(val_random_sample.index)]
test_random_sample_tokenized = test_set['text_cleaned'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
test_padded, test_len = pad_token_list(test_random_sample_tokenized.values)
test_features, mask = get_embeddings_from_sample(test_padded, model)
test_features_tensor = torch.tensor(np.asarray(test_features))
test_features_tensor = test_features_tensor.to(device)
test_labels_tensor =  torch.tensor(np.asarray(test_set['target']).astype(np.int))
test_labels_tensor = test_labels_tensor.to(device)

In [None]:
print(accuracy(net,test_features_tensor,test_labels_tensor))

Baseline only html tag and special character cleaning yielded 81.8% 

In [29]:
dataset.shape

(7613, 3)

In [30]:
dataset = dataset.drop_duplicates(subset='text_cleaned')

In [33]:
dataset.shape

(6957, 3)

In [34]:
dataset.head()

Unnamed: 0,text,target,text_cleaned
1,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
2,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
3,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
4,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
5,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


In [None]:
ã¢
&gt;
&amp;