In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
import re
warnings.filterwarnings('ignore')

In [2]:
import nltk

In [3]:
dataset = pd.read_csv('./data/Kaggle/train.csv',delimiter=',',\
                           names=['id','keyword','location', 'text','target'])
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,id,keyword,location,text,target
1,1,,,Our Deeds are the Reason of this #earthquake M...,1
2,4,,,Forest fire near La Ronge Sask. Canada,1
3,5,,,All residents asked to 'shelter in place' are ...,1
4,6,,,"13,000 people receive #wildfires evacuation or...",1


In [4]:
# Drop Id, Keyword, Location
dataset = dataset.drop(labels=['id', 'keyword','location'], axis=1)
dataset.head()

Unnamed: 0,text,target
0,text,target
1,Our Deeds are the Reason of this #earthquake M...,1
2,Forest fire near La Ronge Sask. Canada,1
3,All residents asked to 'shelter in place' are ...,1
4,"13,000 people receive #wildfires evacuation or...",1


In [10]:
def clean(tweet):
    # Special characters
    tweet = re.sub(r"\x89Û_", "", tweet)
    tweet = re.sub(r"\x89ÛÒ", "", tweet)
    tweet = re.sub(r"\x89ÛÓ", "", tweet)
    tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
    tweet = re.sub(r"\x89ÛÏ", "", tweet)
    tweet = re.sub(r"China\x89Ûªs", "China's", tweet)
    tweet = re.sub(r"let\x89Ûªs", "let's", tweet)
    tweet = re.sub(r"\x89Û÷", "", tweet)
    tweet = re.sub(r"\x89Ûª", "", tweet)
    tweet = re.sub(r"\x89Û\x9d", "", tweet)
    tweet = re.sub(r"å_", "", tweet)
    tweet = re.sub(r"\x89Û¢", "", tweet)
    tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
    tweet = re.sub(r"fromåÊwounds", "from wounds", tweet)
    tweet = re.sub(r"åÊ", "", tweet)
    tweet = re.sub(r"åÈ", "", tweet)
    tweet = re.sub(r"JapÌ_n", "Japan", tweet)    
    tweet = re.sub(r"Ì©", "e", tweet)
    tweet = re.sub(r"å¨", "", tweet)
    tweet = re.sub(r"SuruÌ¤", "Suruc", tweet)
    tweet = re.sub(r"åÇ", "", tweet)
    tweet = re.sub(r"å£3million", "3 million", tweet)
    tweet = re.sub(r"åÀ", "", tweet)
     
    # Remove http
    tweet = re.sub(r"http[^\s]+","", tweet)
    
    # Remove @abc
    tweet = re.sub(r"@[^\s]+", "", tweet)
    
    return tweet

In [11]:
# Drop first row
dataset = dataset.drop(index=0)
# Clean data
dataset['text_cleaned'] = dataset['text'].apply(lambda s : clean(s))

In [12]:
dataset.head()

Unnamed: 0,text,target,text_cleaned
1,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
2,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
3,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
4,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
5,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


In [18]:
from nltk.tokenize import TweetTokenizer
tt = TweetTokenizer()
dataset['tokenized'] = dataset['text_cleaned'].apply(tt.tokenize)
dataset.head()

Unnamed: 0,text,target,text_cleaned,tokenized
1,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...,"[Our, Deeds, are, the, Reason, of, this, #eart..."
2,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada,"[Forest, fire, near, La, Ronge, Sask, ., Canada]"
3,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...,"[All, residents, asked, to, ', shelter, in, pl..."
4,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or...","[13,000, people, receive, #wildfires, evacuati..."
5,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, #Al..."


In [19]:
wordList = sum(dataset['tokenized'].values,[])
wordList

['Our',
 'Deeds',
 'are',
 'the',
 'Reason',
 'of',
 'this',
 '#earthquake',
 'May',
 'ALLAH',
 'Forgive',
 'us',
 'all',
 'Forest',
 'fire',
 'near',
 'La',
 'Ronge',
 'Sask',
 '.',
 'Canada',
 'All',
 'residents',
 'asked',
 'to',
 "'",
 'shelter',
 'in',
 'place',
 "'",
 'are',
 'being',
 'notified',
 'by',
 'officers',
 '.',
 'No',
 'other',
 'evacuation',
 'or',
 'shelter',
 'in',
 'place',
 'orders',
 'are',
 'expected',
 '13,000',
 'people',
 'receive',
 '#wildfires',
 'evacuation',
 'orders',
 'in',
 'California',
 'Just',
 'got',
 'sent',
 'this',
 'photo',
 'from',
 'Ruby',
 '#Alaska',
 'as',
 'smoke',
 'from',
 '#wildfires',
 'pours',
 'into',
 'a',
 'school',
 '#RockyFire',
 'Update',
 '=',
 '>',
 'California',
 'Hwy',
 '.',
 '20',
 'closed',
 'in',
 'both',
 'directions',
 'due',
 'to',
 'Lake',
 'County',
 'fire',
 '-',
 '#CAfire',
 '#wildfires',
 '#flood',
 '#disaster',
 'Heavy',
 'rain',
 'causes',
 'flash',
 'flooding',
 'of',
 'streets',
 'in',
 'Manitou',
 ',',
 'Col

In [20]:
freDist = nltk.FreqDist(wordList)

In [21]:
freDist

FreqDist({'.': 3286, 'the': 2599, '?': 2191, ':': 1934, 'a': 1883, 'to': 1816, 'in': 1781, 'of': 1736, "'": 1340, 'and': 1317, ...})

In [22]:
freDist.keys()



In [None]:
import transformers as ppb

In [None]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, never_split=wordList)
#model = model_class.from_pretrained(pretrained_weights)

In [None]:
tokenizer.encode("didn't")
