In [1]:
import re
from functools import partial
from collections import Counter
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import numpy as np

def removeUnicode(text):
	#Removes unicode strings like "\u002c" and "x96"
	text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)       
	text = re.sub(r'[^\x00-\x7f]',r'',text)
	return text

def replaceURL(text):
	#Replaces url address with "url" 
	text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','<url>',text)
	text = re.sub(r'#([^\s]+)', r'\1', text)
	return text

def replaceAtUser(text):
	#Replaces "@user" with "atUser"
	text = re.sub('@[^\s]+','<user>',text)
	return text

def removeHashtag(text):
	#Removes hastag in front of a word
	text = re.sub(r'#([^\s]+)', r'\1', text)
	return text

def removeNumbers(text):
	#Removes integers
	text = ''.join([i for i in text if not i.isdigit()])         
	return text

def replaceMulExcl(text):
	#Replaces repetitions of exlamation marks
	text = re.sub(r"(\!)\1+", '!', text)
	return text

def replaceMulQues(text):
	#Replaces repetitions of question marks
	text = re.sub(r"(\?)\1+", '?', text)
	return text

def replaceMulStop(text):
	#Replaces repetitions of stop marks
	text = re.sub(r"(\.)\1+", '.', text)
	return text

def countMulExcl(text):
	#count repetitions of exlamation marks
	return len(re.findall(r"(\!)\1+", text))

def countMulQues(text):
	#Count repetitions of question marks
	return len(re.findall(r"(\?)\1+", text))

def countMulStop(text):
	#Count repetitions of stop marks
	return len(re.findall(r"(\.)\1+", text))

def countElongated(text):
	#count of how many words are elongated
	regex = re.compile(r"(.)\1{2}")
	return len([word for word in text.split() if regex.search(word)])

def countAllCaps(text):
	#count of how many words are all caps
	return len(re.findall("[A-Z0-9]{3,}", text))

#Creates a dictionary with slangs and their equivalents and replaces them
with open('slang.txt') as file:
	slang_map = dict(map(str.strip, line.partition('\t')[::2])
	for line in file if line.strip())

slang_words = sorted(slang_map, key=len, reverse=True)
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

#punctuation list for replacing

puncts = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '$', '&', '/', '[', ']', '%', '=', '*', '+', '\\', '•',  '~', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def removePuncts(x):
	x = str(x)
	for punct in puncts:
		if punct in x:
			x = x.replace(punct, f' ')
	return x



def countSlang(text):
	# counts how many slang words and a list of found slangs
	slangCounter = 0
	slangsFound = []
	tokens = nltk.word_tokenize(text)
	for word in tokens:
		if word in slang_words:
			slangsFound.append(word)
			slangCounter += 1
	return slangCounter, slangsFound

#Replaces contractions from a string to their equivalents
contraction_patterns = [ (r'I\'m', 'I am'),(r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
						 (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
def replaceContraction(text):
	patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
	for (pattern, repl) in patterns:
		(text, count) = re.subn(pattern, repl, text)
	return text

def replaceElongated(word):
	#Replaces an elongated word with its basic form

	repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
	repl = r'\1\2\3'
	if wordnet.synsets(word):
		return word
	repl_word = repeat_regexp.sub(repl, word)
	if repl_word != word:      
		return replaceElongated(repl_word)
	else:       
		return repl_word

def removeEmoticons(text):
	#Removes emoticons from text 
	text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', text)
	return text

def countEmoticons(text):
	#Input: a text, Output: how many emoticons
	return len(re.findall(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', text))


### Spell Correction begin ###
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('spell_correction.txt').read()))

def P(word, N=sum(WORDS.values())): 
	#P robability of `word`.
	return WORDS[word] / N

def spellCorrection(word): 
	#Most probable spelling correction for word.
	return max(candidates(word), key=P)

def candidates(word): 
	#Generate possible spelling corrections for word.
	return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
	#The subset of `words` that appear in the dictionary of WORDS.
	return set(w for w in words if w in WORDS)

def edits1(word):
	#All edits that are one edit away from `word`.
	letters    = 'abcdefghijklmnopqrstuvwxyz'
	splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
	deletes    = [L + R[1:]               for L, R in splits if R]
	transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
	replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
	inserts    = [L + c + R               for L, R in splits for c in letters]
	return set(deletes + transposes + replaces + inserts)

def edits2(word): 
	#All edits that are two edits away from `word`.
	return (e2 for e1 in edits1(word) for e2 in edits1(e1))

### Spell Correction End ###

In [2]:
import pandas as pd
import numpy as np
import string
from time import time
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 


def k_prep(inputText = 'null'):
	'''
	c = input("\n1.Enter or 2.default or 3.df ? :")

	if c == "2":
		text = "AFRICA,#AFRICANBAZE: asap goaaaaal @Breaking !!!! news:Nigeria oooooooh :-D aren't flag???? wont set ablaze..... 12000 in America. http://t.co/2nndBGwyEi,1"
		#print(text)
	
	elif c == "3" :
		text = inputText

	else:
		text = input("\nEnter the tweet: ")
	'''

	text = inputText
	#print("\nReplacing url from tweet\n")
	text = replaceURL(text)
	#print(text)

	#print("\nReplacing atuser from tweet\n")
	text = replaceAtUser(text)
	#print(text)

	#print("\nremoving hashtag in tweet\n")
	text = removeHashtag(text)
	#print(text)

	#print("\nreplace at user in tweet\n")
	text = replaceAtUser(text)
	#print(text)

	'''
	#print("\nremoving stopwords\n")
	#nltk.download('stopwords')
	from nltk.corpus import stopwords
	stop = set(stopwords.words('english'))

	d=[]
	d.append([x for x in text.split() if x not in stop])
	d = d[0]
	text = ' '.join(d)
	#print(text)
	'''

	#print("\nremove numbers from tweet\n")
	text = removeNumbers(text)
	#print(text)

	#print("\nremove emoticons from tweet\n")
	text = removeEmoticons(text)
	#print(text)

	#couting multple punctuations
	#print("\ncounting multiple punctuations\n")
	MultiExclMarks = 0
	MultiQuesMarks = 0
	MultiStopMarks = 0

	MultiExclMarks += countMulExcl(text)
	MultiQuesMarks += countMulQues(text)
	MultiStopMarks += countMulStop(text)

	#print(MultiExclMarks,MultiQuesMarks,MultiStopMarks)

	#print("\nremove multiexclamations from tweet\n")
	text = replaceMulExcl(text)
	#print(text)

	#print("\nremove multiquestionmarks from tweet\n")
	text = replaceMulQues(text)
	#print(text)

	#print("\nremove multistopmarks from tweet\n")
	text = replaceMulStop(text)
	#print(text)


	#print("\nshortening elongated words\n")
	totalElongated = 0
	totalElongated += countElongated(text)
	#print(totalElongated)

	regex1 = re.compile(r"(.)\1{2}")
	l=[]
	for word in text.split():
		if(regex1.search(word)):
			new_word = replaceElongated(word)
			##print(new_word)
			l.append(new_word)
		else:
			l.append(word)
	text = ' '.join(l)
	#print(text)

	#print("\nRemoving punctuations except ?!\n")
	text = removePuncts(text)
	#print(text)
	
	#print("\nexpanding slangs in tweet\n")
	text = replaceSlang(text)
	#print(text)

	#print("\nreplace contractions in tweet\n")
	text = replaceContraction(text)
	#print(text)
	
	#print(\nTokenizing the text\n")
	text = word_tokenize(text)
	
	#print("\nLemmatizing the text\n")
	lemma = WordNetLemmatizer()
	
	list1 = []
	for txt in text:
		list1.append(lemma.lemmatize(txt))
		
	return list1

In [3]:
import pandas as pd
import gensim

df = pd.read_csv("../dataset/train.csv")
df['text'] = df['text'].apply(k_prep)
print(df['text'])

0       [Our, Deeds, are, the, Reason, of, this, earth...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, resident, asked, to, shelter, in, place,...
3       [people, receive, wildfire, evacuation, order,...
4       [Just, got, sent, this, photo, from, Ruby, Ala...
                              ...                        
7608    [Two, giant, crane, holding, a, bridge, collap...
7609    [<, user, >, <, user, >, The, out, of, control...
7610    [M, UTC, ?, km, S, of, Volcano, Hawaii, <, url...
7611    [Police, investigating, after, an, e, bike, co...
7612    [The, Latest, More, Homes, Razed, by, Northern...
Name: text, Length: 7613, dtype: object


In [4]:
# importing the w2c model
# use this or below one
w2c = gensim.models.KeyedVectors.load_word2vec_format('../../../temp/GoogleNews-vectors-negative300.bin', binary=True)
w2c['go']

array([-2.63671875e-02,  6.83593750e-02, -3.11279297e-02,  2.19726562e-01,
        3.41796875e-03, -9.03320312e-03,  1.07910156e-01, -1.74804688e-01,
        7.71484375e-02,  3.83377075e-04, -1.02539062e-01, -1.73339844e-02,
       -3.08837891e-02,  5.76171875e-02, -1.09863281e-01,  6.10351562e-02,
        2.48046875e-01,  5.46264648e-03,  3.49121094e-02,  7.65991211e-03,
       -1.07910156e-01,  2.16796875e-01,  1.26953125e-01,  1.46484375e-01,
        1.55273438e-01,  4.46777344e-02,  7.51953125e-02, -1.45507812e-01,
       -7.71484375e-02, -8.54492188e-02, -1.15966797e-02,  7.95898438e-02,
       -1.94335938e-01, -2.57812500e-01, -9.86328125e-02, -1.38671875e-01,
       -4.13894653e-04, -9.03320312e-02,  7.66601562e-02,  1.33789062e-01,
        5.17578125e-02, -4.80957031e-02,  1.88476562e-01, -3.44238281e-02,
       -7.95898438e-02,  3.54003906e-02, -1.02539062e-01, -1.33789062e-01,
       -1.06445312e-01,  7.42187500e-02, -2.46582031e-02,  1.99218750e-01,
        1.54296875e-01, -

In [6]:
def create_list(n):
    import random
    randomlist = []
    for i in range(0,n):
        n = random.uniform(-2,2)
        randomlist.append(n)
    return randomlist

In [7]:
def create_number(num):
    mylist = []
    for i in range(1,num):
        mylist.append(str(i))
    return mylist

In [9]:
new_df = pd.DataFrame()
final_df = []
missed = []
l = []
for index, row in df.iterrows():
    string1 = row['text']
    #l.append(string1)
    try:
        result = document_vector(w2c, string1)
        #print(result)
        l.append(list(result))
        #print(l)
        #final_df.append(l)
        #a_series = pd.Series(l)
        #print(l)

        #new_df = new_df.append(a_series,ignore_index=True)
    except Exception as e:
        list1 = [
            0.69671315,
            0.049782764,
            -0.24523668,
            -0.15872465,
            -0.0665417,
            0.20241983,
            0.077576466,
            1.9189811,
            -0.006817713,
            0.06951927,
            -0.4152605,
            -0.89838743,
            -3.7327635,
            -0.049629666,
            -0.7617386,
            -0.5561758,
            -0.9451503,
            0.035365578,
            -1.0393411,
            0.2922259,
            -0.16664228,
            -0.46666014,
            0.35039356,
            0.40681368,
            0.38142973,
            ]
        l.append(create_list(300))
        #missed.append(str(index) + ' ')
        #print(e)
        #final_df.append(l)
        #a_series = pd.Series(l)
        #new_df.append(a_series,ignore_index=True)
print(l[1])

[-1.9955629698771151, -0.059465496674222074, -1.3772573929282714, -0.9162817377290904, -0.5864074138268309, -1.12098597283791, -1.2964507428226644, -1.3732430778289912, 1.7004484486710552, 0.8349857660118722, -1.3973533236125109, 0.6151574203525043, -1.259377136370777, 0.5291990060541827, 0.010753485961588538, 1.6160342961287024, -0.4785420144794177, -0.2870644099209043, 1.745227265232122, -1.3230691306694249, 1.7002480931522963, -1.8504092756603416, 0.6915806789962904, -0.15741799440991988, 1.3166136825602153, -1.1252358108935732, 1.331560599641505, -0.14016120329955806, 0.169990806062982, 0.7653422985663658, -0.9844677153823937, 0.9378378044277453, -0.34353792052422616, 0.6325179713889857, -0.184895178231403, 0.04733398706050407, 0.8409083473158123, 0.7983340560685783, -1.4593710156864188, 0.7819109577438361, -0.28920044958976643, 1.8792492543951322, 1.4950140703224748, 1.7315471773943782, 0.24437031454187297, 0.3363784595429, -0.6742082507145746, 0.9422865473545983, -1.9873643809929

In [31]:
l = np.array(l)
print(l)
print(type(l))

[[-0.21765376 -0.93583841  1.04988693 ... -1.8940708   0.51550383
  -1.55306646]
 [-1.99556297 -0.0594655  -1.37725739 ...  1.6024647  -1.89480814
   1.53767603]
 [ 0.13256966 -0.6736362  -1.75572397 ... -0.94742344  1.93814372
   1.46186149]
 ...
 [ 0.88892467  1.79208064  0.12857314 ...  1.16218543  0.02861918
   0.02193655]
 [-1.41602923 -0.53813091  1.53977083 ...  0.73687115  0.91999201
   0.94353719]
 [-0.72580542  0.75384585 -0.76340768 ...  0.48940175  0.91176122
   1.67633513]]
<class 'numpy.ndarray'>


In [36]:
# Create a function that adds 10
add_10 = lambda i: i + 10

# Create a vectorized function
vectorized_add_10 = np.vectorize(add_10)

# Apply function to all elements in matrix
new_l = vectorized_add_10(l)
new_l

array([[ 9.78234624,  9.06416159, 11.04988693, ...,  8.1059292 ,
        10.51550383,  8.44693354],
       [ 8.00443703,  9.9405345 ,  8.62274261, ..., 11.6024647 ,
         8.10519186, 11.53767603],
       [10.13256966,  9.3263638 ,  8.24427603, ...,  9.05257656,
        11.93814372, 11.46186149],
       ...,
       [10.88892467, 11.79208064, 10.12857314, ..., 11.16218543,
        10.02861918, 10.02193655],
       [ 8.58397077,  9.46186909, 11.53977083, ..., 10.73687115,
        10.91999201, 10.94353719],
       [ 9.27419458, 10.75384585,  9.23659232, ..., 10.48940175,
        10.91176122, 11.67633513]])

In [15]:
test = pd.read_csv('../dataset/test.csv')
x_test = test.text.values
x_test

array(['Just happened a terrible car crash',
       'Heard about #earthquake is different cities, stay safe everyone.',
       'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all',
       ..., 'Green Line derailment in Chicago http://t.co/UtbXLcBIuY',
       'MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3',
       '#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm'],
      dtype=object)

In [29]:
labels = df.target.values
print(labels)
print(len(labels))
print(type(labels))

[1 1 1 ... 1 1 1]
7613
<class 'numpy.ndarray'>


In [37]:
from sklearn import preprocessing
from keras.layers import Input,Dense,Embedding,LSTM,Dropout,Activation
from keras.layers import Bidirectional,GlobalMaxPool1D
# Split the data to train and validation
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(new_l,labels, test_size=0.2)
print(X_train)

[[ 9.11734658 10.1181292   9.33978054 ...  9.77718602 11.30327314
   9.9503195 ]
 [10.5517871  11.73065543  8.49385809 ... 10.37831197  8.35645726
  11.48569914]
 [10.17318796  9.85098199  9.43240414 ... 11.57884388  9.28201197
   8.65954162]
 ...
 [10.19493256 11.08589722  9.70139935 ... 10.30027151  8.73070422
   8.75654313]
 [ 9.60852369  8.58445431 10.67662507 ... 10.28894443  8.80182757
  10.88313309]
 [ 8.04942151 11.74063548 10.04462099 ...  8.0370108  10.30192352
   8.76515417]]


In [38]:
embedded_size = 300
max_features = 10000
maxlen = 300

In [39]:
from keras.models import Model

inp = Input(shape = (maxlen,))
x = Embedding(max_features,embedded_size)(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1,activation = 'sigmoid')(x)
model = Model(inputs = inp,outputs = x)
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

print(model.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 300)          3000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300, 128)          186880    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17  

In [40]:
# training the model
model.fit(X_train, y_train, batch_size=512, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7faadaccec50>