In [9]:
import re
from functools import partial
from collections import Counter
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

def removeUnicode(text):
	#Removes unicode strings like "\u002c" and "x96"
	text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)       
	text = re.sub(r'[^\x00-\x7f]',r'',text)
	return text

def replaceURL(text):
	#Replaces url address with "url" 
	text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','<url>',text)
	text = re.sub(r'#([^\s]+)', r'\1', text)
	return text

def replaceAtUser(text):
	#Replaces "@user" with "atUser"
	text = re.sub('@[^\s]+','<user>',text)
	return text

def removeHashtag(text):
	#Removes hastag in front of a word
	text = re.sub(r'#([^\s]+)', r'\1', text)
	return text

def removeNumbers(text):
	#Removes integers
	text = ''.join([i for i in text if not i.isdigit()])         
	return text

def replaceMulExcl(text):
	#Replaces repetitions of exlamation marks
	text = re.sub(r"(\!)\1+", '!', text)
	return text

def replaceMulQues(text):
	#Replaces repetitions of question marks
	text = re.sub(r"(\?)\1+", '?', text)
	return text

def replaceMulStop(text):
	#Replaces repetitions of stop marks
	text = re.sub(r"(\.)\1+", '.', text)
	return text

def countMulExcl(text):
	#count repetitions of exlamation marks
	return len(re.findall(r"(\!)\1+", text))

def countMulQues(text):
	#Count repetitions of question marks
	return len(re.findall(r"(\?)\1+", text))

def countMulStop(text):
	#Count repetitions of stop marks
	return len(re.findall(r"(\.)\1+", text))

def countElongated(text):
	#count of how many words are elongated
	regex = re.compile(r"(.)\1{2}")
	return len([word for word in text.split() if regex.search(word)])

def countAllCaps(text):
	#count of how many words are all caps
	return len(re.findall("[A-Z0-9]{3,}", text))

#Creates a dictionary with slangs and their equivalents and replaces them
with open('slang.txt') as file:
	slang_map = dict(map(str.strip, line.partition('\t')[::2])
	for line in file if line.strip())

slang_words = sorted(slang_map, key=len, reverse=True)
regex = re.compile(r"\b({})\b".format("|".join(map(re.escape, slang_words))))
replaceSlang = partial(regex.sub, lambda m: slang_map[m.group(1)])

#punctuation list for replacing

puncts = [',', '.', '"', ':', ')', '(', '-', '|', ';', "'", '$', '&', '/', '[', ']', '%', '=', '*', '+', '\\', '•',  '~', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def removePuncts(x):
	x = str(x)
	for punct in puncts:
		if punct in x:
			x = x.replace(punct, f' ')
	return x



def countSlang(text):
	# counts how many slang words and a list of found slangs
	slangCounter = 0
	slangsFound = []
	tokens = nltk.word_tokenize(text)
	for word in tokens:
		if word in slang_words:
			slangsFound.append(word)
			slangCounter += 1
	return slangCounter, slangsFound

#Replaces contractions from a string to their equivalents
contraction_patterns = [ (r'I\'m', 'I am'),(r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
						 (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
def replaceContraction(text):
	patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
	for (pattern, repl) in patterns:
		(text, count) = re.subn(pattern, repl, text)
	return text

def replaceElongated(word):
	#Replaces an elongated word with its basic form

	repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
	repl = r'\1\2\3'
	if wordnet.synsets(word):
		return word
	repl_word = repeat_regexp.sub(repl, word)
	if repl_word != word:      
		return replaceElongated(repl_word)
	else:       
		return repl_word

def removeEmoticons(text):
	#Removes emoticons from text 
	text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', '', text)
	return text

def countEmoticons(text):
	#Input: a text, Output: how many emoticons
	return len(re.findall(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:', text))


### Spell Correction begin ###
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('spell_correction.txt').read()))

def P(word, N=sum(WORDS.values())): 
	#P robability of `word`.
	return WORDS[word] / N

def spellCorrection(word): 
	#Most probable spelling correction for word.
	return max(candidates(word), key=P)

def candidates(word): 
	#Generate possible spelling corrections for word.
	return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
	#The subset of `words` that appear in the dictionary of WORDS.
	return set(w for w in words if w in WORDS)

def edits1(word):
	#All edits that are one edit away from `word`.
	letters    = 'abcdefghijklmnopqrstuvwxyz'
	splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
	deletes    = [L + R[1:]               for L, R in splits if R]
	transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
	replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
	inserts    = [L + c + R               for L, R in splits for c in letters]
	return set(deletes + transposes + replaces + inserts)

def edits2(word): 
	#All edits that are two edits away from `word`.
	return (e2 for e1 in edits1(word) for e2 in edits1(e1))

### Spell Correction End ###

In [10]:
import pandas as pd
import numpy as np
import string
from time import time
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 


def k_prep(inputText = 'null'):
	'''
	c = input("\n1.Enter or 2.default or 3.df ? :")

	if c == "2":
		text = "AFRICA,#AFRICANBAZE: asap goaaaaal @Breaking !!!! news:Nigeria oooooooh :-D aren't flag???? wont set ablaze..... 12000 in America. http://t.co/2nndBGwyEi,1"
		#print(text)
	
	elif c == "3" :
		text = inputText

	else:
		text = input("\nEnter the tweet: ")
	'''

	text = inputText
	#print("\nReplacing url from tweet\n")
	text = replaceURL(text)
	#print(text)

	#print("\nReplacing atuser from tweet\n")
	text = replaceAtUser(text)
	#print(text)

	#print("\nremoving hashtag in tweet\n")
	text = removeHashtag(text)
	#print(text)

	#print("\nreplace at user in tweet\n")
	text = replaceAtUser(text)
	#print(text)

	'''
	#print("\nremoving stopwords\n")
	#nltk.download('stopwords')
	from nltk.corpus import stopwords
	stop = set(stopwords.words('english'))

	d=[]
	d.append([x for x in text.split() if x not in stop])
	d = d[0]
	text = ' '.join(d)
	#print(text)
	'''

	#print("\nremove numbers from tweet\n")
	text = removeNumbers(text)
	#print(text)

	#print("\nremove emoticons from tweet\n")
	text = removeEmoticons(text)
	#print(text)

	#couting multple punctuations
	#print("\ncounting multiple punctuations\n")
	MultiExclMarks = 0
	MultiQuesMarks = 0
	MultiStopMarks = 0

	MultiExclMarks += countMulExcl(text)
	MultiQuesMarks += countMulQues(text)
	MultiStopMarks += countMulStop(text)

	#print(MultiExclMarks,MultiQuesMarks,MultiStopMarks)

	#print("\nremove multiexclamations from tweet\n")
	text = replaceMulExcl(text)
	#print(text)

	#print("\nremove multiquestionmarks from tweet\n")
	text = replaceMulQues(text)
	#print(text)

	#print("\nremove multistopmarks from tweet\n")
	text = replaceMulStop(text)
	#print(text)


	#print("\nshortening elongated words\n")
	totalElongated = 0
	totalElongated += countElongated(text)
	#print(totalElongated)

	regex1 = re.compile(r"(.)\1{2}")
	l=[]
	for word in text.split():
		if(regex1.search(word)):
			new_word = replaceElongated(word)
			##print(new_word)
			l.append(new_word)
		else:
			l.append(word)
	text = ' '.join(l)
	#print(text)

	#print("\nRemoving punctuations except ?!\n")
	text = removePuncts(text)
	#print(text)
	
	#print("\nexpanding slangs in tweet\n")
	text = replaceSlang(text)
	#print(text)

	#print("\nreplace contractions in tweet\n")
	text = replaceContraction(text)
	#print(text)
	
	#print(\nTokenizing the text\n")
	text = word_tokenize(text)
	
	#print("\nLemmatizing the text\n")
	lemma = WordNetLemmatizer()
	
	list1 = []
	for txt in text:
		list1.append(lemma.lemmatize(txt))
		
	return list1

In [11]:
import pandas as pd
import gensim

df = pd.read_csv("../dataset/train.csv")
df['text'] = df['text'].apply(k_prep)
print(df['text'])

0       [Our, Deeds, are, the, Reason, of, this, earth...
1           [Forest, fire, near, La, Ronge, Sask, Canada]
2       [All, resident, asked, to, shelter, in, place,...
3       [people, receive, wildfire, evacuation, order,...
4       [Just, got, sent, this, photo, from, Ruby, Ala...
                              ...                        
7608    [Two, giant, crane, holding, a, bridge, collap...
7609    [<, user, >, <, user, >, The, out, of, control...
7610    [M, UTC, ?, km, S, of, Volcano, Hawaii, <, url...
7611    [Police, investigating, after, an, e, bike, co...
7612    [The, Latest, More, Homes, Razed, by, Northern...
Name: text, Length: 7613, dtype: object


# Word2Vec model

In [12]:
# importing the w2c model
# use this or below one
w2c = gensim.models.KeyedVectors.load_word2vec_format('../../../temp/GoogleNews-vectors-negative300.bin', binary=True)
w2c['go']

array([-2.63671875e-02,  6.83593750e-02, -3.11279297e-02,  2.19726562e-01,
        3.41796875e-03, -9.03320312e-03,  1.07910156e-01, -1.74804688e-01,
        7.71484375e-02,  3.83377075e-04, -1.02539062e-01, -1.73339844e-02,
       -3.08837891e-02,  5.76171875e-02, -1.09863281e-01,  6.10351562e-02,
        2.48046875e-01,  5.46264648e-03,  3.49121094e-02,  7.65991211e-03,
       -1.07910156e-01,  2.16796875e-01,  1.26953125e-01,  1.46484375e-01,
        1.55273438e-01,  4.46777344e-02,  7.51953125e-02, -1.45507812e-01,
       -7.71484375e-02, -8.54492188e-02, -1.15966797e-02,  7.95898438e-02,
       -1.94335938e-01, -2.57812500e-01, -9.86328125e-02, -1.38671875e-01,
       -4.13894653e-04, -9.03320312e-02,  7.66601562e-02,  1.33789062e-01,
        5.17578125e-02, -4.80957031e-02,  1.88476562e-01, -3.44238281e-02,
       -7.95898438e-02,  3.54003906e-02, -1.02539062e-01, -1.33789062e-01,
       -1.06445312e-01,  7.42187500e-02, -2.46582031e-02,  1.99218750e-01,
        1.54296875e-01, -

In [5]:
def create_number(num):
    mylist = []
    for i in range(1,num):
        mylist.append(str(i))
    return mylist



In [6]:
def create_list(n):
    import random
    randomlist = []
    for i in range(0,n):
        n = random.uniform(-2,2)
        randomlist.append(n)
    return randomlist

In [None]:
# using glove instead fo w2c
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models.keyedvectors import KeyedVectors
#glove_file = datapath('../../../temp/glove/glove.twitter.27B.25d.txt')
#tmp_file = get_tmpfile("test_word2vec.txt")
#glove2word2vec(glove_file, tmp_file)
glove2word2vec(glove_input_file="../../../temp/glove/glove.twitter.27B.200d.txt", word2vec_output_file="gensim_glove_vectors.txt")
w2c = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt")
w2c['go']

# Returns mean of w2v value

In [7]:

def document_vector(model,doc):
	#filename = 'glove.twitter.27B.25d.txt.word2vec'
	#model = KeyedVectors.load_word2vec_format(filename, binary=False)
	# remove out-of-vocabulary words
	doc = [word for word in doc if word in model.vocab]
	return np.mean(model[doc], axis=0)

In [13]:
new_df = pd.DataFrame()
final_df = []
missed = []
l = []
for index, row in df.iterrows():
    string1 = row['text']
    #l.append(string1)
    try:
        result = document_vector(w2c, string1)
        #print(result)
        l.append(list(result))
        #print(l)
        #final_df.append(l)
        #a_series = pd.Series(l)
        #print(l)

        #new_df = new_df.append(a_series,ignore_index=True)
    except Exception as e:
        list1 = [
            0.69671315,
            0.049782764,
            -0.24523668,
            -0.15872465,
            -0.0665417,
            0.20241983,
            0.077576466,
            1.9189811,
            -0.006817713,
            0.06951927,
            -0.4152605,
            -0.89838743,
            -3.7327635,
            -0.049629666,
            -0.7617386,
            -0.5561758,
            -0.9451503,
            0.035365578,
            -1.0393411,
            0.2922259,
            -0.16664228,
            -0.46666014,
            0.35039356,
            0.40681368,
            0.38142973,
            ]
        l.append(create_list(300))
        #missed.append(str(index) + ' ')
        #print(e)
        #final_df.append(l)
        #a_series = pd.Series(l)
        #new_df.append(a_series,ignore_index=True)
print(l[1])
len(l[:2])

NameError: name 'create_list' is not defined

In [9]:
# Create the pandas DataFrame 
label_df = df['target']
result_df = pd.DataFrame(l, columns = create_number(301))
#print(result_df)
print(label_df)

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64


In [10]:
print(result_df)

             1         2         3         4         5         6         7  \
0     0.021657  0.078870  0.076080  0.082520 -0.118947  0.011986 -0.007594   
1     0.077427  0.018101  0.116699  0.069196  0.036717 -0.041347 -0.107396   
2    -0.014340 -0.006574  0.084782 -0.012732 -0.063611 -0.023063  0.054099   
3     0.177699 -0.047241  0.004203  0.100237  0.032628 -0.004854 -0.087019   
4     0.086696  0.004797 -0.040091  0.059727 -0.036028 -0.023974 -0.019771   
...        ...       ...       ...       ...       ...       ...       ...   
7608 -0.044428 -0.030121 -0.004439  0.026434  0.019830 -0.139926  0.021290   
7609  0.000160  0.034534  0.028207  0.069690 -0.031021 -0.015558 -0.004175   
7610 -0.170639 -0.028827  0.004154  0.250061  0.020477 -0.069992 -0.123230   
7611  0.041909  0.047636  0.053714 -0.044213 -0.061146  0.001296  0.023490   
7612  0.012226 -0.009672 -0.006827 -0.012755  0.018559  0.037983  0.033324   

             8         9        10  ...       291       292    

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(result_df,label_df, test_size=0.35)

In [12]:
# Starting neural network

In [13]:
# declaring variables
vocab_size = 10000
embedding_dim = 16
max_length = 30 
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [15]:
#2nd method
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Embedding layer

In [5]:
#Creating embedding layer for testing neural network
import tensorflow as tf
vocab_size = 270000000000000
embedding_dim = 300
max_length = 300

model = tf.keras.Sequential([
     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
     tf.keras.layers.Flatten(),
     tf.keras.layers.Dense(6, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')
])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          8100000000
_________________________________________________________________
flatten (Flatten)            (None, 90000)             0         
_________________________________________________________________
dense (Dense)                (None, 6)                 540006    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 81,000,000,000,540,013
Trainable params: 81,000,000,000,540,013
Non-trainable params: 0
_________________________________________________________________


In [7]:
#adding optimizer and loss functions for the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',tf.keras.metrics.AUC()])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [8]:
num_epochs = 100
history = model.fit(X_train, y_train, epochs=num_epochs)

NameError: name 'X_train' is not defined

# Convolution layer

In [25]:
model2 = tf.keras.Sequential([
     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
     tf.keras.layers.Conv1D(128, 5, activation='relu'),
     tf.keras.layers.GlobalAveragePooling1D(),
     tf.keras.layers.Dense(6, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')
])

In [26]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 16)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 296, 128)          10368     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 774       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 171,149
Trainable params: 171,149
Non-trainable params: 0
_________________________________________________________________


In [27]:
#adding optimizer and loss functions for the model
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',tf.keras.metrics.AUC()])

In [30]:
num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs)

Train on 4948 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# LSTM 

In [33]:
model3 = tf.keras.Sequential([
     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
     tf.keras.layers.Dense(64, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')
    ])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [34]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 300, 16)           160000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 296, 128)          10368     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 774       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 171,149
Trainable params: 171,149
Non-trainable params: 0
_________________________________________________________________


In [35]:
#adding optimizer and loss functions for the model
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy',tf.keras.metrics.AUC()])

In [36]:
num_epochs = 10
history = model.fit(X_train, y_train, epochs=num_epochs)

Train on 4948 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
