In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import json
import copy
# Apply the 'fivethirtyeight' matplotlib style sheet to all figures.
plt.style.use('fivethirtyeight')

# Draw figures inline in the notebook output, at 'retina' resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Training split: read the header-less CSV, discard the id column and
# replace the OAG / CAG / NAG label strings with integer class ids.
# Any label outside the mapping becomes NaN (pandas `map` semantics).
cols = ['id', 'text', 'sentiment']
train = (
    pd.read_csv("agr_en_train.csv", names=cols, header=None)
    .drop(columns=['id'])
    .assign(sentiment=lambda frame: frame['sentiment'].map({'OAG': 0, 'CAG': 1, 'NAG': 2}))
)

In [None]:
# Dev split: same layout and label encoding as the training file.
cols = ['id', 'text', 'sentiment']
dev = pd.read_csv("agr_en_dev.csv", names=cols, header=None)
# Keep only the text and its label.
dev = dev.drop(columns=['id'])
# OAG / CAG / NAG -> 0 / 1 / 2; unmapped labels become NaN.
dev['sentiment'] = dev['sentiment'].map({'OAG': 0, 'CAG': 1, 'NAG': 2})

In [None]:
# Stack train and dev into a single frame.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# ignore_index=True gives the result a fresh 0..n-1 index: both inputs are
# indexed from 0, so keeping their labels would leave duplicate index values
# and make label-based lookups such as `series[i]` return several rows.
my_df = pd.concat([train, dev], ignore_index=True)

In [None]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils
import re
from nltk.corpus import stopwords
import keras
import keras.preprocessing.text as kpt

In [None]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.segmenter import Segmenter
from ekphrasis.classes.spellcorrect import SpellCorrector
import nltk

# Stand-alone helpers created next to the main pre-processor.
puncttok = nltk.WordPunctTokenizer().tokenize
sp = SpellCorrector(corpus="english")
seg_tw = Segmenter(corpus="twitter")

# Main ekphrasis pipeline used to pre-process every document.
text_processor = TextPreProcessor(
    # Term types that get normalized.
    # ('url' appears twice; the list is kept exactly as configured.)
    normalize=[
        'url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number',
    ],
    # Term types that get annotated.
    annotate={
        "hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored',
    },
    # Fix HTML tokens.
    fix_html=True,
    # Word statistics used for word segmentation.
    segmenter="twitter",
    # Word statistics used for spell correction.
    corrector="twitter",
    # Run word segmentation on hashtags.
    unpack_hashtags=True,
    # Expand contractions (can't -> can not).
    unpack_contractions=True,
    # No spell correction for elongated words.
    spell_correct_elong=False,
    # Tokenizer callable: takes a string, returns a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Dictionaries used to replace tokens with other expressions;
    # more than one may be supplied.
    dicts=[emoticons],
)

In [None]:
FLAGS = re.MULTILINE | re.DOTALL

# Building blocks for the generic "eyes + optional nose + mouth" smiley rules.
_EYES = r"[8:=;]"
_NOSE = r"['`\-]?"

# Ordered (pattern, replacement) pairs applied by tokenize().
# Order matters: every rule sees the output of the rules before it.
_TOKENIZE_RULES = (
    # ---- URLs and explicit emoticons -------------------------------------
    (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", " url "),
    (r"(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))", " em_positive "),  # smile  :) : ) :-) (: ( : (-: :')
    (r"(:\s?D|:-D|x-?D|X-?D)", " em_positive "),            # laugh  :D : D :-D xD x-D XD X-D
    (r"(<3|:\*)", " em_positive "),                         # love   <3 :*
    (r"(;-?\)|;-?D|\(-?;)", " em_positive "),               # wink   ;-) ;) ;-D ;D (; (-;
    (r'(:\s?\(|:-\(|\)\s?:|\)-:)', " em_negative "),        # sad    :-( : ( :( ): )-:
    (r'(:,\(|:\'\(|:"\()', " em_negative "),                # cry    :,( :'( :"(

    # ---- elongation, hyphens, apostrophes --------------------------------
    # Any character repeated more than twice is cut down to two
    # (funnnnny -> funny).  DOTALL makes this apply to newlines as well.
    (r"(.)\1+", r"\1\1"),
    # Remove hyphens and apostrophes.
    (r"(-|\')", ""),
    # No-op at this position: every hyphen was removed by the rule above.
    (r"@[0-9]+-", " number "),

    # ---- generic smileys -------------------------------------------------
    (_EYES + _NOSE + r"[)dD]+|[)dD]+" + _NOSE + _EYES, " em_positive "),
    (_EYES + _NOSE + r"p+", " em_positive "),
    (_EYES + _NOSE + r"\(+|\)+" + _NOSE + _EYES, " em_negative "),
    (_EYES + _NOSE + r"[\/|l*]", " em_neutralface "),
    # No-op at this position: no hyphen can be left in the text.
    (r'-', r' '),

    # ---- "plss" / "plzz" and repeated question marks ---------------------
    # The previous patterns here were the character classes [pls?s]{2,} and
    # [plz?z]{2,}, which rewrote any run of p/l/s/z/? ("apple" -> "ale",
    # "help" -> "hep").  Only the two useful effects are kept.
    (r"\?{2,}", "?"),
    (r"\b([Pp][Ll])([SsZz])\2+\b", r"\1\2"),

    # A literal backslash followed by "n" becomes a space.
    (r'\\n', r' '),

    # ---- chat abbreviations ----------------------------------------------
    (r" sx ", " sex "),
    # Must run before " r ", which would otherwise break it up.
    (r" r b i ", " rbi "),
    (r" u ", " you "),
    (r" r ", " are "),
    (r" y ", " why "),
    (r" Y ", " WHY "),
    # \b keeps this from rewriting the tail of ordinary words ("WHY ").
    (r"\bY ", " WHY "),
    (r" hv ", " have "),
    (r" c ", " see "),
    (r" bcz ", " because "),
    (r" coz ", " because "),
    (r" v ", " we "),
    (r" ppl ", " people "),
    (r" pepl ", " people "),
    (r" R B I ", " RBI "),
    (r" R b i ", " rbi "),
    (r" R ", " ARE "),
    (r" hav ", " have "),
    (r"\bR ", " ARE "),   # \b: leave words such as "SIR " alone
    (r" U ", " you "),
    (r" 👎 ", " OAG "),
    (r"\bU ", " you "),   # \b: leave words such as "YOU " alone
    (r" pls ", " please "),
    (r"Pls ", "Please "),
    (r"plz ", "please "),
    (r"Plz ", "Please "),
    (r"PLZ ", "Please "),
    (r"Pls", "Please "),
    (r"plz", "please "),
    (r"Plz", "Please "),
    (r"PLZ", "Please "),
    (r" thankz ", " thanks "),
    (r" thnx ", " thanks "),

    # ---- profanity and censored words ------------------------------------
    # The elongation rule has already collapsed runs of "*" to two, so the
    # censored patterns accept "two or more" asterisks; the earlier patterns
    # that required three or more could not match.
    (r"fuck\w+ ", " fuck "),
    (r"f\*{2,} ", " fuck "),
    (r"\*{2,}k ", " fuck "),
    (r"F\*{2,} ", " fuck "),
    (r"mo\*{2,} ", " fucker "),
    (r"b\*{2,} ", " blody "),
    (r" mc ", " fucker "),
    (r" MC ", " fucker "),
    (r" wtf ", " fuck "),
    (r" ch\*{2,}ya ", " fucker "),
    (r" ch\*\*Tya ", " fucker "),
    (r" ch\*\*Tia ", " fucker "),
    (r" C\*{2,}yas ", " fucker "),
    (r"l\*{2,} ", "shit "),
    (r" A\*{2,}S", " ASSHOLES"),
    (r" di\*{2,}s", " cker"),

    (r" nd ", " and "),
    (r"\bNd ", "and "),

    # ---- repeated punctuation --------------------------------------------
    (r"([!?!]){2,}", r"! "),
    (r"([.?.]){2,}", r". "),
    (r"([*?*]){2,}", r"* "),
    (r"([,?,]){2,}", r", "),
    (r"([!]){2,}", r"! "),
    (r"([.]){2,}", r". "),
    (r"([*]){2,}", r"* "),
    (r"([,]){2,}", r", "),
    (r"\n\r", " "),

    # ---- domain-specific hashtags ----------------------------------------
    # Accept the full "vs" as well as the single "v" / "s" the old character
    # class [vs] matched.
    (r"(ind(?:vs|[vs])pak)", " india versus pakistan "),
    (r"(pak(?:vs|[vs])ind)", " pakistan versus india "),
    (r"(indvsuae)", " india versus United Arab Emirates "),
    # Match the whole "jnu" suffix; the old class [jnuJNU] matched one letter.
    (r"[sS]hut[Dd]own[jJ][nN][uU]", " shut down jnu "),
)


def tokenize(text):
    """Normalise one raw post with an ordered list of regex substitutions.

    Every (pattern, replacement) pair in ``_TOKENIZE_RULES`` is applied in
    turn with ``re.sub`` using ``FLAGS`` (MULTILINE | DOTALL).  The rules
    replace URLs and emoticons with placeholder tokens (`` url ``,
    `` em_positive ``, `` em_negative ``, `` em_neutralface ``), shorten
    elongated character runs, strip hyphens and apostrophes, expand chat
    abbreviations, rewrite censored profanity, collapse repeated
    punctuation and expand a few hashtags.

    Args:
        text: the string to normalise.

    Returns:
        The rewritten string.  Replacements are padded with spaces, so the
        result can contain leading, trailing or doubled spaces.
    """
    for pattern, repl in _TOKENIZE_RULES:
        text = re.sub(pattern, repl, text, flags=FLAGS)
    return text

def stem(word):
    """Strip a single trailing "ing" from ``word``.

    This is a deliberately crude stemmer: "running" -> "runn",
    "singing" -> "sing", and a bare "ing" -> "".

    Args:
        word: the token to stem.

    Returns:
        ``word`` without its trailing "ing" if it has one.  If the pattern
        does not match at all (for example the token contains an embedded
        newline) ``word`` is returned unchanged rather than raising
        IndexError.
    """
    match = re.match(r'^(.*?)(ing)?$', word)
    if match is None:
        return word
    return match.group(1)

# Hand-written stop-word list.  tokenize_data does not consult it: its result
# is filtered with the NLTK 'my_english_words' list only.
my_word_stop = ['the','in','of','is','a','to','an','be','are','for','he','she','we','was','it','as','on']


def tokenize_data(data):
    """Drop stop words and stem every remaining token of each document.

    Each document is split on whitespace; tokens found in the NLTK
    stop-word list named 'my_english_words' are removed, the rest go
    through ``stem`` and are re-joined with single spaces.
    ('my_english_words' is presumably a user-supplied file in the NLTK
    stopwords corpus directory — verify it is installed.)

    Earlier versions computed six further variants of ``filtered_words``
    that were overwritten before use; only the final one ever reached the
    output, so only that one is computed here.

    Args:
        data: iterable of strings (a pandas Series, list or array).  Items
            are visited in iteration order, so the index labels of a Series
            are irrelevant.

    Returns:
        list of str, one processed string per input document.
    """
    # Load the list once; a set makes each membership test O(1).
    custom_stopwords = frozenset(stopwords.words('my_english_words'))
    return [
        ' '.join(stem(word) for word in text.split() if word not in custom_stopwords)
        for text in data
    ]


In [None]:
# ls: stop-word-filtered, stemmed train+dev text.
# Xs: filled by the ekphrasis pre-processing step.
ls, Xs = [], []
ls = tokenize_data(my_df['text'])

In [None]:
# Run every document through the ekphrasis pipeline.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot leave duplicated rows in Xs.
Xs = [text_processor.pre_process_doc(row) for row in ls]

In [None]:
import csv

# Write the pre-processed documents, one per CSV row.  Each row has a single
# field holding that document's pre-processing result.
# newline='' is what the csv module requires of files it writes to (without
# it, extra blank lines can appear on some platforms); an explicit UTF-8
# encoding keeps emoji and other non-ASCII text from depending on the
# platform default.
with open('segmentation_train_dev.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for row in Xs:
        writer.writerow([row])

# Number of rows written (the `with` block has already closed the file).
count = len(Xs)
print(count)

In [None]:
# Load the pre-processed train+dev text (no header row; columns are numbered
# from 0).  This rebinds `train`, replacing the labelled frame loaded earlier.
# NOTE(review): this reads "segmentation_fb_train_dev.csv", while the export
# cell above writes "segmentation_train_dev.csv" — confirm where the "_fb_"
# file comes from.
train = pd.read_csv("segmentation_fb_train_dev.csv",header=None)

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Apply the stop-word / stemming pass to column 0 and use the result as the
# model's text input.
# Note: column 0 is overwritten in place, so re-running this cell processes
# text that has already been processed.
train[0] = tokenize_data(train[0])
train_x = train[0]

In [None]:
# Integer class ids from the combined train+dev frame.
# NOTE(review): train_x comes from a separately loaded CSV; this assumes its
# rows match my_df in number and order — confirm.
train_y = my_df.sentiment

In [None]:
# Fresh Keras tokenizer: no filter characters and no lower-casing, so the
# already pre-processed tokens are taken as they are.
tokenizer = Tokenizer(filters='', lower=False)
# Learn the vocabulary from the training posts.
tokenizer.fit_on_texts(train_x)

In [None]:
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# The fitted tokenizer exposes its word -> id mapping.
dictionary = tokenizer.word_index
# Persist the mapping as JSON so the inference step can reload it.
with open('dictionary.json', 'w') as dictionary_file:
    dictionary_file.write(json.dumps(dictionary))

def convert_text_to_index_array(text):
    """Return the vocabulary ids of the known words in ``text``.

    ``text`` is split with ``kpt.text_to_word_sequence`` (no filter
    characters, no lower-casing).  Words missing from ``dictionary`` are
    skipped, so the returned list can be shorter than the number of words
    and its length varies from text to text.
    """
    words = kpt.text_to_word_sequence(text, filters='', lower=False)
    return [dictionary[token] for token in words if token in dictionary]

# For each post, change each known token to its ID in the Tokenizer's
# word_index.
allWordIndices = [convert_text_to_index_array(text) for text in train_x]

# The per-post id lists have different lengths.  They are kept as a plain
# list of lists: np.asarray on ragged input raises ValueError on NumPy >= 1.24,
# and sequences_to_matrix only needs to iterate over the sequences.

# create one-hot matrices out of the indexed posts
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories (3 classes)
# Note: this rebinds train_y, so the cell is meant to run once per session.
train_y = keras.utils.to_categorical(train_y, 3)

In [None]:
# Report the vocabulary size (`dictionary` is the tokenizer's word_index).
print('Found %s unique tokens.' % len(dictionary))

In [None]:
# Width of the bag-of-words matrix; the model below uses it as its input size.
train_x.shape[1]

In [None]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation , LSTM , Input , Embedding
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, concatenate, Activation, Average
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense, Dropout , Bidirectional
from keras.layers import Flatten , LSTM , Reshape
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.regularizers import L1L2
from keras import optimizers
from keras.callbacks import CSVLogger

In [None]:
# Feed-forward classifier over the binary bag-of-words vectors:
# 150 relu units -> dropout -> 300 sigmoid units -> dropout -> 3-way softmax.
model = Sequential()

model.add(Dense(150, activation='relu',input_shape=(train_x.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(300, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

filepath="sequencing_the_data_try_n_error.{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}.hdf5"
# NOTE(review): `checkpoint` is created but is not in the callbacks list passed
# to fit() below, so no checkpoint files are written — confirm whether that is
# intended before wiring it in.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# Per-epoch metrics are appended to final_log.csv, ';'-separated.
csv_logger = CSVLogger('final_log.csv', append=True, separator=';')

# Train for 2 epochs in batches of 50, holding out 20% of the rows for
# validation.
model.fit(train_x, train_y,
    batch_size=50,
    epochs=2,
    verbose=1,
    validation_split=0.2,
    shuffle=True,callbacks = [csv_logger])

# Save the architecture (JSON) and the weights (HDF5) separately; the
# inference cell reloads both files.
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')

In [None]:
# Shape of the training matrix: (number of posts, feature width).
print(train_x.shape)

# ---- Loading: the cells below read the test data, reload the saved model
# ---- and write predictions.

In [None]:
# Test split: header-less CSV with an id and a text column.
# The id column is kept here (the drop is intentionally not applied).
cols = ['id', 'text']
my_dev = pd.read_csv("agr_en_sm_test.csv", names=cols, header=None)

In [None]:
# cs: stop-word-filtered, stemmed test text.
# Zs: filled by the ekphrasis pre-processing step.
cs, Zs = [], []
cs = tokenize_data(my_dev['text'])

In [None]:
# Run every test document through the ekphrasis pipeline.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot leave duplicated rows in Zs.
Zs = [text_processor.pre_process_doc(row) for row in cs]

In [None]:
import csv

# Write the pre-processed test documents, one per CSV row.  Each row has a
# single field holding that document's pre-processing result.
# newline='' is what the csv module requires of files it writes to; an
# explicit UTF-8 encoding keeps non-ASCII text from depending on the
# platform default.
with open('segmentation_test.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for row in Zs:
        writer.writerow([row])

# Number of rows written (the `with` block has already closed the file).
count = len(Zs)
print(count)

In [None]:
# `cols` is assigned here but not passed to read_csv, so the single column is
# named 0.
cols = ['text']
# Load the pre-processed test text (no header row).  This rebinds `my_dev`.
# NOTE(review): this reads "segmentation_fb_test.csv", while the export cell
# above writes "segmentation_test.csv" — confirm where the "_fb_" file comes
# from.
my_dev = pd.read_csv("segmentation_fb_test.csv",header=None)

In [None]:
# Number of test documents loaded.
len(my_dev)

In [None]:
# Force every entry of the text column to str.
# Presumably guards against rows that were read as NaN — confirm.
my_dev[0] = my_dev[0].astype('str')

In [None]:
import json
import numpy as np
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json

# for human-friendly printing (position = class id used in training)
labels = ['OAG','CAG','NAG']

# read in our saved dictionary
with open('dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

# We still use a Tokenizer here, but only for sequences_to_matrix, so it is
# not fitted.  The training matrix was built by a tokenizer without num_words,
# whose width is len(word_index) + 1; deriving the same number from the saved
# dictionary lets this cell run without the in-memory training matrix.
tokenizer = Tokenizer(num_words=len(dictionary) + 1)

# Every word met during conversion that is missing from the dictionary.
not_found_word_list = []


def convert_text_to_index_array(text):
    """Map ``text`` to vocabulary ids and count its unknown words.

    ``text`` is split with ``kpt.text_to_word_sequence`` (no filter
    characters, no lower-casing).  Words missing from ``dictionary`` are
    appended to the module-level ``not_found_word_list``.

    Returns:
        (wordIndices, no_word): the ids of the known words, in order, and
        the number of words that were not in the dictionary.
    """
    wordIndices = []
    no_word = 0
    for word in kpt.text_to_word_sequence(text, filters='', lower=False):
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            not_found_word_list.append(word)
            no_word += 1
    return wordIndices, no_word


# read in your saved model structure and create a model from it
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# and weight your nodes with your saved values
model.load_weights('model.h5')

# Convert every test document first, so the model is called once on the whole
# batch instead of once per row.
sequences = []
no_words = 0
for row in my_dev[0]:
    testArr, no_word = convert_text_to_index_array(row)
    sequences.append(testArr)
    no_words += no_word
count = len(sequences)

# One predicted label per line, in the order of the input rows.
with open('fileName.csv', 'w') as f:
    if sequences:  # predict() is skipped for an empty test set
        features = tokenizer.sequences_to_matrix(sequences, mode='binary')
        pred = model.predict(features)
        for class_id in np.argmax(pred, axis=1):
            f.write(labels[class_id] + "\n")

print(count)
print("word not found : ", no_words)

# Dump the out-of-vocabulary words for inspection; UTF-8 because they may
# contain non-ASCII characters.
with open('not_found_word_list.csv', 'w', encoding='utf-8') as f:
    for word in not_found_word_list:
        f.write(str(word)+"\n")