In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
print("import done")

Using TensorFlow backend.


import done


In [2]:
#https://github.com/debadridtt/A-Review-of-Different-Word-Embeddings-for-Sentiment-Classification-using-Deep-Learning/blob/master/LSTM%20Experiment.ipynb

In [3]:
DATA_FILE = 'C:/Users/khmar/Desktop/ISSUE/dataset/CSV/data_ameliorate/data.csv'
df = pd.read_csv(DATA_FILE,delimiter=';',encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                            I could not be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [4]:
df['label'].value_counts() #imbalanced Dataset


NOTISSUE    2030
ISSUE       2025
Name: label, dtype: int64

In [5]:
df.shape

(4055, 2)

###  Preprocessing the Data


In [6]:
import nltk                      # the natural langauage toolkit, open-source NLP
import gensim
from nltk.corpus import stopwords  
from gensim import parsing
import re# Help in preprocessing the data, very efficiently
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khmar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
STOP_WORDS ={
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'nobody',
 'none',
 'noone',
 'nor',
 'nothing',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [11]:
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import parsing
from gensim.parsing.preprocessing import split_alphanum
from spellchecker import SpellChecker
import re


def normaliser_word(word):
    slangs_dict = {
    'awsm': 'awesome',
    "aamof": "as a matter of fact",
    "abt": "about",
    "abt2": "about to",
    "ac": "air conditioning",
    "ace": "solo winner",
    "ack": "acknowledged",
    "admin": "administrator",
    "thr": "there",
    "frm": "from",
    "aggro": "aggression",
    "agl": "angel",
    "dob": "date of birth",
    "ai": "artificial intelligence",
    "aiic": "as if i care",
    "aka": "also known as",
    "alap": "as long as possible",
    "alol": "actually laughing out loud",
    "ama": "ask me anything",
    "amap": "as much as possible",
    "amazn": "amazing",
    "ammo": "ammunition",
    "ams": "ask me something",
    "anon": "anonymous",
    "asap": "as soon as possible",
    "asat": "as simple as that",
    "awks": "awkward",
    "awl": "always with love",
    "ayk": "as you know",
    "azm": "awesome",
    "b": "be",
    "b&w": "black and white",
    "b-day": "birthday",
    "bday": "birthday",
    "bcoz": "because",
    "bcos": "because",
    "bcz": "because",
    "bf": "boyfriend",
    "btw": "between",
    "b4": "before",
    "bai": "bye",
    "bb": "bye bye",
    "bc": "abuse",
    "mc": "abuse",
    "bcc": "blind carbon copy",
    "bff": "best friends forever",
    "biz": "business",
    "bk": "back",
    "bo": "back off",
    "bro": "brother",
    "btwn": "between",
    "c": "see",
    "char": "character",
    "combo": "combination",
    "cu": "see you",
    "cu2": "see you too",
    "cu2mr": "see you tomorrow",
    "cya": "see ya",
    "cyal8r": "see you later",
    "cyb": "call you back",
    "cye": "check your e-mail",
    "cyf": "check your facebook",
    "cyfb": "check your facebook",
    "cyl": "catch ya later",
    "cym": "check your myspace",
    "cyo": "see you online",
    "d8": "date",
    "da": "the",
    "dece": "decent",
    "ded": "dead",
    "dept": "department",
    "dis": "this",
    "ditto": "same",
    "diva": "rude woman",
    "dk": "don't know",
    "dlm": "don't leave me",
    "dm": "direct message",
    "dnd": "do not disturb",
    "dno": "don't know",
    "dnt": "don't",
    "e1": "everyone",
    "eg": "for example",
    "emc2": "genius",
    "emo": "emotional",
    "enuf": "enough",
    "eod": "end of discussion",
    "eof": "end of file",
    "eom": "end of message",
    "eta": "estimated time of arrival",
    "every1": "everyone",
    "evs": "whatever",
    "exp": "experience",
    "f": "female",
    "f2f": "face to face",
    "f2p": "free to play",
    "f2t": "free to talk",
    "f9": "fine",
    "fab": "fabulous",
    "fail": "failure",
    "faq": "frequently asked questions",
    "fav": "favorite",
    "fave": "favorite",
    "favs": "favorites",
    "fb": "facebook",
    "fbc": "facebook chat",
    "fbf": "facebook friend",
    "fml": "family",
    "fn": "fine",
    "fo": "freaking out",
    "fri": "friday",
    "frnd": "friend",
    "fu": "fuck you",
    "fugly": "fucking ugly",
    "gf": "girlfriend",
    "g1": "good one",
    "g2b": "going to bed",
    "g2cu": "good to see you",
    "g2g": "good to go",
    "g4i": "go for it",
    "g4n": "good for nothing",
    "g4u": "good for you",
    "g9": "goodnight",
    "ga": "go ahead",
    "ge": "good evening",
    "gl": "good luck",
    "gm": "good morning",
    "gn": "goodnight",
    "gonna": "going to",
    "goon": "idiot",
    "gorge": "gorgeous",
    "gr8": "great",
    "grats": "congratulations",
    "gratz": "congratulations",
    "grl": "girl",
    "gt2t": "got time to talk",
    "gtg": "good to go",
    "gud": "good",
    "gv": "give",
    "gvn": "given",
    "gw": "good work",
    "h/o": "hold on",
    "h/p": "hold please",
    "h/t": "hat tip",
    "h/u": "hook up",
    "h2cus": "hope to see you soon",
    "h4u": "hot for you",
    "h4x0r": "hacker",
    "h4x0rz": "hackers",
    "h8": "hate",
    "h8r": "hater",
    "h8t": "hate",
    "ha": "hello again",
    "haha": "laughing",
    "hai": "hi",
    "hak": "hugs and kisses",
    "han": "how about now?",
    "hav": "have",
    "hax": "hacks",
    "haxor": "hacker",
    "hay": "how are you",
    "hb2u": "happy birthday to you",
    "hbbd": "happy belated birthday",
    "hbd": "happy birthday",
    "hc": "how cool",
    "hcit": "how cool is that",
    "hehe": "laughing",
    "hf": "have fun",
    "hi5": "high five",
    "hig": "how's it going?",
    "hih": "hope it helps",
    "ho": "hold on",
    "hoc": "house of cards",
    "hof": "hall of fame",
    "holla": "holler",
    "hom": "hit or miss",
    "hood": "neighborhood",
    "hoops": "basketball",
    "hottie": "attractive person",
    "hr": "human resources",
    "hru": "how are you",
    "hry": "hurry",
    "hubby": "husband",
    "hwk": "homework",
    "hwp": "height weight proportionate",
    "hwu": "hey, what's up?",
    "hxc": "hardcore",
    "h^": "hook up",
    "i8": "i ate",
    "i8u": "i hate you",
    "ia": "i agree",
    "iab": "in a bit",
    "iac": "in any case",
    "iad": "it all depends",
    "iae": "in any event",
    "iag": "it's all good",
    "iagw": "in a good way",
    "iail": "i am in love",
    "iam": "in a minute",
    "ic": "i see",
    "id10t": "idiot",
    "idc": "i don't care",
    "idd": "indeed",
    "idi": "i doubt it",
    "idk": "i don't know",
    "idky": "i don't know why",
    "idmb": "i'll do my best",
    "idn": "i don't know",
    "idnk": "i do not know",
    "idr": "i don't remember",
    "idt": "i don't think",
    "idts": "i don't think so",
    "idtt": "i'll drink to that",
    "idu": "i don't understand",
    "ie": "that is",
    "ig2p": "i got to pee",
    "iggy": "ignored",
    "ight": "alright",
    "igi": "i get it",
    "ign": "in-game name",
    "igtp": "i get the point",
    "ih8u": "i hate you",
    "ihu": "i hate you",
    "ihy": "i hate you",
    "ii": "i'm impressed",
    "iiok": "if i only knew",
    "iir": "if i remember",
    "iirc": "if i remember correctly",
    "iit": "i'm impressed too",
    "iiuc": "if i understand correctly",
    "ik": "i know",
    "ikhyf": "i know how you feel",
    "ikr": "i know, right?",
    "ikwum": "i know what you mean",
    "ikwym": "i know what you mean",
    "ikyd": "i know you did",
    "ilu": "i like you",
    "ilu2": "i love you too",
    "ilub": "i love you baby",
    "ilyk": "i'll let you know",
    "ilyl": "i love you lots",
    "ilysm": "i love you so much",
    "ima": "i'm",
    "imma": "i'm gonna",
    "imo": "in my opinion",
    "imy": "i miss you",
    "inb4": "in before",
    "inc": "incoming",
    "indie": "independent",
    "info": "information",
    "init": "initialize",
    "ipo": "initial public offering",
    "ir": "in room",
    "ir8": "irate",
    "irdk": "i really don't know",
    "irl": "in real life",
    "iyo": "in your opinion",
    "iyq": "i like you",
    "j/k": "just kidding",
    "j/p": "just playing",
    "j/w": "just wondering",
    "j2lyk": "just to let you know",
    "j4f": "just for fun",
    "j4g": "just for grins",
    "jas": "just a second",
    "jb/c": "just because",
    "joshing": "joking",
    "k": "ok",
    "k3u": "i love you",
    "kappa": "sarcasm",
    "kek": "korean laugh",
    "keke": "korean laugh",
    "kewl": "cool",
    "kewt": "cute",
    "kfc": "kentucky fried chicken",
    "kgo": "ok, go",
    "kik": "laughing out loud",
    "kinda": "kind of",
    "kk": "ok",
    "kl": "kool",
    "km": "kiss me",
    "kma": "kiss my ass",
    "knp": "ok, no problem",
    "kw": "know",
    "kwl": "cool",
    "l2m": "listening to music",
    "l2p": "learn to play",
    "l33t": "leet",
    "l8": "late",
    "l8er": "later",
    "l8r": "later",
    "la": "laughing a lot",
    "laf": "laugh",
    "laffing": "laughing",
    "lafs": "love at first sight",
    "lam": "leave a message",
    "lamer": "lame person",
    "legit": "legitimate",
    "lemeno": "let me know",
    "lil": "little",
    "lk": "like",
    "llol": "literally laughing out loud",
    "lmho": "laughing my head off",
    "loi": "laughing on the inside",
    "lola": "love often, laugh a lot",
    "lolol": "lots of laugh out louds",
    "lolz": "laugh out louds",
    "ltr": "later",
    "lulz": "lol",
    "luv": "love",
    "luzr": "loser",
    "lv": "love",
    "ly": "love ya",
    "lya": "love you always",
    "lyk": "let you know",
    "lyn": "lying",
    "lysm": "love you so much",
    "m": "male",
    "mcd": "mcdonald's",
    "mcds": "mcdonald's",
    "md@u": "mad at you",
    "me2": "me too",
    "meh": "whatever",
    "mf": "mother fucker",
    "mfb": "mother fucking bitch",
    "mgmt": "management",
    "mid": "middle",
    "mil": "mother-in-law",
    "min": "minute",
    "mins": "minutes",
    "mk": "okay",
    "mkay": "ok",
    "mmk": "ok",
    "mms": "multimedia messaging service",
    "mng": "manage",
    "mngr": "manager",
    "mod": "modification",
    "mofo": "mother fucking",
    "mojo": "attractive talent",
    "moss": "chill",
    "ms": "miss",
    "msg": "message",
    "mtg": "meeting",
    "mth": "month",
    "mu": "miss you",
    "mu@": "meet you at",
    "muah": "kiss",
    "mula": "money",
    "mwa": "kiss",
    "mwah": "kiss",
    "n/m": "nevermind",
    "n/m/h": "nothing much here",
    "n/r": "no reserve",
    "n00b": "newbie",
    "n1": "nice one",
    "n1c": "no one cares",
    "n2m": "not too much",
    "n2mh": "not too much here",
    "n2w": "not to worry",
    "n64": "nintendo 64",
    "n8kd": "naked",
    "nac": "not a chance",
    "nah": "no",
    "nal": "nationality",
    "narc": "tattle tale",
    "nark": "informant",
    "naw": "no",
    "nb": "not bad",
    "nbd": "no big deal",
    "nbjf": "no brag, just fact",
    "nd": "and",
    "ne": "any",
    "ne1": "anyone",
    "ne1er": "anyone here",
    "neh": "no",
    "nemore": "anymore",
    "neva": "never",
    "neway": "anyway",
    "newaze": "anyways",
    "newb": "newbie",
    "nite": "night",
    "nn2r": "no need to reply",
    "nnito": "not necessarily in that order",
    "nnto": "no need to open",
    "nntr": "no need to reply",
    "no1": "no one",
    "noob": "newbie",
    "nooblet": "young newbie",
    "nooblord": "ultimate newbie",
    "notch": "minecraft creator",
    "nottie": "unattractive person",
    "np": "no problem",
    "nub": "newbie",
    "nuff": "enough",
    "nufn": "nothing",
    "num": "tasty",
    "nvm": "nevermind",
    "nvr": "never",
    "nvrm": "nevermind",
    "nw": "no way",
    "nxt": "next",
    "o4u": "only for you",
    "obtw": "oh, by the way",
    "obv": "obviously",
    "obvi": "obviously",
    "oc": "of course",
    "ohemgee": "oh my gosh",
    "oic": "oh, i see",
    "oicn": "oh, i see now",
    "oiy": "hi",
    "omg": "oh my god",
    "onl": "online",
    "onoz": "oh no",
    "orly": "oh really",
    "otay": "okay",
    "otw": "on the way",
    "outta": "out of",
    "ovie": "overlord",
    "ownage": "completely owned",
    "p/d": "per day",
    "p/m": "per month",
    "p/y": "per year",
    "p911": "parent alert!",
    "p@h": "parents at home",
    "pc": "personal computer",
    "pda": "public display of affection",
    "pic": "picture",
    "pj": "poor joke",
    "pl8": "plate",
    "pld": "played",
    "pls": "please",
    "plz": "please",
    "plzrd": "please read",
    "pov": "point of view",
    "ppl": "people",
    "ppp": "peace",
    "prof": "professor",
    "prolly": "probably",
    "promo": "promotion",
    "props": "recognition",
    "prot": "protection",
    "prvt": "private",
    "ps": "postscript",
    "ps2": "playstation 2",
    "ps3": "playstation 3",
    "psa": "public service announcement",
    "psog": "pure stroke of genius",
    "psp": "playstation portable",
    "ptm": "please tell me",
    "pwd": "password",
    "psd": "password",
    "pswd": "password",
    "pwnd": "owned",
    "pwned": "owned",
    "pwnt": "owned",
    "q4u": "question for you",
    "qfe": "quoted for emphasis",
    "qft": "quoted for truth",
    "qq": "quick question",
    "qqn": "looking",
    "qrg": "quick reference guide",
    "qt": "cutie",
    "qtpi": "cutie pie",
    "r": "are",
    "r8": "rate",
    "rdy": "ready",
    "re": "replay",
    "rehi": "hi again",
    "rents": "parents",
    "rep": "reputation",
    "resq": "rescue",
    "rgd": "regard",
    "rgds": "regards",
    "ridic": "ridiculous",
    "rip": "rest in peace",
    "rl": "real life",
    "rlrt": "real life retweet",
    "rly": "really",
    "rm": "room",
    "rn": "run",
    "rnt": "aren't",
    "rof": "laughing",
    "rofl": "laughing",
    "roflmao": "laughing",
    "roflol": "laughing out loud",
    "rolf": "laughing",
    "ru": "are you",
    "ruc": "are you coming?",
    "rut": "are you there?",
    "rx": "prescription",
    "s/o": "sold out",
    "s/u": "shut up",
    "s/w": "software",
    "s2r": "send to receive",
    "s2s": "sorry to say",
    "s2u": "same to you",
    "samzd": "still amazed",
    "sd": "sweet dreams",
    "sec": "second",
    "sho": "sure",
    "sh^": "shut up",
    "siul8r": "see you later",
    "siv": "bad goaltender",
    "sk8": "skate",
    "sk8r": "skater",
    "sly": "still love you",
    "smf": "so much fun",
    "smooch": "kiss",
    "sorta": "sort of",
    "spec": "specialization",
    "spk": "speak",
    "spkr": "speaker",
    "srry": "sorry",
    "srs": "serious",
    "srsly": "seriously",
    "sry": "sorry",
    "stpd": "stupid",
    "str": "strength",
    "str8": "straight",
    "sup": "what's up",
    "syl": "see you later",
    "sync": "synchronize",
    "t2go": "time to go",
    "t2m": "talk to me",
    "t2u": "talk to you",
    "t2ul": "talk to you later",
    "t2ul8er": "talk to you later",
    "t2ul8r": "talk to you later",
    "t4lmk": "thanks for letting me know",
    "t4p": "thanks for posting",
    "t4t": "thanks for trade",
    "tc": "take care",
    "teh": "the",
    "teme": "tell me",
    "tg": "thank goodness",
    "thnq": "thank you",
    "tho": "though",
    "thru": "through",
    "tht": "that",
    "thx": "thanks",
    "tl": "tell",
    "tlk": "talk",
    "tlkin": "talking",
    "tlking": "talking",
    "tomoz": "tomorrow",
    "tq": "thank you",
    "tqvm": "thank you very much",
    "tru": "true",
    "ttl": "talk to you later",
    "ttly": "totally",
    "ttul": "talk to you later",
    "tty": "talk to you",
    "tu": "thank you",
    "tude": "attitude",
    "tx": "thanks",
    "txt": "text",
    "txtin": "texting",
    "ty": "thank you",
    "tyfa": "thank you for asking",
    "tyl": "thank you lord",
    "tym": "thank you much",
    "tyt": "take your time",
    "tyvm": "thank you very much",
    "u": "you",
    "u-ok": "you ok?",
    "u/l": "upload",
    "u2": "you too",
    "u2u": "up to you",
    "uok": "you ok?",
    "ur": "your",
    "ut": "you there?",
    "veggies": "vegetables",
    "vry": "very",
    "vs": "versus",
    "w/": "with",
    "w/b": "welcome back",
    "w/e": "whatever",
    "w/o": "without",
    "w2f": "way too funny",
    "w2g": "way to go",
    "w2k": "windows 2000",
    "w4u": "wait for you",
    "w8": "wait",
    "w84m": "wait for me",
    "w8am": "wait a minute",
    "w8ing": "waiting",
    "w8n": "waiting",
    "wa": "what",
    "waa": "crying",
    "wack": "strange",
    "wan2": "want to",
    "wannabe": "want to be",
    "wat": "what",
    "watev": "whatever",
    "watevs": "whatever",
    "wlcm": "welcome",
    "wha": "what",
    "whipped": "tired",
    "wht": "what",
    "wk": "week",
    "wknd": "weekend",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wup": "what's up?",
    "ya": "yes",
    "yeap": "yes",
    "yep": "yes",
    "yepperz": "yes",
    "yesh": "yes",
    "yo": "hi",
    "yr": "your",
    "yrs": "years",
    "yt": "you there?",
    "yt?": "you there?",
    "yup": "yes",
    "yupz": "ok",
    "zzz": "sleeping",
    }
    word = word.lower()
    word = word.split()
    for i in range(len(word)):
        word[i] = slangs_dict.get(word[i], word[i])
    word = " ".join(word)
    return word


def replace_word(word):
    switcher = {
        "couldn't": "could not",
        "couldn": "could not",
        "won't": "will not",
        "won": "will not",
        "mustn't": "must not",
        "mustn": "must not",
        "that'll": "that will",
        "shouldn't": "should not",
        "shouldn": "should not",
        "should've": "should have",
        "haven't": "have not",
        "haven": "have not",
        "hadn't": "have not",
        "hadn": "have not",
        "hasn't": "have not",
        "hasn": "have not",
        "didn't": "do not",
        "didn": "do not",
        "doesn't": "do not",
        "doesn": "do not",
        "don't": "do not",
        "don": "do not", 
        "isn't": "be not",
        "you'd":"you would",
        "you've":"you have",
        "you're":"you are",
        "you'll":"you will",
        "she's":"she is",
        "she'd":"she would",
        "she'll":"she will",
        "he's":"he is",
        "he'd":"he would",
        "he'll":"he will",
        "it's":"it is",
        "it'd":"it would",
        "it'll":"it will",
        "aren't":"are not",
        "aren":"are not",
        "weren't":"were not",
        "weren":"were not",
        "wouldn't":"would not",
        "wouldn":"would not",
        "needn't":"need not",
        "needn":"need not",
        "wasn't":"was not",
        "wasn":"was not",
        "mightn't":"might not",
        "mightn":"might not",
        "shan't":"shall not",
        "shan":"shall not", 
        "can't":"can not",
        "i'm":"i am",
        "i'd":"i would",
        "i'll":"i will",
        "i've":"i have",
        "we're":"we are",
        "we'd":"we would",
        "we'll":"we will",
        "we've":"we have",
        "they're":"they are",
        "they'd":"they would",
        "they'll":"they will",
        "they've":"they have",
        "let's":"let us",
        "how's":"how is",
        "here's":"here is",
        "what's":"what is",
        "there's":"there is",
        "0":"zero",
        "1":"one",
        "2":"two",
        "3":"three",
        "4":"four",
        "5":"five",
        "6":"six",
        "7":"oseven",
        "8":"eight",
        "9":"nine",
        "10":"ten",
    }
    word = word.lower()
    word = word.split()
    for i in range(len(word)):
        word[i] = switcher.get(word[i], word[i])
    word = " ".join(word)
    return word

def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    text = replace_word(text)
    text = normaliser_word(text)
    #stops = set(stopwords.words("english"))

    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in STOP_WORDS]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Correct words
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    return text

In [12]:
transformText("I have so 2 servic going groop")

'service go group'

In [13]:
df['text'] = df['text'].map(transformText)
texts= df['text']
tags= df['label']


In [14]:
df=df[texts,tags]

TypeError: '(0       have order data cable get well finish work pro...
1                                              love phone
2                                 get well finish product
3                                          not be happier
4                  be look headset long time now have get
5                            headset start ring no reason
6                display be excellent camera be good year
7                              battery life be also great
8               worst phone have ever have have few month
9       not good item work start have problem auto rev...
10       be embarrass also ear hurt try push ear plug ear
11                                     protect phone side
12      be average phone bad battery life operate weak...
13           clear skye call long battery life long range
14                                   solo technology suck
15                                 great hand free device
16      can even take self portrait outside exterior d...
17                       same problem others have mention
18      try many many handsfree gadget be one finally ...
19                                           magical help
20      worst piece of crap ever along version custome...
21                                     poor sound quality
22                                      best phone market
23                                              work well
24                company ship product promptly work well
25                                           exactly want
26      picture resolution be far comparably price pho...
27                                          be great deal
28                     excellent product satisfy purchase
29             highly recommend encourage people give try
                              ...                        
4025                  flair bartender be absolutely amaze
4026                 freeze margarita be way sugary taste
4027                                  be good order twice
4028    nutshell 1 restaurant smell like combination o...
4029                            girlfriend be veal be bad
4030                            unfortunately be not good
4031                       have pretty satisfy experience
4032                join club get awesome offer via email
4033    perfect someone like beer ice cold case even c...
4034    bland flavourless be good way of describe bare...
4035                    chain no fan of beat place easily
4036                                        nacho be have
4037                                     not be come back
4038    do nothave many word say place do everything p...
4039    staff be super nice quick even crazy crowd of ...
4040               great atmosphere friendly fast service
4041       receive pity be huge do have lot of meat thumb
4042                                       food arrive be
4043    pay 7 85 hot dog fry look like come of kid be ...
4044              classic maine lobster roll be fantastic
4045    brother law work mall eat same day guess be si...
4046                      good go have review place twice
4047             chip salsa be really good salsa be fresh
4048                                       place be great
4049                                        mediocre food
4050                          get inside be impress place
4051                            service be super friendly
4052                     be sad little vegetable overcook
4053                               place be nice surprise
4054                              live music totally blow
Name: text, Length: 4055, dtype: object, 0       NOTISSUE
1       NOTISSUE
2       NOTISSUE
3       NOTISSUE
4       NOTISSUE
5          ISSUE
6       NOTISSUE
7       NOTISSUE
8          ISSUE
9          ISSUE
10         ISSUE
11      NOTISSUE
12         ISSUE
13      NOTISSUE
14         ISSUE
15      NOTISSUE
16      NOTISSUE
17         ISSUE
18      NOTISSUE
19      NOTISSUE
20         ISSUE
21         ISSUE
22      NOTISSUE
23      NOTISSUE
24      NOTISSUE
25      NOTISSUE
26         ISSUE
27      NOTISSUE
28      NOTISSUE
29      NOTISSUE
          ...   
4025    NOTISSUE
4026       ISSUE
4027    NOTISSUE
4028       ISSUE
4029       ISSUE
4030       ISSUE
4031    NOTISSUE
4032    NOTISSUE
4033    NOTISSUE
4034       ISSUE
4035       ISSUE
4036    NOTISSUE
4037       ISSUE
4038    NOTISSUE
4039    NOTISSUE
4040    NOTISSUE
4041    NOTISSUE
4042       ISSUE
4043       ISSUE
4044    NOTISSUE
4045       ISSUE
4046    NOTISSUE
4047    NOTISSUE
4048    NOTISSUE
4049       ISSUE
4050    NOTISSUE
4051    NOTISSUE
4052       ISSUE
4053    NOTISSUE
4054    NOTISSUE
Name: label, Length: 4055, dtype: object)' is an invalid key

In [18]:
# dictionary of lists  
dict = {'text': texts, 'label': tags } 
     
df = pd.DataFrame(dict) 
  
# saving the dataframe 
df.to_csv('DATA2_preprocessing.csv')

In [19]:
####

In [13]:
DATA_FILE = 'DATA2_preprocessing.csv'
df = pd.read_csv(DATA_FILE,delimiter=',',encoding='UTF-8')

In [14]:
msk = np.random.rand(len(df)) < 0.7 # Splitting into train(70%) and test(30%) randomly

In [15]:
train_df=df[msk]
test_df=df[~msk]

In [16]:
print(train_df.shape)
print(test_df.shape)

(2857, 3)
(1198, 3)


In [17]:
print('Checking target values for train data:\n')
print(train_df['label'].value_counts(),'\n')
print('Checking target values for test data:\n')
print(test_df['label'].value_counts())

Checking target values for train data:

ISSUE       1439
NOTISSUE    1418
Name: label, dtype: int64 

Checking target values for test data:

NOTISSUE    612
ISSUE       586
Name: label, dtype: int64


In [18]:
train_df['label'].value_counts()


ISSUE       1439
NOTISSUE    1418
Name: label, dtype: int64

In [19]:
x_train=train_df['text']
y_train=train_df['label']
x_test=test_df['text']
y_test=test_df['label']

### GLOVE EMBEDDING 300

In [20]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 200
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [21]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [22]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

In [23]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")

In [24]:
print('Shape of data tensor:', data.shape)
print('Shape of test_data tensor:', test_data.shape)

Shape of data tensor: (2857, 200)
Shape of test_data tensor: (1198, 200)


In [25]:
import pickle
file_tok = 'LSTM_token_glove_300d_DATA_wit_text_processing.sav'
pickle.dump(tokenizer, open(file_tok, 'wb'))

In [26]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 3170 unique tokens.


In [27]:
#pad sequences are used to bring all sentences to same size.
# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (2857, 200)
Shape of data test tensor: (1198, 200)


In [28]:
y_train = y_train.map({"NOTISSUE": 1, "ISSUE" : 0 })
y_test = y_test.map({"NOTISSUE": 1, "ISSUE" : 0 })

In [29]:
# load the whole embedding into memory
embeddings_index = {}
f = open('C:/Users/khmar/Desktop/GLOVE/glove.6B.300d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [30]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
#Found 1489 unique tokens : word_index
#print(len(word_index) + 1) #### 1490 
#print(word_index.items()) ### unique tokens : words :word_index
#print('embedding_matrix',embedding_matrix)
out_of_vocab={}
for word, i in word_index.items():
    #print(i)
    embedding_vector = embeddings_index.get(word)
    #print('embedding_vector' ,embedding_vector )
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        #print('word: ', word)
        embedding_matrix[i] = embedding_vector
        #print('embedding_matrix :',embedding_matrix[i] )
        # words not found in embedding index will be all-zeros.
    if embedding_vector is  None:
        print('word not found :',word)
        out_of_vocab[i] = word
            

word not found : nothave
word not found : notwaste
word not found : notknow
word not found : overprice
word not found : notrecommend
word not found : flavourful
word not found : notbother
word not found : notexpect
word not found : notalways
word not found : notunderstand
word not found : notenjoy
word not found : flavourless
word not found : notwait
word not found : notseem
word not found : notsave
word not found : appal
word not found : notbelieve
word not found : notwant
word not found : nothelp
word not found : notskimp
word not found : shawarrrrrrma
word not found : notdisappoint
word not found : 99900
word not found : barset
word not found : plantronincs
word not found : 3715
word not found : notproduce
word not found : flipphones
word not found : notupload
word not found : a325
word not found : notslide
word not found : tracfonewebsite
word not found : 5320
word not found : supertooth
word not found : 3265
word not found : 8125
word not found : nottrust
word not found : 8525
wor

In [31]:
f'There are {len(out_of_vocab)} out of vocab '

'There are 80 out of vocab '

In [49]:
f'There are {len(out_of_vocab)} out of vocab '

'There are 76 out of vocab '

In [32]:
model_glove= Sequential()
model_glove.add(Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model_glove.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model_glove.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [33]:
model_glove.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [34]:
model_glove.fit(data, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(test_data, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 2857 samples, validate on 1198 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x156f5b3eb38>

In [53]:
scores = model_glove.evaluate(x_test, y_test,
                            batch_size=batch_size)
print("%s: %.2f%%" % (model_glove.metrics_names[0], scores[0] * 100))
print("%s: %.2f%%" % (model_glove.metrics_names[1], scores[1] * 100))

loss: 40.89%
acc: 81.41%


In [47]:
# Save the model
model_glove.save('LSTM_model_glove_300_DATA_with_text_processing.sav')