In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
#
import matplotlib.pyplot as plt
import seaborn as sns
# Plot defaults for the whole notebook: dark grid background, fonts scaled by 1.3.
sns.set(style="darkgrid", font_scale=1.3)
print("import done")

Using TensorFlow backend.


import done


In [2]:
import os

# Absolute path of the raw dataset, resolved from the current working directory.
DATA_FILE = os.path.abspath('DATA/DATA_not_ameliorate.csv')
# The raw file is semicolon-separated and UTF-8 encoded.
df = pd.read_csv(DATA_FILE, sep=';', encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                             I couldn't be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [3]:
df['label'].value_counts()  # class-balance check: rows per label value


NOTISSUE    2028
ISSUE       2027
Name: label, dtype: int64

In [4]:
df.shape  # (number of rows, number of columns) of the raw dataset

(4055, 2)

### Preprocessing the Data

In [5]:
# Stop words dropped by remove_stop_words(); matching is done on whole tokens.
#
# Deliberately NOT treated as stop words (they were commented out of the list):
#   am, became, become, becomes, becoming, been, behind, being, call, do, does,
#   doing, everything, had, has, have, is, made, make, more, say, see, seem,
#   seemed, seeming, seems, show, take, toward, towards, used, was
#
# NOTE(review): 'cannot' is listed, and appos_dict rewrites "can't" to "cannot",
# so that negation disappears once stop words are removed -- confirm this is intended.
stop_words_list = {
    # a
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'against', 'all',
    'almost', 'alone', 'along', 'already', 'also', 'although', 'always',
    'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow',
    'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at',
    # b
    'because', 'before', 'beforehand', 'below', 'beside', 'besides', 'between',
    'beyond', 'both', 'bottom', 'but', 'by',
    # c - d
    'ca', 'can', 'cannot', 'could', 'did', 'done', 'down', 'due', 'during',
    # e
    'each', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even',
    'ever', 'every', 'everyone', 'everywhere', 'except',
    # f
    'few', 'first', 'for', 'former', 'formerly', 'from', 'front', 'full',
    'further',
    # h
    'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred',
    # i - l
    'i', 'if', 'in', 'indeed', 'into', 'it', 'its', 'itself', 'just', 'last',
    'latter', 'latterly', 'least', 'less',
    # m
    'many', 'may', 'me', 'meanwhile', 'might', 'mine', 'moreover', 'most',
    'mostly', 'move', 'much', 'must', 'my', 'myself',
    # n
    'name', 'namely', 'neither', 'nevertheless', 'next', 'noone', 'nor', 'now',
    'nowhere',
    # o
    'of', 'off', 'often', 'on', 'once', 'only', 'onto', 'or', 'other', 'others',
    'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
    # p - r
    'part', 'per', 'perhaps', 'put', 'quite', 'rather', 're', 'really',
    'regarding',
    # s
    'same', 'serious', 'several', 'she', 'should', 'side', 'since', 'so',
    'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes',
    'somewhere', 'still', 'such',
    # t
    'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence',
    'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon',
    'these', 'they', 'third', 'this', 'those', 'though', 'three', 'through',
    'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'twelve',
    # u - v
    'under', 'unless', 'until', 'up', 'upon', 'us', 'using', 'various', 'very',
    'via',
    # w
    'we', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever',
    'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
    'whose', 'why', 'will', 'with', 'within', 'without', 'would',
    # y
    'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
}

In [6]:
# Contraction -> expanded form, used by appos_look_up().
# Keys are lower-case because the lookup lower-cases each token first; the
# apostrophe-less spellings ("dont", "cant", ...) cover sloppy typing.
appos_dict = {
    "can't": "cannot",
    "cant": "cannot",
    "aren't": "are not",
    "arent": "are not",
    "couldn't": "could not",
    "couldnt": "could not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "im": "I am",
    "isn't": "is not",
    "isnt": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mightnt": "might not",
    "mustn't": "must not",
    "mustnt": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "that's": "that is",
    "thats": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wont": "will not",
    "wouldn't": "would not",
    "wouldnt": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "wasn't": "was not",
    "wasnt": "was not",
    # Was "will", which silently dropped the pronoun.
    "we'll": "we will",
    "didn't": "did not",
    "didnt": "did not"
}

In [89]:
# Chat/SMS abbreviation -> expansion, used by abbreviation_look_up().
# Keys are lower-case because the lookup lower-cases each whitespace-separated
# token before consulting this table. Expansions may themselves contain
# contractions ("don't know"); transformText() expands those afterwards.
abbreviation_dict= {
    'awsm': 'awesome',
    "aamof": "as a matter of fact",
    "abt": "about",
    "abt2": "about to",
    "ac": "air conditioning",
    "ace": "solo winner",
    "ack": "acknowledged",
    "admin": "administrator",
    "thr": "there",
    "frm": "from",
    "aggro": "aggression",
    "agl": "angel",
    "dob": "date of birth",
    "ai": "artificial intelligence",
    "aiic": "as if i care",
    "aka": "also known as",
    "alap": "as long as possible",
    "alol": "actually laughing out loud",
    "ama": "ask me anything",
    "amap": "as much as possible",
    "amazn": "amazing",
    "ammo": "ammunition",
    "ams": "ask me something",
    "anon": "anonymous",
    "asap": "as soon as possible",
    "asat": "as simple as that",
    "awks": "awkward",
    "awl": "always with love",
    "ayk": "as you know",
    "azm": "awesome",
    "b": "be",
    "b&w": "black and white",
    "b-day": "birthday",
    "bday": "birthday",
    "bcoz": "because",
    "bcos": "because",
    "bcz": "because",
    "bf": "boyfriend",
    # Was "between", which is the meaning of "btwn" (listed further down).
    "btw": "by the way",
    "b4": "before",
    "bai": "bye",
    "bb": "bye bye",
    "bc": "abuse",
    "mc": "abuse",
    "bcc": "blind carbon copy",
    "bff": "best friends forever",
    "biz": "business",
    "bk": "back",
    "bo": "back off",
    "bro": "brother",
    "btwn": "between",
    "c": "see",
    "char": "character",
    "combo": "combination",
    "cu": "see you",
    "cu2": "see you too",
    "cu2mr": "see you tomorrow",
    "cya": "see ya",
    "cyal8r": "see you later",
    "cyb": "call you back",
    "cye": "check your e-mail",
    "cyf": "check your facebook",
    "cyfb": "check your facebook",
    "cyl": "catch ya later",
    "cym": "check your myspace",
    "cyo": "see you online",
    "d8": "date",
    "da": "the",
    "dece": "decent",
    "ded": "dead",
    "dept": "department",
    "dis": "this",
    "ditto": "same",
    "diva": "rude woman",
    "dk": "don't know",
    "dlm": "don't leave me",
    "dm": "direct message",
    "dnd": "do not disturb",
    "dno": "don't know",
    "dnt": "don't",
    "e1": "everyone",
    "eg": "for example",
    "emc2": "genius",
    "emo": "emotional",
    "enuf": "enough",
    "eod": "end of discussion",
    "eof": "end of file",
    "eom": "end of message",
    "eta": "estimated time of arrival",
    "every1": "everyone",
    "evs": "whatever",
    "exp": "experience",
    "f": "female",
    "f2f": "face to face",
    "f2p": "free to play",
    "f2t": "free to talk",
    "f9": "fine",
    "fab": "fabulous",
    "fail": "failure",
    "faq": "frequently asked questions",
    "fav": "favorite",
    "fave": "favorite",
    "favs": "favorites",
    "fb": "facebook",
    "fbc": "facebook chat",
    "fbf": "facebook friend",
    "fml": "family",
    "fn": "fine",
    "fo": "freaking out",
    "fri": "friday",
    "frnd": "friend",
    "fu": "fuck you",
    "fugly": "fucking ugly",
    "gf": "girlfriend",
    "g1": "good one",
    "g2b": "going to bed",
    "g2cu": "good to see you",
    "g2g": "good to go",
    "g4i": "go for it",
    "g4n": "good for nothing",
    "g4u": "good for you",
    "g9": "goodnight",
    "ga": "go ahead",
    "ge": "good evening",
    "gl": "good luck",
    "gm": "good morning",
    "gn": "goodnight",
    "gonna": "going to",
    "goon": "idiot",
    "gorge": "gorgeous",
    "gr8": "great",
    "grats": "congratulations",
    "gratz": "congratulations",
    "grl": "girl",
    "gt2t": "got time to talk",
    "gtg": "good to go",
    "gud": "good",
    "gv": "give",
    "gvn": "given",
    "gw": "good work",
    "h/o": "hold on",
    "h/p": "hold please",
    "h/t": "hat tip",
    "h/u": "hook up",
    "h2cus": "hope to see you soon",
    "h4u": "hot for you",
    "h4x0r": "hacker",
    "h4x0rz": "hackers",
    "h8": "hate",
    "h8r": "hater",
    "h8t": "hate",
    "ha": "hello again",
    "haha": "laughing",
    "hai": "hi",
    "hak": "hugs and kisses",
    "han": "how about now?",
    "hav": "have",
    "hax": "hacks",
    "haxor": "hacker",
    "hay": "how are you",
    "hb2u": "happy birthday to you",
    "hbbd": "happy belated birthday",
    "hbd": "happy birthday",
    "hc": "how cool",
    "hcit": "how cool is that",
    "hehe": "laughing",
    "hf": "have fun",
    "hi5": "high five",
    "hig": "how's it going?",
    "hih": "hope it helps",
    "ho": "hold on",
    "hoc": "house of cards",
    "hof": "hall of fame",
    "holla": "holler",
    "hom": "hit or miss",
    "hood": "neighborhood",
    "hoops": "basketball",
    "hottie": "attractive person",
    "hr": "human resources",
    "hru": "how are you",
    "hry": "hurry",
    "hubby": "husband",
    "hwk": "homework",
    "hwp": "height weight proportionate",
    "hwu": "hey, what's up?",
    "hxc": "hardcore",
    "h^": "hook up",
    "i8": "i ate",
    "i8u": "i hate you",
    "ia": "i agree",
    "iab": "in a bit",
    "iac": "in any case",
    "iad": "it all depends",
    "iae": "in any event",
    "iag": "it's all good",
    "iagw": "in a good way",
    "iail": "i am in love",
    "iam": "in a minute",
    "ic": "i see",
    "id10t": "idiot",
    "idc": "i don't care",
    "idd": "indeed",
    "idi": "i doubt it",
    "idk": "i don't know",
    "idky": "i don't know why",
    "idmb": "i'll do my best",
    "idn": "i don't know",
    "idnk": "i do not know",
    "idr": "i don't remember",
    "idt": "i don't think",
    "idts": "i don't think so",
    "idtt": "i'll drink to that",
    "idu": "i don't understand",
    "ie": "that is",
    "ig2p": "i got to pee",
    "iggy": "ignored",
    "ight": "alright",
    "igi": "i get it",
    "ign": "in-game name",
    "igtp": "i get the point",
    "ih8u": "i hate you",
    "ihu": "i hate you",
    "ihy": "i hate you",
    "ii": "i'm impressed",
    "iiok": "if i only knew",
    "iir": "if i remember",
    "iirc": "if i remember correctly",
    "iit": "i'm impressed too",
    "iiuc": "if i understand correctly",
    "ik": "i know",
    "ikhyf": "i know how you feel",
    "ikr": "i know, right?",
    "ikwum": "i know what you mean",
    "ikwym": "i know what you mean",
    "ikyd": "i know you did",
    "ilu": "i like you",
    "ilu2": "i love you too",
    "ilub": "i love you baby",
    "ilyk": "i'll let you know",
    "ilyl": "i love you lots",
    "ilysm": "i love you so much",
    "ima": "i'm",
    "imma": "i'm gonna",
    "imo": "in my opinion",
    "imy": "i miss you",
    "inb4": "in before",
    "inc": "incoming",
    "indie": "independent",
    "info": "information",
    "init": "initialize",
    "ipo": "initial public offering",
    "ir": "in room",
    "ir8": "irate",
    "irdk": "i really don't know",
    "irl": "in real life",
    "iyo": "in your opinion",
    "iyq": "i like you",
    "j/k": "just kidding",
    "j/p": "just playing",
    "j/w": "just wondering",
    "j2lyk": "just to let you know",
    "j4f": "just for fun",
    "j4g": "just for grins",
    "jas": "just a second",
    "jb/c": "just because",
    "joshing": "joking",
    "k": "ok",
    "k3u": "i love you",
    "kappa": "sarcasm",
    "kek": "korean laugh",
    "keke": "korean laugh",
    "kewl": "cool",
    "kewt": "cute",
    "kfc": "kentucky fried chicken",
    "kgo": "ok, go",
    "kik": "laughing out loud",
    "kinda": "kind of",
    "kk": "ok",
    "kl": "kool",
    "km": "kiss me",
    "kma": "kiss my ass",
    "knp": "ok, no problem",
    "kw": "know",
    "kwl": "cool",
    "l2m": "listening to music",
    "l2p": "learn to play",
    "l33t": "leet",
    "l8": "late",
    "l8er": "later",
    "l8r": "later",
    "la": "laughing a lot",
    "laf": "laugh",
    "laffing": "laughing",
    "lafs": "love at first sight",
    "lam": "leave a message",
    "lamer": "lame person",
    "legit": "legitimate",
    "lemeno": "let me know",
    "lil": "little",
    "lk": "like",
    "llol": "literally laughing out loud",
    "lmho": "laughing my head off",
    "loi": "laughing on the inside",
    "lola": "love often, laugh a lot",
    "lolol": "lots of laugh out louds",
    "lolz": "laugh out louds",
    "ltr": "later",
    "lulz": "lol",
    "luv": "love",
    "luzr": "loser",
    "lv": "love",
    "ly": "love ya",
    "lya": "love you always",
    "lyk": "let you know",
    "lyn": "lying",
    "lysm": "love you so much",
    "m": "male",
    "mcd": "mcdonald's",
    "mcds": "mcdonald's",
    "md@u": "mad at you",
    "me2": "me too",
    "meh": "whatever",
    "mf": "mother fucker",
    "mfb": "mother fucking bitch",
    "mgmt": "management",
    "mid": "middle",
    "mil": "mother-in-law",
    "min": "minute",
    "mins": "minutes",
    "mk": "okay",
    "mkay": "ok",
    "mmk": "ok",
    "mms": "multimedia messaging service",
    "mng": "manage",
    "mngr": "manager",
    "mod": "modification",
    "mofo": "mother fucking",
    "mojo": "attractive talent",
    "moss": "chill",
    "ms": "miss",
    "msg": "message",
    "mtg": "meeting",
    "mth": "month",
    "mu": "miss you",
    "mu@": "meet you at",
    "muah": "kiss",
    "mula": "money",
    "mwa": "kiss",
    "mwah": "kiss",
    "n/m": "nevermind",
    "n/m/h": "nothing much here",
    "n/r": "no reserve",
    "n00b": "newbie",
    "n1": "nice one",
    "n1c": "no one cares",
    "n2m": "not too much",
    "n2mh": "not too much here",
    "n2w": "not to worry",
    "n64": "nintendo 64",
    "n8kd": "naked",
    "nac": "not a chance",
    "nah": "no",
    "nal": "nationality",
    "narc": "tattle tale",
    "nark": "informant",
    "naw": "no",
    "nb": "not bad",
    "nbd": "no big deal",
    "nbjf": "no brag, just fact",
    "nd": "and",
    "ne": "any",
    "ne1": "anyone",
    "ne1er": "anyone here",
    "neh": "no",
    "nemore": "anymore",
    "neva": "never",
    "neway": "anyway",
    "newaze": "anyways",
    "newb": "newbie",
    "nite": "night",
    "nn2r": "no need to reply",
    "nnito": "not necessarily in that order",
    "nnto": "no need to open",
    "nntr": "no need to reply",
    "no1": "no one",
    "noob": "newbie",
    "nooblet": "young newbie",
    "nooblord": "ultimate newbie",
    "notch": "minecraft creator",
    "nottie": "unattractive person",
    "np": "no problem",
    "nub": "newbie",
    "nuff": "enough",
    "nufn": "nothing",
    "num": "tasty",
    "nvm": "nevermind",
    "nvr": "never",
    "nvrm": "nevermind",
    "nw": "no way",
    "nxt": "next",
    "o4u": "only for you",
    "obtw": "oh, by the way",
    "obv": "obviously",
    "obvi": "obviously",
    "oc": "of course",
    "ohemgee": "oh my gosh",
    "oic": "oh, i see",
    "oicn": "oh, i see now",
    "oiy": "hi",
    "omg": "oh my god",
    "onl": "online",
    "onoz": "oh no",
    "orly": "oh really",
    "otay": "okay",
    "otw": "on the way",
    "outta": "out of",
    "ovie": "overlord",
    "ownage": "completely owned",
    "p/d": "per day",
    "p/m": "per month",
    "p/y": "per year",
    "p911": "parent alert!",
    "p@h": "parents at home",
    "pc": "personal computer",
    "pda": "public display of affection",
    "pic": "picture",
    "pj": "poor joke",
    "pl8": "plate",
    "pld": "played",
    "pls": "please",
    "plz": "please",
    "plzrd": "please read",
    "pov": "point of view",
    "ppl": "people",
    "ppp": "peace",
    "prof": "professor",
    "prolly": "probably",
    "promo": "promotion",
    "props": "recognition",
    "prot": "protection",
    "prvt": "private",
    "ps": "postscript",
    "ps2": "playstation 2",
    "ps3": "playstation 3",
    "psa": "public service announcement",
    "psog": "pure stroke of genius",
    "psp": "playstation portable",
    "ptm": "please tell me",
    "pwd": "password",
    "psd": "password",
    "pswd": "password",
    "pwnd": "owned",
    "pwned": "owned",
    "pwnt": "owned",
    "q4u": "question for you",
    "qfe": "quoted for emphasis",
    "qft": "quoted for truth",
    "qq": "quick question",
    "qqn": "looking",
    "qrg": "quick reference guide",
    "qt": "cutie",
    "qtpi": "cutie pie",
    "r": "are",
    "r8": "rate",
    "rdy": "ready",
    "re": "replay",
    "rehi": "hi again",
    "rents": "parents",
    "rep": "reputation",
    "resq": "rescue",
    "rgd": "regard",
    "rgds": "regards",
    "ridic": "ridiculous",
    "rip": "rest in peace",
    "rl": "real life",
    "rlrt": "real life retweet",
    "rly": "really",
    "rm": "room",
    "rn": "run",
    "rnt": "aren't",
    "rof": "laughing",
    "rofl": "laughing",
    "roflmao": "laughing",
    "roflol": "laughing out loud",
    "rolf": "laughing",
    "ru": "are you",
    "ruc": "are you coming?",
    "rut": "are you there?",
    "rx": "prescription",
    "s/o": "sold out",
    "s/u": "shut up",
    "s/w": "software",
    "s2r": "send to receive",
    "s2s": "sorry to say",
    "s2u": "same to you",
    "samzd": "still amazed",
    "sd": "sweet dreams",
    "sec": "second",
    "sho": "sure",
    "sh^": "shut up",
    "siul8r": "see you later",
    "siv": "bad goaltender",
    "sk8": "skate",
    "sk8r": "skater",
    "sly": "still love you",
    "smf": "so much fun",
    "smooch": "kiss",
    "sorta": "sort of",
    "spec": "specialization",
    "spk": "speak",
    "spkr": "speaker",
    "srry": "sorry",
    "srs": "serious",
    "srsly": "seriously",
    "sry": "sorry",
    "stpd": "stupid",
    "str": "strength",
    "str8": "straight",
    "sup": "what's up",
    "syl": "see you later",
    "sync": "synchronize",
    "t2go": "time to go",
    "t2m": "talk to me",
    "t2u": "talk to you",
    "t2ul": "talk to you later",
    "t2ul8er": "talk to you later",
    "t2ul8r": "talk to you later",
    "t4lmk": "thanks for letting me know",
    "t4p": "thanks for posting",
    "t4t": "thanks for trade",
    "tc": "take care",
    "teh": "the",
    "teme": "tell me",
    "tg": "thank goodness",
    "thnq": "thank you",
    "tho": "though",
    "thru": "through",
    "tht": "that",
    "thx": "thanks",
    "tl": "tell",
    "tlk": "talk",
    "tlkin": "talking",
    "tlking": "talking",
    "tomoz": "tomorrow",
    "tq": "thank you",
    "tqvm": "thank you very much",
    "tru": "true",
    "ttl": "talk to you later",
    "ttly": "totally",
    "ttul": "talk to you later",
    "tty": "talk to you",
    "tu": "thank you",
    "tude": "attitude",
    "tx": "thanks",
    "txt": "text",
    "txtin": "texting",
    "ty": "thank you",
    "tyfa": "thank you for asking",
    "tyl": "thank you lord",
    "tym": "thank you much",
    "tyt": "take your time",
    "tyvm": "thank you very much",
    "u": "you",
    "u-ok": "you ok?",
    "u/l": "upload",
    "u2": "you too",
    "u2u": "up to you",
    "uok": "you ok?",
    "ur": "your",
    "ut": "you there?",
    "veggies": "vegetables",
    "vry": "very",
    "vs": "versus",
    "w/": "with",
    "w/b": "welcome back",
    "w/e": "whatever",
    "w/o": "without",
    "w2f": "way too funny",
    "w2g": "way to go",
    "w2k": "windows 2000",
    "w4u": "wait for you",
    "w8": "wait",
    "w84m": "wait for me",
    "w8am": "wait a minute",
    "w8ing": "waiting",
    "w8n": "waiting",
    "wa": "what",
    "waa": "crying",
    "wack": "strange",
    "wan2": "want to",
    "wannabe": "want to be",
    "wat": "what",
    "watev": "whatever",
    "watevs": "whatever",
    "wlcm": "welcome",
    "wha": "what",
    "whipped": "tired",
    "wht": "what",
    "wk": "week",
    "wknd": "weekend",
    "wtf": "what the fuck",
    "wtg": "way to go",
    "wup": "what's up?",
    "ya": "yes",
    "yeap": "yes",
    "yep": "yes",
    "yepperz": "yes",
    "yesh": "yes",
    "yo": "hi",
    "yr": "your",
    "yrs": "years",
    "yt": "you there?",
    "yt?": "you there?",
    "yup": "yes",
    "yupz": "ok",
    "zzz": "sleeping",
}

In [90]:
import nltk
import inflect
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import parsing
#from gensim.parsing.preprocessing import split_alphanum 0==>99
from spellchecker import SpellChecker
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import re

In [91]:
# Module-level stemmers, built once; stem_text() picks one of the two.
# ignore_stopwords=True is handed to NLTK's SnowballStemmer as-is.
sb_stem = SnowballStemmer("english", ignore_stopwords=True)
pt_stem = PorterStemmer()
# NOTE(review): `lmtzr` is not referenced anywhere else in the visible notebook;
# correct_word() creates its own WordNetLemmatizer.
lmtzr = WordNetLemmatizer()

def replace_numbers(word):
    """
    Replace standalone integer tokens with their English wording
    Example: I have 20 cables => I have twenty cables
    Args:
        word (str): text to process (a whole sentence is accepted, despite the name)
    Returns:
        (str): the whitespace-separated tokens joined with single spaces, with
            every token made only of ASCII digits spelled out
    """
    tokens = word.split()
    engine = None  # created on first use: most texts contain no number at all
    for position, token in enumerate(tokens):
        # ASCII digits only. str.isdigit() also accepts characters such as a
        # superscript two, which are not plain integers.
        if re.fullmatch(r"[0-9]+", token):
            if engine is None:
                engine = inflect.engine()
            tokens[position] = engine.number_to_words(token)
    return " ".join(tokens)



"""  Fixing Word Lengthening
##https://rustyonrampa"ge.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html"""
def reduce_lengthening(text):
    """
    Shorten elongated words: any run of three or more identical characters
    is cut down to exactly two
    Example: soooooo goood => soo good
    Args:
        text (str): text
    Returns:
        (str): text with the long character runs shortened
    """
    return re.sub(r"(.)\1{2,}", r"\1\1", text)

def abbreviation_look_up(text, abbreviations=None):
    """
    Replace abbreviation word in text to their original form
    Example: gr8 phone, thx! => great phone, thanks!
    Args:
        text (str): text
        abbreviations (dict, optional): lower-case abbreviation -> expansion
            table; defaults to the module-level ``abbreviation_dict``
    Returns:
        slanged (str): cleaned text with replaced slang
    """
    table = abbreviation_dict if abbreviations is None else abbreviations
    edge_punctuation = ".,!?;:\"'()[]"
    new_text = []

    for word in text.split():
        word_s = word.lower()
        if word_s in table:
            # Exact match first, so keys that contain punctuation themselves
            # ("w/", "yt?") keep working.
            new_text.append(table[word_s])
            continue
        # Otherwise retry without punctuation glued to the token ("thx," / "(gr8)")
        # and put that punctuation back around the expansion.
        core = word_s.strip(edge_punctuation)
        if core in table:
            lead = word_s[:len(word_s) - len(word_s.lstrip(edge_punctuation))]
            tail = word_s[len(word_s.rstrip(edge_punctuation)):]
            new_text.append(lead + table[core] + tail)
        else:
            new_text.append(word)
    slanged = " ".join(new_text)
    return slanged

def appos_look_up(text, appos=None):
    """
    Convert apostrophes word to original form
    Example: I don't know what is going on?  => I do not know what is going on?
    Args:
        text (str): text
        appos (dict, optional): lower-case contraction -> expansion table;
            defaults to the module-level ``appos_dict``
    Returns:
        apposed (str) : text with converted apostrophes
    """
    table = appos_dict if appos is None else appos
    edge_punctuation = ".,!?;:\"'()[]"
    new_text = []
    for word in text.split():
        # Typographic apostrophes are normalised for the lookup only, so that
        # "don\u2019t" is recognised like "don't".
        word_s = word.lower().replace("\u2019", "'").replace("\u2018", "'")
        if word_s in table:
            new_text.append(table[word_s])
            continue
        # Retry without punctuation glued to the token ("don't." / "can't,");
        # otherwise the contraction stays unexpanded and its negation is lost
        # once punctuation is stripped later on.
        core = word_s.strip(edge_punctuation)
        if core in table:
            lead = word_s[:len(word_s) - len(word_s.lstrip(edge_punctuation))]
            tail = word_s[len(word_s.rstrip(edge_punctuation)):]
            new_text.append(lead + table[core] + tail)
        else:
            new_text.append(word)
    apposed = " ".join(new_text)
    return apposed


def correct_word(text):
    """
    Spell-correct every word, then lemmatize it as a verb and as a noun
    Example: servic of groop => service of group
    Args:
        text (str): text
    Returns:
        (str): corrected, lemmatized words joined with single spaces; text
            without any word is returned unchanged
    """
    tokens = text.split()
    if not tokens:
        # Nothing to correct, and no need to load the spelling dictionary.
        return text

    # One SpellChecker is shared across calls, so the dictionary is not rebuilt
    # for every row of the dataset.
    spell = getattr(correct_word, "_spell", None)
    if spell is None:
        spell = correct_word._spell = SpellChecker()
    wordnet_lemmatizer = WordNetLemmatizer()

    corrected = []
    for token in tokens:
        # Get the one `most likely` answer. Fall back to the token itself when
        # correction() returns None, which would crash the lemmatizer.
        best = spell.correction(token) or token
        best = wordnet_lemmatizer.lemmatize(best, pos="v")
        best = wordnet_lemmatizer.lemmatize(best, pos="n")
        corrected.append(best)
    return " ".join(corrected)


def remove_repeated_characters(text):
    """
    Cap every run of one repeated character at two occurrences
    Example: I am verrry happpyyy today => I am verry happyy today
    Args:
        text (str): text
    Returns:
        (str): text in which no character repeats more than twice in a row
    """
    return re.sub(r'(.)\1+', r'\1\1', text)


def separate_digit_text(text):
    """
    Put a space between a number and the letters glued right after it
    (letters followed by digits are left as they are)
    Example: I will be booking tickets for 2adults => I will be booking tickets for 2 adults
    Args:
        text (str): text
    Returns:
        (str): text with digits split from the letters that follow them
    """
    digits_then_letters = r'([\d]+)([a-zA-Z]+)'
    return re.sub(digits_then_letters,
                  lambda found: found.group(1) + ' ' + found.group(2),
                  text)




def stem_text(text, stemmer='snowball'):
    """
    Tokenize the text and reduce every token to its stem
    Example: playing => play
    Args:
        text (str): text
        stemmer (str): 'snowball' selects the Snowball stemmer; any other
            value selects the Porter stemmer
    Returns:
        (str): the stemmed tokens joined with single spaces
    """
    chosen_stemmer = sb_stem if stemmer == 'snowball' else pt_stem
    stems = [chosen_stemmer.stem(token) for token in word_tokenize(text)]
    return " ".join(stems)


def remove_single_char_word(text):
    """
    Drop every whitespace-separated word that is a single character long
    Example: I am in a home for 2 years => am in home for years
    Args:
        text (str): text
    Returns:
        (str): remaining words joined with single spaces
    """
    return " ".join(filter(lambda token: len(token) > 1, text.split()))


def remove_punctuations(text):
    """
    Strip punctuation from text: each run of , + : ? ! " ( ) ' . % [ ]
    becomes one space, and hyphens are deleted outright
    Example: well-made, 50% off => wellmade  50  off
    Args:
        text (str): text
    Returns:
        (str): text without those characters (extra spaces are not collapsed)
    """
    spaced = re.sub(r'[\,+\:\?\!\"\(\)!\'\.\%\[\]]+', r' ', text)
    return spaced.replace('-', '')


def remove_extra_space(text):
    """
    Collapse every run of whitespace to one space and trim both ends
    Example: hey are   you coming. ? => hey are you coming. ?
    Args:
        text (str): text
    Returns:
        (str): text with single spaces between words
    """
    words = text.split()
    return ' '.join(words)


def replace_digits_with_char(text, replace_char='d'):
    """
    Mask every ASCII digit with `replace_char`
    Example: I will be there on 22 april. => I will be there on dd april.
    Args:
        text (str): text
        replace_char (str): replacement written in place of each digit
    Returns:
        (str): text with all digits masked
    """
    return re.sub(r'[0-9]', replace_char, text)




def remove_url(text):
    """
    Remove urls from text
    Example: link to latest cricket score. https://xyz.com/a/b => link to latest cricket score.
    Args:
        text (str): text
    Returns:
        text (str): text with removed urls, words joined with single spaces
    """
    # A URL runs from "http://", "https://" or "www." to the next whitespace,
    # so paths and query strings go with it.
    url_pattern = re.compile(r'(?:https?://|www\.)\S+', re.IGNORECASE)

    urlfree = []
    for word in text.split():
        word = url_pattern.sub('', word)
        # Bare page names ("index.html") count as links too; words emptied by
        # the substitution are dropped.
        if word and not word.lower().endswith(".html"):
            urlfree.append(word)
    return " ".join(urlfree)


def remove_alphanumerics(text):
    """
    Drop every word that contains at least one digit (plain numbers included)
    Example: hello man whatsup123 => hello man
    Args:
        text (str): text
    Returns:
        (str): digit-free words joined with single spaces
    """
    digits = set("0123456789")
    digit_free = [token for token in text.split() if digits.isdisjoint(token)]
    return " ".join(digit_free)


def remove_words_start_with(text, starts_with_char):
    """
    Remove words start with character `starts_with_char`
    Example: dhoni rocks with last ball six #dhoni #six => dhoni rocks with last ball six (start_char_with='#')
    Args:
        text (str): text
        starts_with_char (str): starting characters of word, which to be removed from text
    Returns:
        text (str): text with removed words start with given chars
            (leading/trailing whitespace stripped, inner spacing untouched)
    """
    # re.escape: the prefix is literal text, so '$', '+' or '.' must not act as regex syntax.
    # (?<!\w): only match where a word begins, never in the middle of one ("a#b").
    pattern = r'(?<!\w)' + re.escape(starts_with_char) + r'\w*'
    # One substitution over the text. Feeding each match back into re.sub would
    # also cut it out of longer words ("#six" inside "#sixty").
    return re.sub(pattern, '', text).strip()

def remove_stop_words(text, stop_words=stop_words_list):
    """
    This function removes stop words from text
    Example: I was very excited about the match => was excited match
    Params
        text (str) :text on which processing needs to done
        stop_words (list) : stop words which needs to be removed
    Returns
        text(str): text after stop words removal, words joined with single spaces
    """
    # Compare case-insensitively: expansions such as "I am" put a capital "I"
    # back into text that was already lower-cased.
    stop_words = {word.lower() for word in stop_words}
    # split() without argument also separates on tabs and newlines and never
    # yields empty tokens.
    kept_words = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept_words)

In [97]:
def transformText(text):
    """
    Run the full cleaning pipeline on one raw review
    Example: " she' I have so 20 soooooo don't i'm  can't servic going grooooooooop"
             => 'she have twenty soo do not be service go group'
    Args:
        text (str): raw text
    Returns:
        text (str): cleaned text, words separated by single spaces
    """
    # Convert text to lower
    # NOTE(review): done first so split_alphanum also splits tokens typed in
    # upper case; its patterns appear to match a-z only -- confirm for the
    # installed gensim version.
    text = text.lower()

    ## Remove urls from text -- before anything splits or strips the
    ## punctuation that makes a URL recognisable
    text = remove_url(text)

    ## Separate letters from digits ("abc123" => "abc 123"); reached through
    ## the `parsing` module imported from gensim, since `split_alphanum`
    ## itself is never imported in this notebook
    text = parsing.preprocessing.split_alphanum(text)

    ## Replace abbreviation word in text to their original form
    text = abbreviation_look_up(text)
    ##Convert apostrophes word to original form
    text = appos_look_up(text)
    ##Replace standalone integers with their textual representation
    text = replace_numbers(text)
    ## Shorten runs of 3+ identical characters to two
    text = reduce_lengthening(text)

    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]',r' ',text)

    ##removes stop words from text
    # NOTE(review): this runs before punctuation removal, so a stop word with
    # punctuation attached ("the,") is not removed -- confirm the order.
    text = remove_stop_words(text, stop_words=stop_words_list)

    ## Removed special characters from text
    text = remove_punctuations(text)

    # remove html markup
    text = re.sub("(<.*?>)","",text)

    # Correct words (spelling + lemmatization)
    text = correct_word(text)
    text = remove_repeated_characters(text)

    # stem_text, separate_digit_text and replace_digits_with_char are defined
    # in this notebook but deliberately not part of the pipeline.

    #remove_single_char_word
    text = remove_single_char_word(text)

    ##Remove words containing digits from text
    text = remove_alphanumerics(text)

    ## Strip multiple whitespaces
    text = remove_extra_space(text)

    return text

In [98]:
# Spot check: spelling correction on two misspelled words.
correct_word('servic of groop')

'service of group'

In [100]:
# Spot check: the whole cleaning pipeline on a deliberately messy sentence.
transformText(" she' I have so 20 soooooo don't i'm  can't servic going grooooooooop")

'she have twenty soo do not be service go group'

In [101]:
# Clean every review in place; transformText is handed to map() directly.
df['text'] = df['text'].map(transformText)

In [102]:
texts = df['text']
tags = df['label']
# dictionary of lists
# Named `columns` rather than `dict`, so the built-in dict type stays usable
# in every cell that runs after this one.
columns = {'text': texts, 'label': tags}

df = pd.DataFrame(columns)

# saving the dataframe
# NOTE(review): the raw file is read from 'DATA/...' but this writes to 'Data/...';
# on a case-sensitive filesystem these are two different folders -- confirm.
df.to_csv('Data/DATA_preprocessing_version2.csv')