In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
#
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)
print("import done")

Using TensorFlow backend.


import done


In [52]:
import os
DATA_FILE = os.path.abspath('DATA/DATA_not_ameliorate.csv')
df = pd.read_csv(DATA_FILE,delimiter=';',encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                             I couldn't be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [26]:
df['label'].value_counts() #balanced Dataset


NOTISSUE    2028
ISSUE       2027
Name: label, dtype: int64

In [27]:
df.shape

(4055, 2)

### Preprocessing the Data

In [28]:
# List stop words 
stop_words_list={
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 #'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 #'became',
 'because',
 #'become',
 #'becomes',
 #'becoming',
 #'been',
 'before',
 'beforehand',
 #'behind',
 #'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 #'call',
 'can',
 'cannot',
 'could',
 'did',
 #'do',
 #'does',
 #'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 #'everything',
 'everywhere',
 'except',
 'few',
 'first',
 'for',
 'former',
 'formerly',
 'from',
 'front',
 'full',
 'further',
 #'had',
 #'has',
 #'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 #'is',
 'it',
 'its',
 'itself',
 'just',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 #'made',
 #'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 #'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'nevertheless',
 'next',
 'noone',
 'nor',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 #'say',
 #'see',
 #'seem',
 #'seemed',
 #'seeming',
 #'seems',
 'serious',
 'several',
 'she',
 'should',
 #'show',
 'side',
 'since',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 #'take',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 #'toward',
 #'towards',
 'twelve',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 #'used',
 'using',
 'various',
 'very',
 'via',
 #'was',
 'we',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [29]:
appos_dict = {
    "can't": "cannot",
    "cant": "cannot",
    "aren't": "are not",
    "arent": "are not",
    "couldn't": "could not",
    "couldnt": "could not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "im": "I am",
    "isn't": "is not",
    "isnt": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mightnt": "might not",
    "mustn't": "must not",
    "mustnt": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "that's": "that is",
    "thats": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wont": "will not",
    "wouldn't": "would not",
    "wouldnt": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we'll": "will",
    "didn't": "did not",
    "didnt": "did not"
}

In [30]:
abbreviation_dict= {
    "can't": "cannot",
    "cant": "cannot",
    "aren't": "are not",
    "arent": "are not",
    "couldn't": "could not",
    "couldnt": "could not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "hadn't": "had not",
    "hadnt": "had not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "im": "I am",
    "isn't": "is not",
    "isnt": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mightnt": "might not",
    "mustn't": "must not",
    "mustnt": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "that's": "that is",
    "thats": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "whats": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wont": "will not",
    "wouldn't": "would not",
    "wouldnt": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    "wasn't": "was not",
    "wasnt": "was not",
    "we'll": "will",
    "didn't": "did not",
    "didnt": "did not"
}

In [31]:
import nltk
import inflect
from nltk.stem import WordNetLemmatizer
import gensim
from gensim import parsing
from gensim.parsing.preprocessing import split_alphanum
from spellchecker import SpellChecker
import re

In [11]:
##Convert apostrophes word to original form
def replace_word(word):
    word = word.lower()
    word = word.split()
    for i in range(len(word)):
        word[i] = abbreviation_dict.get(word[i], word[i])
    word = " ".join(word)
    return word

##Fixing Word Lengthening
##https://rustyonrampa"ge.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def replace_numbers(word):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    word = word.split()
    for i in range(len(word)):
        if word[i].isdigit():
            word[i] = p.number_to_words(word[i])
    word = " ".join(word)
    return word
def transformText(text):
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    text = replace_word(text)
    text = replace_numbers(text)
    text = reduce_lengthening(text)
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Removing all the stopwords
    filtered_words = [word for word in text.split() if word not in STOP_WORDS]
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # remove html markup
    text = re.sub("(<.*?>)","",text)
    # Correct words
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
    text = " ".join(misspelled)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    # Strip all the numerics
    #text = gensim.parsing.preprocessing.strip_numeric(text)
    return text

In [12]:
transformText(" she'd've I have so 20 soooooo don't i'm  can't servic going grooooooooop")

'have have twenty soo do not be service go group'

In [13]:
df['text'] = df['text'].map(lambda x: transformText(x))

In [14]:
texts= df['text']
tags= df['label']
# dictionary of lists  
dict = {'text': texts , 'label': tags } 
     
df = pd.DataFrame(dict) 
  
# saving the dataframe 
df.to_csv('Data/DATA_preprocessing_brute.csv')

In [15]:
df.head()

Unnamed: 0,text,label
0,have order data cable get well finish work pro...,NOTISSUE
1,love phone,NOTISSUE
2,get well finish product,NOTISSUE
3,not be happier,NOTISSUE
4,be look headset long time have get,NOTISSUE


In [None]:

"""def emoticons_look_up(text):
    """
    """Remove emoticons from text and returns list of emotions present in text
    #Example: Sure, you are welcome :) => Sure, you are welcome.
    Args:
        text (str): text
    Returns:
        text (str): text with removed emoticons sign
        emolist (list) : list of emotions from text
    """

    """words = text.split()
    emolist = []
    for word in words:
        if word in emo:
            emolist.append(str(emo[word]))
            text = text.replace(word," ")
    return text, emolist"""

In [106]:
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk import word_tokenize
sb_stem = SnowballStemmer("english", ignore_stopwords=True)
pt_stem = PorterStemmer()
lmtzr = WordNetLemmatizer()

##Convert apostrophes word to original form
def replace_numbers(word):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    word = word.split()
    for i in range(len(word)):
        if word[i].isdigit():
            word[i] = p.number_to_words(word[i])
    word = " ".join(word)
    return word



"""  Fixing Word Lengthening
##https://rustyonrampa"ge.github.io/text-mining/2017/11/28/spelling-correction-with-python-and-nltk.html"""
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

def abbreviation_look_up(text):
    """
    Replace abbreviation word in text to their original form
    Example: hi, thanq so mch => hi, thank you so much
    Args:
        text (str): text
    Returns:
        slanged (str): cleaned text with replaced slang
    """
    words = text.split()
    new_text = []

    for word in words:
        word_s = word.lower()
        if word_s in abbreviation_dict:
            new_text.append(abbreviation_dict[word_s])
        else:
            new_text.append(word)
    slanged = " ".join(new_text)
    return slanged

def appos_look_up(text):
    """
    Convert apostrophes word to original form
    Example: I don't know what is going on?  => I do not know what is going on? 
    Args:
        text (str): text 
    Returns:
        apposed (str) : text with converted apostrophes
    """
    words = text.split()
    new_text = []
    for word in words:
        word_s = word.lower()
        if word_s in appos_dict:
            new_text.append(appos_dict[word_s])
        else:
            new_text.append(word)
    apposed = " ".join(new_text)
    return apposed


def correct_word(text):
    # Correct words
    text="servic groop"
    spell = SpellChecker()
    misspelled = text.split()
    wordnet_lemmatizer = WordNetLemmatizer()
    for i in range(len(misspelled)):
        # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="v")
        misspelled[i] = wordnet_lemmatizer.lemmatize(misspelled[i], pos="n")
        text = " ".join(misspelled)
    return text


def remove_repeated_characters(text):
    """
    Remove repeated characters (>2) in words to max limit of 2
    Example: I am verrry happpyyy today => I am verry happyy today
    Args:
        text (str): text
    Returns:
        clean_text (str): cleaned text with removed repeated chars
    """
    regex_pattern = re.compile(r'(.)\1+')
    clean_text = regex_pattern.sub(r'\1\1', text)
    return clean_text


def separate_digit_text(text):
    """
    Separate digit and words with space in text
    Example: I will be booking tickets for 2adults => I will be booking tickets for 2 adults   
    Args:
        text (str): text
    Returns:
        clean_text (str): cleaned text with separated digits and words
    """
    regex_patter = re.compile(r'([\d]+)([a-zA-Z]+)')
    clean_text = regex_patter.sub(r'\1 \2', text)
    return clean_text




def stem_text(text, stemmer='snowball'):
    """
    Convert words in text into their root form
    Example: I am playing in ground => I am play in ground 
    Args:
        text (str): text
        
    Returns:
        text_stem (str): cleaned text with replaced stem words
    """
    #text = remove_inside_braces(text)
    tokens = word_tokenize(text)
    if stemmer == 'snowball':
        text_stem = " ".join([sb_stem.stem(w) for w in tokens])
    else:
        text_stem = " ".join([pt_stem.stem(w) for w in tokens])
    
    return text_stem


def remove_single_char_word(text):
    """
    Remove single character word from text
    Example: I am in a home for 2 years => am in home for years 
    Args:
        text (str): text
         
    Returns:
        (str): text with single char removed
    """
    words = text.split()
    filter_words = [word for word in words if len(word) > 1]
    return " ".join(filter_words)


def remove_punctuations(text):
    """
    Removed special characters from text
    Example: he: I am going. are you coming? => he I am going. are you coming
   
    Args:
        text (str): text
   
    Returns:
        clean_text (str): cleaned text with removed special characters
    """
    regex_pattern = re.compile(r'[\,+\:\?\!\"\(\)!\'\.\%\[\]]+')
    clean_text = regex_pattern.sub(r' ', text)
    clean_text = clean_text.replace('-', '')
    return clean_text


def remove_extra_space(text):
    """
    Remove extra white spaces space from text
    Example: hey are   you coming. ? => he are you coming. ?
    Args:
        text (str): text
    Returns:
        clean_text (str): clean text with removed extra white spaces
    """
    #text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    clean_text = ' '.join(text.strip().split())
    return clean_text


def replace_digits_with_char(text, replace_char='d'):
    """
    Replace digits to `replace_char`
    Example: I will be there on 22 april. => I will be there on dd april.
    Args:
        text (str): text
        replace_char (str): character with which digit has to be replaced
    Returns:
        clean_text (str): clean text with replaced char for digits
    """
    regex_pattern = re.compile(r'[0-9]')
    clean_text = regex_pattern.sub(replace_char, text)
    return clean_text




def remove_url(text):
    """
    Remove urls from text
    Example: link to latest cricket score. https://xyz.com/a/b => link to latest cricket score.
    Args:
        text (str): text
    Returns:
        text (str): text with removed urls
    """

    urlfree = []
    for word in text.split():
        if not word.startswith("www"):
            urlfree.append(word)
        elif not word.startswith("http"):
            urlfree.append(word)
        elif not word.endswith(".html"):
            urlfree.append(word)
    urlfree = " ".join(urlfree)

    urls = re.finditer(r'http[\w]*:\/\/[\w]*\.?[\w-]+\.+[\w]+[\/\w]+', urlfree)
    for i in urls:
        urlfree = re.sub(i.group().strip(), '', urlfree)
    return urlfree


def remove_alphanumerics(text):
    """
    Remove alphanumeric words from text
    Example: hello man whatsup123 => hello man
    Args:
        text (str): text
    Returns:
        text (str): text with removed alphanumeric words
    """
    txt = []
    for each in text.split():
        if not any(x in each.lower() for x in "0123456789"):
            txt.append(each)
    txtsent = " ".join(txt)
    return txtsent 


def remove_words_start_with(text, starts_with_char):
    """
    Remove words start with character `starts_with_char`
    Example: dhoni rocks with last ball six #dhoni #six => dhoni rocks with last ball six (start_char_with='#')
    Args:
        text (str): text
        starts_with_char (str): starting characters of word, which to be removed from text
    Returns:
        text (str): text with removed words start with given chars
    """
    urls = re.finditer(starts_with_char + r'[A-Za-z0-9\w]*', text)
    for i in urls:
        text = re.sub(i.group().strip(), '', text)
    return text.strip()

def remove_stop_words(text, stop_words=stop_words_list):
    """
    This function removes stop words from text
    Example: I am very excited for today's football match => very excited today's football match
    Params
        text (str) :text on which processing needs to done
        stop_words (list) : stop words which needs to be removed
    Returns
        text(str): text after stop words removal
    """
    stop_words = set(stop_words)
    split_list = text.split(" ")
    split_list = [word for word in split_list if word not in stop_words]
    return " ".join(split_list)

In [107]:
def transformtext(text):
    text = split_alphanum(text)
    # Convert text to lower
    text = text.lower()
    
    ##Replace all interger occurrences in list of tokenized words with textual representation
    text = replace_numbers(text)
    
    #text = reduce_lengthening(text)

    text = remove_repeated_characters(text)
    
    ## Replace digits to `replace_char for date :
    #text = separate_digit_text(text)

    ## Replace slang word in text to their original form
    text = abbreviation_look_up(text)
    
    ##Convert apostrophes word to original form 
    text = appos_look_up(text)
   
    ## # Correct words
    text = correct_word(text)
   
    ## Convert words in text into their root form
    text = stem_text(text, stemmer='snowball')
    
    #remove_single_char_word
    text = remove_single_char_word(text)
    
    ## Removed special characters from text
    text = remove_punctuations(text)
    
    ## Strip multiple whitespaces
    text = remove_extra_space(text)
    
    text = replace_digits_with_char(text, replace_char='d')
    
    ## Remove urls from text
    text = remove_url(text)
    
    ##Remove alphanumeric words from text
    text = remove_alphanumerics(text)

    #text = remove_words_start_with(text, starts_with_char)
    ##removes stop words from text
    text = remove_stop_words(text, stop_words=stop_words_list)
    
    ##Separate digit and words with space in text
    text = separate_digit_text(text)
    
    # remove html markup
    text = re.sub("(<.*?>)","",text)
    
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    return text


In [109]:
transformtext("goining")

'servic group'

In [81]:
df['text'] = df['text'].map(lambda x: transformText(x))

KeyboardInterrupt: 