# AIML Reference-CNN

- <a href = #link_basics>NLP Preprocessing</a>
    - <a href = #link_html>Removing HTML</a>
- <a href = #link_count>CountVectorizer and TfidfVectorizer</a>
- <a href = #link_vader>VaderSentiment</a>
- <a href = #link_count1>Sentiment Analysis</a>
- <a href = #link_mul>Multilabel Review: Amazon food review</a>
- <a href = #link_mul1>Multilabel Review: Airline review project</a>

# <a id = "link_basics"></a>NLP Preprocessing

## <a id = "link_html"></a>HTML Tag Removal

In [1]:
import re, string, unicodedata
import nltk                                   # Natural language processing tool-kit
#import contractions
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet    # Stopwords, and wordnet corpus
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import re, string, unicodedata
import pandas as pd
import nltk           
                        # Natural language processing tool-kit
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from bs4 import BeautifulSoup


def strip_html_tags(text):
    """Return the text content of ``text`` with all HTML markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` and
    only the text nodes are returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [3]:
dataset = pd.read_csv('corporate_messaging_dfe.csv')

In [4]:
dataset.head(3)

Unnamed: 0,unit_id,golden,unit_state,trusted_judgments,last_judgment_at,category,category_confidence,category_gold,id,screenname,text
0,662822308,False,finalized,3,2015-02-18T04:31:00,Information,1.0,,436528000000000000,Barclays,Barclays CEO stresses the importance of regula...
1,662822309,False,finalized,3,2015-02-18T13:55:00,Information,1.0,,386013000000000000,Barclays,Barclays announces result of Rights Issue http...
2,662822310,False,finalized,3,2015-02-18T08:43:00,Information,1.0,,379580000000000000,Barclays,Barclays publishes its prospectus for its �5.8...


In [5]:
# We only work with the text data here, so we separate the text column (plus the id columns) into a new dataframe: data
data = dataset.drop(['golden', 'unit_state', 'trusted_judgments', 'last_judgment_at', 'category', 'category_confidence', 'category_gold', 'screenname'], axis=1)

In [6]:
data.head(3)

Unnamed: 0,unit_id,id,text
0,662822308,436528000000000000,Barclays CEO stresses the importance of regula...
1,662822309,386013000000000000,Barclays announces result of Rights Issue http...
2,662822310,379580000000000000,Barclays publishes its prospectus for its �5.8...


In [7]:
# Show the first row of data in full.
pd.set_option('display.max_colwidth', None) # Display the whole text column without truncation, so the full message is visible.
data.loc[[0]]

Unnamed: 0,unit_id,id,text
0,662822308,436528000000000000,Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference http://t.co/Ge9Lp7hpyG


In [8]:
# Remove http/https links from every message using a regular expression.
# The vectorized string accessor applies the same pattern to the whole column
# in one call (no per-row Python loop) and leaves missing values untouched
# instead of raising on them.
data['text'] = data['text'].str.replace(r"http\S+", "", regex=True)
data.head()

Unnamed: 0,unit_id,id,text
0,662822308,436528000000000000,Barclays CEO stresses the importance of regulatory and cultural reform in financial services at Brussels conference
1,662822309,386013000000000000,Barclays announces result of Rights Issue
2,662822310,379580000000000000,Barclays publishes its prospectus for its �5.8bn Rights Issue:
3,662822311,367530000000000000,Barclays Group Finance Director Chris Lucas is to step down at the end of the week due to ill health
4,662822312,360385000000000000,Barclays announces that Irene McDermott Brown has been appointed as Group Human Resources Director


In [9]:
# Tokenize every message: each cell of the 'text' column becomes a list of
# word tokens. Series.apply replaces the manual iterrows()/.at loop.
data['text'] = data['text'].apply(nltk.word_tokenize)
data.head()

Unnamed: 0,unit_id,id,text
0,662822308,436528000000000000,"[Barclays, CEO, stresses, the, importance, of, regulatory, and, cultural, reform, in, financial, services, at, Brussels, conference]"
1,662822309,386013000000000000,"[Barclays, announces, result, of, Rights, Issue]"
2,662822310,379580000000000000,"[Barclays, publishes, its, prospectus, for, its, �5.8bn, Rights, Issue, :]"
3,662822311,367530000000000000,"[Barclays, Group, Finance, Director, Chris, Lucas, is, to, step, down, at, the, end, of, the, week, due, to, ill, health]"
4,662822312,360385000000000000,"[Barclays, announces, that, Irene, McDermott, Brown, has, been, appointed, as, Group, Human, Resources, Director]"


In [10]:
strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

In [11]:
text1 = """<h1>This is the title</h1>
            <b>This is bold text</b>
            <i>This is italicized Text</i>
            <img src="another html tag"/>
            <a href="Apart from the others"> This is also here!</a>
            “Love all, trust a few, do wrong to none.” 
            ― William Shakespeare, All's Well That Ends Well

            “All the world's a stage,
            And all the men and women merely players;
            They have their exits and their entrances;
            And one man in his time plays many parts,
            His acts being seven ages.” 
            ― William Shakespeare, As You Like It

            "How old are you," asked Jem, "four-and-a-half?"

            "Goin' on seven."

            "Shoot no wonder, then," said Jem, jerking his thumb at me. "Scout yonder's been readin' ever since she was born, 
            and she ain't even started to school yet. You look right puny for goin' on seven."

            "I'm little but I'm old," he said.
            - To Kill a Mockingbird

            Le dîner, Clémence, Anaïs, Raphaël, Voilà !

            something... is! not right() with.,; this :: line.
            
            &nbsp;&nbsp;
            
            11    42   1024   2048
            {{There are double curly braces.}}
            {Here are single curly braces.}
            </body>
            </html>"""

In [12]:
def denoise_text(text):
    """Remove noise from raw text; currently this only strips HTML markup.

    Any other cleaning step can be chained here as needed, e.g. code to
    remove strings inside curly braces.
    """
    return strip_html_tags(text)

In [13]:
text1 = denoise_text(text1)
print(text1)

This is the title
This is bold text
This is italicized Text

 This is also here!
            “Love all, trust a few, do wrong to none.” 
            ― William Shakespeare, All's Well That Ends Well

            “All the world's a stage,
            And all the men and women merely players;
            They have their exits and their entrances;
            And one man in his time plays many parts,
            His acts being seven ages.” 
            ― William Shakespeare, As You Like It

            "How old are you," asked Jem, "four-and-a-half?"

            "Goin' on seven."

            "Shoot no wonder, then," said Jem, jerking his thumb at me. "Scout yonder's been readin' ever since she was born, 
            and she ain't even started to school yet. You look right puny for goin' on seven."

            "I'm little but I'm old," he said.
            - To Kill a Mockingbird

            Le dîner, Clémence, Anaïs, Raphaël, Voilà !

            something... is! not right() with.,; th

In [14]:
#remove accented characters
import unicodedata
def remove_accented_chars(text):
    """Return ``text`` with accents stripped and non-ASCII characters dropped.

    The string is first decomposed (NFKD) so that an accented letter becomes
    a base letter followed by combining marks; encoding to ASCII with
    ``'ignore'`` then discards the marks and any other non-ASCII character.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [15]:
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [16]:
#tokenization
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer=ToktokTokenizer()
text= "The , and , if are stopwords, computer is not"
tokens=tokenizer.tokenize(text)
print(tokens)

['The', ',', 'and', ',', 'if', 'are', 'stopwords', ',', 'computer', 'is', 'not']


In [17]:
print('Number of words is: ', len(tokens))

Number of words is:  11


In [18]:
stopword_list = nltk.corpus.stopwords.words('english')
print(stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
stopword_list.remove('no')
stopword_list.remove('not')

In [20]:
stopword_list.append('pep')

In [21]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [22]:
text= "The , and , if are stopwords, computer is not"
tokens = tokenizer.tokenize(text)
tokens = [token.strip() for token in tokens]

In [23]:
tokens

['The',
 ',',
 'and',
 ',',
 'if',
 'are',
 'stopwords',
 ',',
 'computer',
 'is',
 'not']

In [24]:
filtered_tokens=[token for token in tokens if token not in stopword_list]
filtered_tokens

['The', ',', ',', 'stopwords', ',', 'computer', 'not']

In [25]:
#remove special characters
import re
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When True, digits are removed as well.

    Returns:
        The filtered string.
    """
    # The upper-case range must be ``A-Z``. Writing ``A-z`` also covers the
    # ASCII characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), which
    # would then be treated as letters and never removed.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

In [26]:
#stemming
def simple_stemmer(text):
    """Porter-stem every whitespace-separated word of ``text``.

    Returns the stemmed words joined back together with single spaces.
    """
    porter = nltk.porter.PorterStemmer()
    stemmed = []
    for token in text.split():
        stemmed.append(porter.stem(token))
    return ' '.join(stemmed)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

In [27]:
#lemmatization

#uncomment the below two lines to install spacy and download the language model
# !pip install spacy
#!python -m spacy download en_core_web_sm


import spacy
nlp = spacy.load('en_core_web_sm')
def lemmatize_text(text):
    """Lemmatize ``text`` with the spaCy pipeline held in ``nlp``.

    Each token is replaced by its lemma, except tokens whose lemma is the
    '-PRON-' placeholder, which keep their original text. The results are
    joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ != '-PRON-':
            pieces.append(token.lemma_)
        else:
            pieces.append(token.text)
    return ' '.join(pieces)

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'my system keep crash ! his crashed yesterday , ours crash daily'

In [28]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode with 'ignore' then drops everything that is not ASCII.
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    # Strip everything that is neither a word character nor whitespace.
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    # Tokens made up solely of punctuation collapse to '' and are dropped.
    return [token for token in stripped if token != '']

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    # Load the English stop-word list once per call and keep it in a set.
    # Calling stopwords.words('english') inside the loop would fetch the whole
    # list again for every token and then scan it linearly.
    english_stops = set(stopwords.words('english'))
    return [word for word in words if word not in english_stops]

def stem_words(words):
    """Stem words in list of tokenized words"""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    wordnet_lemmatizer = WordNetLemmatizer()
    # pos='v' makes the lemmatizer treat every token as a verb.
    return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the cleaning pipeline over a list of tokenized words.

    Steps, in order: strip non-ASCII characters, lowercase, remove
    punctuation, remove stop words. Returns the cleaned list of tokens.
    """
    for step in (remove_non_ascii, to_lowercase, remove_punctuation, remove_stopwords):
        words = step(words)
    return words

In [29]:
text1

'This is the title\nThis is bold text\nThis is italicized Text\n\n This is also here!\n            “Love all, trust a few, do wrong to none.” \n            ― William Shakespeare, All\'s Well That Ends Well\n\n            “All the world\'s a stage,\n            And all the men and women merely players;\n            They have their exits and their entrances;\n            And one man in his time plays many parts,\n            His acts being seven ages.” \n            ― William Shakespeare, As You Like It\n\n            "How old are you," asked Jem, "four-and-a-half?"\n\n            "Goin\' on seven."\n\n            "Shoot no wonder, then," said Jem, jerking his thumb at me. "Scout yonder\'s been readin\' ever since she was born, \n            and she ain\'t even started to school yet. You look right puny for goin\' on seven."\n\n            "I\'m little but I\'m old," he said.\n            - To Kill a Mockingbird\n\n            Le dîner, Clémence, Anaïs, Raphaël, Voilà !\n\n            so

In [30]:
words = nltk.word_tokenize(text1)     # list of words.
print(words)
print('Number of words is: ', len(words))

['This', 'is', 'the', 'title', 'This', 'is', 'bold', 'text', 'This', 'is', 'italicized', 'Text', 'This', 'is', 'also', 'here', '!', '“', 'Love', 'all', ',', 'trust', 'a', 'few', ',', 'do', 'wrong', 'to', 'none.', '”', '―', 'William', 'Shakespeare', ',', 'All', "'s", 'Well', 'That', 'Ends', 'Well', '“', 'All', 'the', 'world', "'s", 'a', 'stage', ',', 'And', 'all', 'the', 'men', 'and', 'women', 'merely', 'players', ';', 'They', 'have', 'their', 'exits', 'and', 'their', 'entrances', ';', 'And', 'one', 'man', 'in', 'his', 'time', 'plays', 'many', 'parts', ',', 'His', 'acts', 'being', 'seven', 'ages.', '”', '―', 'William', 'Shakespeare', ',', 'As', 'You', 'Like', 'It', '``', 'How', 'old', 'are', 'you', ',', "''", 'asked', 'Jem', ',', '``', 'four-and-a-half', '?', "''", '``', 'Goin', "'", 'on', 'seven', '.', "''", '``', 'Shoot', 'no', 'wonder', ',', 'then', ',', "''", 'said', 'Jem', ',', 'jerking', 'his', 'thumb', 'at', 'me', '.', '``', 'Scout', 'yonder', "'s", 'been', 'readin', "'", 'ever',

In [31]:
words = normalize(words)
print(words)
print('Number of words is: ', len(words))

['title', 'bold', 'text', 'italicized', 'text', 'also', 'love', 'trust', 'wrong', 'none', 'william', 'shakespeare', 'well', 'ends', 'well', 'world', 'stage', 'men', 'women', 'merely', 'players', 'exits', 'entrances', 'one', 'man', 'time', 'plays', 'many', 'parts', 'acts', 'seven', 'ages', 'william', 'shakespeare', 'like', 'old', 'asked', 'jem', 'fourandahalf', 'goin', 'seven', 'shoot', 'wonder', 'said', 'jem', 'jerking', 'thumb', 'scout', 'yonder', 'readin', 'ever', 'since', 'born', 'ai', 'nt', 'even', 'started', 'school', 'yet', 'look', 'right', 'puny', 'goin', 'seven', 'little', 'old', 'said', 'kill', 'mockingbird', 'le', 'diner', 'clemence', 'anais', 'raphael', 'voila', 'something', 'right', 'line', '11', '42', '1024', '2048', 'double', 'curly', 'braces', 'single', 'curly', 'braces']
Number of words is:  88


In [32]:
# Apply the normalize function to the token list of every row.
# `words` stays bound to the last row's normalized tokens after the loop.
for i in data.index:
    words = normalize(data.at[i, 'text'])
    data.at[i, 'text'] = words

In [33]:
data.head()

Unnamed: 0,unit_id,id,text
0,662822308,436528000000000000,"[barclays, ceo, stresses, importance, regulatory, cultural, reform, financial, services, brussels, conference]"
1,662822309,386013000000000000,"[barclays, announces, result, rights, issue]"
2,662822310,379580000000000000,"[barclays, publishes, prospectus, 58bn, rights, issue]"
3,662822311,367530000000000000,"[barclays, group, finance, director, chris, lucas, step, end, week, due, ill, health]"
4,662822312,360385000000000000,"[barclays, announces, irene, mcdermott, brown, appointed, group, human, resources, director]"


In [34]:
import nltk
nltk.download('wordnet')
def stem_and_lemmatize(words):
    """Return ``(stems, lemmas)`` for a list of tokens.

    Both results are computed independently from the same input: the stems
    via ``stem_words`` and the lemmas via ``lemmatize_verbs``.
    """
    return stem_words(words), lemmatize_verbs(words)

# Demonstrate on the last normalized row of `data`. Selecting the row
# explicitly makes the input visible here instead of depending on whatever
# `words` was left holding by an earlier cell.
words = data['text'].iloc[-1]
stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jimmy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Stemmed:
 ['z', 'bhutt', 'problem', 'food', 'amp', 'land', 'system', 'includ', 'land', 'acqu', 'commod', 'spec', 'affect', 'food', 'pric', 'amp', 'lack', 'discuss', 'nins2013']

Lemmatized:
 ['z', 'bhutta', 'problems', 'food', 'amp', 'land', 'systems', 'include', 'land', 'acquistion', 'commodity', 'speculation', 'affect', 'food', 'price', 'amp', 'lack', 'discussion', 'nins2013']


# <a id = "link_count"></a>CountVectorizer and TfidfVectorizer

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.dog"]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 2 1 1 1 1 1 2]]


In [36]:
# encode another document
text2 = ["the puppy dog"]
vector = vectorizer.transform(text2)
print(vector.toarray())

[[0 1 0 0 0 0 0 1]]


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_)
# encode document
vector = vectorizer.transform([text[0]])
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]
(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


# <a id = "link_vader"></a>VaderSentiment

In [38]:
from textblob import TextBlob
text=TextBlob("I am happy")
print(text.sentiment)

Sentiment(polarity=0.8, subjectivity=1.0)


In [39]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
text=SentimentIntensityAnalyzer()
score=text.polarity_scores("I am happy")
print(score)


{'neg': 0.0, 'neu': 0.351, 'pos': 0.649, 'compound': 0.5719}


In [40]:
text1=TextBlob("I am sad")

In [41]:
text1=SentimentIntensityAnalyzer()
score=text1.polarity_scores("I am sad")
print(score)


{'neg': 0.608, 'neu': 0.392, 'pos': 0.0, 'compound': -0.4767}


# <a id = "link_count1"></a>Sentiment Analysis

In [42]:
import pandas as pd       
train = pd.read_csv("labeledTrainData.tsv", header=0,delimiter="\t", quoting=3)

FileNotFoundError: [Errno 2] File labeledTrainData.tsv does not exist: 'labeledTrainData.tsv'

In [None]:
train.shape

In [None]:
train.columns.values

In [None]:
print (train["review"][0])

In [None]:
# Parse the first review. Naming the parser keeps the result independent of
# which optional parsers happen to be installed and matches the other
# BeautifulSoup calls in this notebook.
example1 = BeautifulSoup(train["review"][0], "html.parser")

In [None]:
print (train["review"][0])
print (example1.get_text())

In [None]:
import re
letters_only = re.sub("[^a-zA-Z]"," ",example1.get_text() )
print (letters_only)

In [None]:
lower_case = letters_only.lower() 

In [None]:
words = lower_case.split()

In [None]:
from nltk.corpus import stopwords
print (stopwords.words("english")) 

In [None]:
# Build the stop-word set once; calling stopwords.words("english") inside the
# comprehension would fetch the whole list again for every single word.
english_stops = set(stopwords.words("english"))
words = [w for w in words if w not in english_stops]
print (words)

In [None]:
def review_to_words(raw_review):
    """Convert a raw movie review into a cleaned, space-separated string.

    Args:
        raw_review: A single review as a string; may contain HTML markup.

    Returns:
        The review in lower case with HTML markup, non-letter characters
        and English stop words removed, joined by single spaces.
    """
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers are installed.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # 2. Replace every non-letter character with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()
    # 4. Searching a set is much faster than searching a list, so convert
    #    the stop words to a set.
    stops = set(stopwords.words("english"))
    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]
    # 6. Join the words back into one string separated by spaces.
    return " ".join(meaningful_words)

In [None]:
clean_review = review_to_words( train["review"][0] )
print (clean_review)

In [None]:
# Get the number of reviews based on the dataframe column size
num_reviews = train["review"].size

# Clean every review, in order. Iterating over the column itself avoids the
# label-based lookup train["review"][i], which only works when the index is
# exactly 0..n-1.
clean_train_reviews = [review_to_words(review) for review in train["review"]]

In [None]:
clean_train_reviews[0]

In [None]:
clean_train_reviews[24000]

In [None]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

In [None]:
print (train_data_features.shape)

In [None]:
# Take a look at the words in the vocabulary.
# get_feature_names() was removed from scikit-learn in version 1.2 in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print (vocab)

In [None]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count, tag)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
# Initialize a Random Forest classifier with 10 trees (n_estimators = 10)
forest = RandomForestClassifier(verbose=2,n_jobs=-1,n_estimators = 10) 
# Fit the forest to the training set, using the bag of words as
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
print ("Training the random forest...")
forest = forest.fit( train_data_features, train["sentiment"] )
# Print the fitted estimator, then the mean score over 10-fold cross validation
print (forest)
print (np.mean(cross_val_score(forest,train_data_features,train["sentiment"],cv=10)))

In [None]:
# Read the test data
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", \
                   quoting=3 )

# Verify that there are 25,000 rows and 2 columns
print (test.shape)

# Create an empty list and append the clean reviews one by one
num_reviews = len(test["review"])
clean_test_reviews = [] 

print ("Cleaning and parsing the test set movie reviews...\n")
for i in range(0,num_reviews):
    if( (i+1) % 1000 == 0 ):
        print ("Review %d of %d\n" % (i+1, num_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )

# Get a bag of words for the test set, and convert to a numpy array
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Use the random forest to make sentiment label predictions
result = forest.predict(test_data_features)
print (result)

# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )

# Use pandas to write the comma-separated output file
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3 )

# <a id = "link_mul"></a>Multilabel Review: Amazon food review

In [None]:
import re, string, unicodedata                          # Import Regex, string and unicodedata.
from bs4 import BeautifulSoup                           # Import BeautifulSoup.

import numpy as np                                      # Import numpy.
import pandas as pd                                     # Import pandas.
import nltk                                             # Import Natural Language Tool-Kit.

nltk.download('stopwords')                              # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords                       # Import stopwords.
from nltk.tokenize import word_tokenize, sent_tokenize  # Import Tokenizer.
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.

In [None]:
data = pd.read_csv("Reviews.csv")

In [None]:
data.shape

In [None]:
data.head()

In [None]:
# Taking only 50000 entries for demonstration purpose. As full data will take more time to process.
# Only keeping score and Text columns from the data, as these are useful for our analysis.

data = data.loc[:49999, ['Score', 'Text']]

In [None]:
data.isnull().sum(axis=0)                                # Check for NULL values.

In [None]:
pd.set_option('display.max_colwidth', None) # Display full dataframe information (non-truncated Text column).

data.head()                                 # Check first 5 rows of data

In [None]:
data.shape                                # Shape of data

### Data Pre-processing:

- Remove html tags.
- Replace contractions in string (e.g. replace I'm --> I am, and so on).
- Remove numbers.
- Tokenization
- To remove Stopwords.
- Lemmatized data

We have used NLTK library to tokenize words , remove stopwords and lemmatize the remaining words.

In [None]:
def strip_html(text):
    """Return ``text`` with HTML markup removed, parsed with ``html.parser``."""
    return BeautifulSoup(text, "html.parser").get_text()

data['Text'] = data['Text'].apply(lambda x: strip_html(x))
data.head()

In [None]:
def remove_numbers(text):
    """Return ``text`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

data['Text'] = data['Text'].apply(lambda x: remove_numbers(x))
data.head()

In [None]:
# Tokenization of data. Applying the tokenizer to the 'Text' column directly
# avoids building a row object for every record, as DataFrame.apply(axis=1) does.
data['Text'] = data['Text'].apply(nltk.word_tokenize)

In [None]:
data.head() 

In [None]:
# Import the corpus reader under an alias: the name `stopwords` is assigned
# the final stop-word list below, and reading from the alias keeps this cell
# safe to re-run after that assignment.
from nltk.corpus import stopwords as nltk_stopwords

# Negations and their contracted forms matter for sentiment, so they are kept
# in the text, i.e. they are taken out of the stop-word list.
customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# `stopwords` is the custom stop-word list: English stop words minus customlist.
stopwords = list(set(nltk_stopwords.words('english')) - set(customlist))

In [None]:
lemmatizer = WordNetLemmatizer()

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    def _to_ascii(token):
        # NFKD separates accents from their base letters; the ASCII encode
        # with 'ignore' then discards every non-ASCII character.
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    lowered = [token.lower() for token in words]
    return lowered

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    # Delete every character that is neither a word character nor whitespace.
    cleaned = [re.sub(r'[^\w\s]', '', token) for token in words]
    # Drop tokens that became empty, i.e. consisted only of punctuation.
    return [token for token in cleaned if token != '']

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    # `stopwords` is the module-level custom stop-word list.
    return [token for token in words if token not in stopwords]

def lemmatize_list(words):
    """Lemmatize each token as a verb using the module-level ``lemmatizer``."""
    return [lemmatizer.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Clean a list of tokens and join the result into a single string.

    Steps, in order: strip non-ASCII characters, lowercase, remove
    punctuation, remove stop words, lemmatize. The remaining tokens are
    joined with single spaces.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        remove_stopwords,
        lemmatize_list,
    )
    for step in pipeline:
        words = step(words)
    return ' '.join(words)

# Normalize every tokenized review. Applying normalize to the 'Text' column
# directly avoids building a row object for every record.
data['Text'] = data['Text'].apply(normalize)
data.head()

In [None]:
# Bag-of-words vectorization: turn each cleaned review into a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1000 features, since more features increase the processing time.
vectorizer = CountVectorizer(max_features=1000)

# fit_transform returns a sparse matrix; convert it to a dense array in the same step.
data_features = vectorizer.fit_transform(data['Text']).toarray()

In [None]:
data_features.shape

In [None]:
# Target variable: the review score, cast to integers.
labels = data['Score'].astype('int')

In [None]:
# Split data into training and testing set.
# 30% of the rows are held out for testing; random_state is fixed so the split is repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=7)

In [None]:
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.

import numpy as np  # imported here so the cell does not depend on a later cell for `np`
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random_state seeds the forest so the fit and the scores below are repeatable across
# runs (the train/test split is seeded the same way).
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=7)

forest = forest.fit(X_train, y_train)

print(forest)

# cross_val_score refits a clone of the estimator on each of the 10 folds of the
# full feature matrix, so this number is independent of the fit on X_train above.
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

In [None]:
# Predict the result for test data using the model built above.
# `result` is consumed by the confusion-matrix cell that follows.

result = forest.predict(X_test)

In [None]:
# Print and plot the confusion matrix to see how the predictions are distributed among all the classes.

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, result)
print(conf_mat)

# Rows are the true scores, columns the predicted scores; both axes are labelled "1".."5".
score_names = list("12345")
df_cm = pd.DataFrame(conf_mat, index=score_names, columns=score_names)

plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt='g')

In [None]:
# Using TfidfVectorizer to convert text data to numbers.

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)

# Fit the TF-IDF model and convert the sparse result to a dense array in one step.
# NOTE: X_train / X_test still refer to the CountVectorizer split until
# train_test_split is run again on these features.
data_features = vectorizer.fit_transform(data['Text']).toarray()

data_features.shape

In [None]:
# Using Random Forest to build model for the classification of reviews.
# Also calculating the cross validation score.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# Re-split on the TF-IDF features. Without this, X_train / X_test are still the
# CountVectorizer arrays from the earlier split, so the "TF-IDF" forest would be
# trained and evaluated on count features. The same test_size and random_state
# are used as in the first split.
X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=7)

# random_state seeds the forest so the fit and the scores below are repeatable.
forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=7)

forest = forest.fit(X_train, y_train)

print(forest)

# cross_val_score refits a clone of the estimator on each of the 10 folds of the
# full TF-IDF matrix; it does not reuse the fit above.
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

In [None]:
# Predict on the held-out rows. X_test must come from a split of the same feature
# matrix (TF-IDF here) that the forest above was trained on.
result = forest.predict(X_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the current predictions: true scores on rows, predicted on columns.
conf_mat = confusion_matrix(y_test, result)

axis_names = list("12345")
df_cm = pd.DataFrame(conf_mat, index=axis_names, columns=axis_names)

plt.figure(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt='g')

# <a id = "link_mul1"></a>Multilabel Review: Airline review project

### Problem Description
A sentiment analysis job about the problems of each major U.S. airline. Twitter data was scraped from February of 2015 and contributors were asked to first classify positive, negative, and neutral tweets, followed by categorizing negative reasons (such as "late flight" or "rude service").

In [None]:
import re, string, unicodedata                        
from bs4 import BeautifulSoup                         

import numpy as np                                    
import pandas as pd                                   
import nltk                                           

nltk.download('stopwords')                            
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords                     
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer   

In [None]:
# Load the airline tweets; Tweets.csv is read relative to the current working directory.
data = pd.read_csv("Tweets.csv")

In [None]:
data.head()

In [None]:
# Keep only the sentiment label and the tweet text; no other column is used below.
data = data.loc[:, ['airline_sentiment', 'text']]

In [None]:
# Show the full tweet text in DataFrame output instead of truncating long strings.
pd.set_option('display.max_colwidth', None)
data.head()

In [None]:
# Count the tweets in each sentiment class.
data.groupby("airline_sentiment").agg({'airline_sentiment': 'count'})

### Remove HTML

In [None]:
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its plain text."""
    return BeautifulSoup(text, "html.parser").get_text()

# Run every tweet through strip_html and overwrite the column with the plain text.
data['text'] = data['text'].apply(strip_html)
data.head()

### Remove Numbers and Special Characters

In [None]:
import re

# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols, whitespace) with a single space.
non_letter = re.compile("[^a-zA-Z]")
data['text'] = data['text'].apply(lambda tweet: non_letter.sub(" ", tweet))

### Convert to lower case

In [None]:
# Lower-case every tweet.
data['text'] = data['text'].apply(str.lower)

In [None]:
# Tokenize each cleaned tweet into a list of words. Series.apply is used rather
# than a row-wise DataFrame.apply(axis=1) because only the 'text' column is needed.
data['text'] = data['text'].apply(nltk.word_tokenize)

In [None]:
# One WordNetLemmatizer instance, created once and reused by lemmatize_list().
lemmatizer = WordNetLemmatizer()
def lemmatize_list(words):
    """Return the tokens of ``words`` lemmatized as verbs (``pos='v'``)."""
    lemmas = [lemmatizer.lemmatize(token, pos='v') for token in words]
    return lemmas

In [None]:
def lemmatize(words):
    """Lemmatize a list of tokens and return them joined into one space-separated string.

    Stop words are not removed here.
    """
    return ' '.join(lemmatize_list(words))

In [None]:
# Lemmatize each tweet's tokens; lemmatize() returns them re-joined as one string.
# Series.apply is used rather than a row-wise DataFrame.apply(axis=1) because only
# the 'text' column is needed.
data['text'] = data['text'].apply(lemmatize)
data.head()

In [None]:
# Tokenize the lemmatized strings again so that the stop-word filter below receives
# a list of tokens. Series.apply replaces the row-wise DataFrame.apply(axis=1).
data['text'] = data['text'].apply(nltk.word_tokenize)
data.head()

In [None]:
def remove_stopwords(words, stop_words=None):
    """Remove stop words from a list of tokens and return the rest as one string.

    Args:
        words: iterable of tokens.
        stop_words: optional collection of words to drop. When omitted, NLTK's
            English stop-word list (``stopwords.words('english')``) is used.

    Returns:
        The remaining tokens joined by single spaces (a ``str``, not a list).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call instead of once per token, and use a set so
    # each membership test is a hash lookup rather than a list scan.
    stop_set = set(stop_words)
    return ' '.join(word for word in words if word not in stop_set)

In [None]:
# Drop stop words from each tweet's tokens; remove_stopwords() returns the remaining
# tokens joined as one string, which is the form the vectorizers below expect.
# Series.apply replaces the row-wise DataFrame.apply(axis=1).
data['text'] = data['text'].apply(remove_stopwords)
data.head()

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag of words capped at 5000 features. analyzer="word" and
# tokenizer / preprocessor / stop_words = None are CountVectorizer's defaults,
# so only max_features needs to be passed.
vectorizer = CountVectorizer(max_features=5000)
data_features = vectorizer.fit_transform(data['text'])

In [None]:
# Target variable: the sentiment label of each tweet.
labels = data['airline_sentiment']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=7)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# random_state seeds the forest so the fit and the scores below are repeatable across
# runs (the train/test split is seeded the same way).
forest = RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=7)

forest = forest.fit(X_train, y_train)

print(forest)

# cross_val_score refits a clone of the estimator on each of the 10 folds of the
# full feature matrix, so this number is independent of the fit on X_train above.
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

In [None]:
# Predict the result for test data using the model built above.
# `result` is later passed to draw_cm() together with y_test.

result = forest.predict(X_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
def draw_cm( actual, predicted ):
    """Plot a labelled confusion-matrix heatmap for the three sentiment classes.

    Args:
        actual: true sentiment labels.
        predicted: predicted sentiment labels, aligned with ``actual``.
    """
    class_names = ["negative", "neutral", "positive"]
    # Use the arguments rather than the notebook globals y_test / result, so the
    # plot always reflects what the caller passed in. `labels` is passed by
    # keyword because it is keyword-only in current scikit-learn.
    cm = confusion_matrix(actual, predicted, labels=class_names)
    # Cells hold integer counts, so use the general format (as the other heatmaps
    # in this notebook do) instead of two decimal places.
    sns.heatmap(cm, annot=True, fmt='g', xticklabels=class_names, yticklabels=class_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
draw_cm( y_test, result )

## TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)

# Fit the TF-IDF model and convert the sparse result to a dense array in one step.
# NOTE: X_train / X_test still refer to the CountVectorizer split until
# train_test_split is run again on these features.
data_features = vectorizer.fit_transform(data['text']).toarray()

data_features.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

import numpy as np

# Re-split on the TF-IDF features. Without this, X_train / X_test are still the
# CountVectorizer matrices from the earlier split, so the "TF-IDF" forest would be
# trained and evaluated on count features. The same test_size and random_state
# are used as in the first split.
X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=7)

# random_state seeds the forest so the fit and the scores below are repeatable.
forest = RandomForestClassifier(n_estimators=100, n_jobs=8, random_state=7)

forest = forest.fit(X_train, y_train)

print(forest)

# cross_val_score refits a clone of the estimator on each of the 10 folds of the
# full TF-IDF matrix; it does not reuse the fit above.
print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

In [None]:
# Predict on the held-out rows. X_test must come from a split of the same feature
# matrix (TF-IDF here) that the forest above was trained on.
result = forest.predict(X_test)

In [None]:
draw_cm( y_test, result )