# Analyzing Text Data

## by Dr. Jatin Kumar Verma

## Preprocessing data using tokenization

In [17]:
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

!pip install -q nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
from nltk.tokenize import sent_tokenize

text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

sent_tokenize_list = sent_tokenize(text)
print(sent_tokenize_list)

from nltk.tokenize import word_tokenize
print (word_tokenize(text))



['Are you curious about tokenization?', "Let's see how it works!", 'We need to analyze a couple of sentences with punctuations to see it in action.']
['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'s", 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


## Stemming text data

In [18]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [19]:
words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']




In [21]:
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']

stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

formatted_row = '{:>16}' * (len(stemmers) + 1)
print ('\n', formatted_row.format('WORD', *stemmers), '\n')

for word in words:
   stemmed_words = [stemmer_porter.stem(word),
    stemmer_lancaster.stem(word),
    stemmer_snowball.stem(word)]
   print (formatted_row.format(word, *stemmed_words))


             WORD          PORTER       LANCASTER        SNOWBALL 

           table            tabl            tabl            tabl
        probably         probabl            prob         probabl
          wolves            wolv            wolv            wolv
         playing            play            play            play
              is              is              is              is
             dog             dog             dog             dog
             the             the             the             the
         beaches           beach           beach           beach
        grounded          ground          ground          ground
          dreamt          dreamt          dreamt          dreamt
        envision           envis           envid           envis


## Converting text to its base form using lemmatization


In [23]:
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

words = ['table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']

lemmatizers = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']

lemmatizer_wordnet = WordNetLemmatizer()

formatted_row = '{:>24}' * (len(lemmatizers) + 1)
print ('\n', formatted_row.format('WORD', *lemmatizers), '\n')

for word in words:
   lemmatized_words = [lemmatizer_wordnet.lemmatize(word,
    pos='n'),
     lemmatizer_wordnet.lemmatize(word, pos='v')]
   print (formatted_row.format(word, *lemmatized_words))


[nltk_data] Downloading package wordnet to /root/nltk_data...



                     WORD         NOUN LEMMATIZER         VERB LEMMATIZER 

                   table                   table                   table
                probably                probably                probably
                  wolves                    wolf                  wolves
                 playing                 playing                    play
                      is                      is                      be
                     dog                     dog                     dog
                     the                     the                     the
                 beaches                   beach                   beach
                grounded                grounded                  ground
                  dreamt                  dreamt                   dream
                envision                envision                envision


## Dividing text using chunking

In [25]:
import nltk
nltk.download('brown')

import numpy as np
from nltk.corpus import brown

def splitter(data, num_words):
   words = data.split(' ')
   output = []

   cur_count = 0
   cur_words = []

   for word in words:
     cur_words.append(word)
     cur_count += 1

   if cur_count == num_words:
         output.append(' '.join(cur_words))
         cur_words = []
         cur_count = 0

   output.append(' '.join(cur_words) )
   return output

if __name__=='__main__':

    data = ' '.join(brown.words()[:10000])


    num_words = 1700

    chunks = []
    counter = 0

    text_chunks = splitter(data, num_words)
    print ("Number of text chunks =", len(text_chunks))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Number of text chunks = 1


## Building a text classifier

### tf means term frequency & idf means inverse document frequency


In [29]:
from sklearn.datasets import fetch_20newsgroups

category_map = {'misc.forsale': 'Sales', 'rec.motorcycles': 'Motorcycles', 'rec.sport.baseball': 'Baseball', 'sci.crypt': 'Cryptography', 'sci.space': 'Space'}

training_data = fetch_20newsgroups(subset='train', categories=category_map.keys(), shuffle=True, random_state=7)

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_train_termcounts = vectorizer.fit_transform(training_data.data)
print("\nDimensions of training data:", X_train_termcounts.shape)

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

input_data = [
    "The curveballs of right handed pitchers tend to curve to the left",
        "Caesar cipher is an ancient form of encryption",
            "This two-wheeler is really good on slippery roads"
            ]
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_termcounts)

classifier = MultinomialNB().fit(X_train_tfidf, training_data.target)

X_input_termcounts = vectorizer.transform(input_data)
X_input_tfidf = tfidf_transformer.transform(X_input_termcounts)

predicted_categories = classifier.predict(X_input_tfidf)

for sentence, category in zip(input_data, predicted_categories):
       print('\nInput:', sentence, '\nPredicted category:', category_map[training_data.target_names[category]])



Dimensions of training data: (2968, 40605)

Input: The curveballs of right handed pitchers tend to curve to the left 
Predicted category: Baseball

Input: Caesar cipher is an ancient form of encryption 
Predicted category: Cryptography

Input: This two-wheeler is really good on slippery roads 
Predicted category: Motorcycles


## Analyzing the sentiment of a sentence

In [37]:
import nltk
nltk.download('movie_reviews')

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews


def extract_features(word_list):
    return dict([(word, True) for word in word_list])


positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')


features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in positive_fileids]
features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in negative_fileids]


threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))


features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]


classifier = NaiveBayesClassifier.train(features_train)


print("\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))


print("\nTop 10 most informative words:")
for item in classifier.most_informative_features()[:10]:
        print(item[0])


input_reviews = [
            "It is an amazing movie",
                "This is a dull movie. I would never recommend it to anyone.",
                    "The cinematography is pretty great in this movie",
                        "The direction was terrible and the story was all over the place"
                        ]


print("\nPredictions:")
for review in input_reviews:
                            print("\nReview:", review)
                            probdist = classifier.prob_classify(extract_features(review.split()))
                            pred_sentiment = probdist.max()
                            print("Predicted sentiment:", pred_sentiment)
                            print("Probability:", round(probdist.prob(pred_sentiment), 2))


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.



Accuracy of the classifier: 0.735

Top 10 most informative words:
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
affecting
animators

Predictions:

Review: It is an amazing movie
Predicted sentiment: Positive
Probability: 0.61

Review: This is a dull movie. I would never recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.77

Review: The cinematography is pretty great in this movie
Predicted sentiment: Positive
Probability: 0.67

Review: The direction was terrible and the story was all over the place
Predicted sentiment: Negative
Probability: 0.63


## Identifying patterns in text using topic modeling

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora
from nltk.corpus import stopwords

def load_data(input_file):
   data = []
   with open(input_file, 'r') as f:
     for line in f.readlines():
      data.append(line[:-1])
   return data

class Preprocessor(object):

    def __init__(self):

      self.tokenizer = RegexpTokenizer(r'\w+')

      self.stop_words_english = stopwords.words('english')

      self.stemmer = SnowballStemmer('english')

    def process(self, input_text):

         tokens = self.tokenizer.tokenize(input_text.lower())
         tokens_stopwords = [x for x in tokens if not x in self.stop_words_english]

         tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]


         return tokens_stemmed

if __name__=='__main__':

            input_file = 'data_modeling.txt'

            data = load_data(input_file)


            preprocessor = Preprocessor()


            processed_tokens = [preprocessor.process(x) for x in data]



            dict_tokens = corpora.Dictionary(processed_tokens)


            corpus = [dict_tokens.doc2bow(text) for text in processed_tokens]

            num_topics = 2
            num_words = 4

            ldamodel = models.ldamodel.LdaModel(corpus,  num_topics=num_topics, id2word=dict_tokens,  passes=25)

            print ("Most contributing words to the topics:")
            for item in ldamodel.print_topics(num_topics=num_topics,
               num_words=num_words):
                print( "\nTopic", item[0], "==>", item[1])