---
#**EXTRACTION METHOD**
---

# Install necessary dependencies

In [1]:
import nltk
import numpy as np
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get Text Document

We use the description of a very popular role-playing game (RPG) Skyrim from
Bethesda Softworks for summarization. 

In [2]:
DOCUMENT = """
The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.

It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.
"""

In [3]:
import re
DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT) #Combining all the paragraphs
DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
DOCUMENT = DOCUMENT.strip()

In [4]:
print(DOCUMENT)

The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks. It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.


Sentences Collection

In [5]:
sentences = nltk.sent_tokenize(DOCUMENT)
len(sentences)

2

In [6]:
sentences

['The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.',
 'It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.']

# Basic Text pre-processing

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document) #Function Def Vectorize
norm_sentences = normalize_corpus(sentences)
norm_sentences[:3]

array(['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks',
       'fifth main installment elder scrolls series following elder scrolls iv oblivion'],
      dtype='<U113')

# *I - UNIGRAM*

##TF

In [22]:
print("Norm Sentences:", norm_sentences)
print("Len of Norm Sentences:", len(norm_sentences))

Norm Sentences: ['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks'
 'fifth main installment elder scrolls series following elder scrolls iv oblivion']
Len of Norm Sentences: 2


In [27]:
# Unique words
all = " ".join(norm_sentences)
words = nltk.word_tokenize(all)
words = set(words)
words = list(words)
print(type(words))

vectorizer = CountVectorizer(ngram_range =(1, 1))
X1 = vectorizer.fit_transform(norm_sentences) 
features = (vectorizer.get_feature_names())
print((features))

<class 'list'>
<class 'list'>




In [11]:
def computeTF(doc):
  valTF = []
  for each in doc:
    wordDict = dict.fromkeys(words, 0)
    sentence = each.split(" ")

    for word in sentence:
      wordDict[word]+=1

    res = []
    for i in wordDict:
      comp = float(wordDict[i] / len(each))
      res.append(round(comp, 4))
    
    valTF.append(res)
  return(valTF)
    
TF = computeTF(norm_sentences)

In [12]:
TF = np.array(TF)
TF = TF.T
type(TF)

numpy.ndarray

In [13]:
df = pd.DataFrame(TF, index=words)
df.sort_index(ascending=True).head(10)

Unnamed: 0,0,1
action,0.0088,0.0
bethesda,0.0177,0.0
developed,0.0088,0.0
elder,0.0088,0.0253
fifth,0.0,0.0127
following,0.0,0.0127
game,0.0177,0.0
installment,0.0,0.0127
iv,0.0,0.0127
main,0.0,0.0127


##TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray() 

In [15]:
td_matrix = dt_matrix.T #Transpose Matrix
td_matrix
print(td_matrix.shape)

(19, 2)


In [16]:
vocab = tv.get_feature_names()
len(vocab)



19

In [17]:
pd.DataFrame(np.round(td_matrix, 2), index=vocab)

Unnamed: 0,0,1
action,0.24,0.0
bethesda,0.48,0.0
developed,0.24,0.0
elder,0.17,0.43
fifth,0.0,0.3
following,0.0,0.3
game,0.48,0.0
installment,0.0,0.3
iv,0.0,0.3
main,0.0,0.3


# *II - BIGRAM* 

In [20]:
# Forming Bigrams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range = (2,2))
X1 = vectorizer.fit_transform(norm_sentences) 
features = (vectorizer.get_feature_names())
print("\n\nFeatures : \n", features)
print("\n\nX1 : \n", X1.toarray())



Features : 
 ['action roleplaying', 'bethesda game', 'bethesda softworks', 'developed bethesda', 'elder scrolls', 'fifth main', 'following elder', 'game developed', 'game studios', 'installment elder', 'iv oblivion', 'main installment', 'published bethesda', 'roleplaying video', 'scrolls iv', 'scrolls series', 'scrolls skyrim', 'series following', 'skyrim action', 'studios published', 'video game']


X1 : 
 [[1 1 1 1 1 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 1]
 [0 0 0 0 2 1 1 0 0 1 1 1 0 0 1 1 0 1 0 0 0]]




## TF

## TF-IDF

In [21]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (2, 2))
X2 = vectorizer.fit_transform(norm_sentences)
scores = (X2.toarray())
print("\n\nScores : \n", scores)



Scores : 
 [[0.2827721  0.2827721  0.2827721  0.2827721  0.20119468 0.
  0.         0.2827721  0.2827721  0.         0.         0.
  0.2827721  0.2827721  0.         0.         0.2827721  0.
  0.2827721  0.2827721  0.2827721 ]
 [0.         0.         0.         0.         0.44943642 0.3158336
  0.3158336  0.         0.         0.3158336  0.3158336  0.3158336
  0.         0.         0.3158336  0.3158336  0.         0.3158336
  0.         0.         0.        ]]
