---
#**EXTRACTION METHOD**
---

# Install necessary dependencies

In [1]:
import nltk
import numpy as np
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get Text Document

We use the description of a very popular role-playing game (RPG) Skyrim from
Bethesda Softworks for summarization. 

In [2]:
DOCUMENT = """
The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.

It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.

The game's main story revolves around the player character's quest to defeat Alduin the World-Eater, 
a dragon who is prophesied to destroy the world. The game is set 200 years after the events of Oblivion 
and takes place in the fictional province of Skyrim. Over the course of the game, the player completes 
quests and develops the character by improving skills. The game continues the open-world tradition of 
its predecessors by allowing the player to travel anywhere in the game world at any time, and to ignore 
or postpone the main storyline indefinitely.

The team opted for a unique and more diverse open world than Oblivion's Imperial Province of Cyrodiil, 
which game director and executive producer Todd Howard considered less interesting by comparison. 
The game was released to critical acclaim, with reviewers particularly mentioning the character advancement 
and setting, and is considered to be one of the greatest video games of all time.
"""

In [26]:
import re
DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT) #Combining all the paragraphs
DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
DOCUMENT = DOCUMENT.strip()

In [23]:
print(DOCUMENT)

The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.  It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.  The game's main story revolves around the player character's quest to defeat Alduin the World-Eater,  a dragon who is prophesied to destroy the world. The game is set 200 years after the events of Oblivion  and takes place in the fictional province of Skyrim. Over the course of the game, the player completes  quests and develops the character by improving skills. The game continues the open-world tradition of  its predecessors by allowing the player to travel anywhere in the game world at any time, and to ignore  or postpone the main storyline indefinitely.  The team opted for a unique and more diverse open world than Oblivion's Imperial Province of Cyrodiil,  which game director and executive producer Todd Howard considered less interesting by co

Sentences Collection

In [24]:
sentences = nltk.sent_tokenize(DOCUMENT)
len(sentences)

8

In [25]:
sentences

['The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.',
 'It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.',
 "The game's main story revolves around the player character's quest to defeat Alduin the World-Eater,  a dragon who is prophesied to destroy the world.",
 'The game is set 200 years after the events of Oblivion  and takes place in the fictional province of Skyrim.',
 'Over the course of the game, the player completes  quests and develops the character by improving skills.',
 'The game continues the open-world tradition of  its predecessors by allowing the player to travel anywhere in the game world at any time, and to ignore  or postpone the main storyline indefinitely.',
 "The team opted for a unique and more diverse open world than Oblivion's Imperial Province of Cyrodiil,  which game director and executive producer Todd Howard considered

# Basic Text pre-processing

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document) #Function Def Vectorize
norm_sentences = normalize_corpus(sentences)
norm_sentences[:3]

array(['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks',
       'fifth main installment elder scrolls series following elder scrolls iv oblivion',
       'games main story revolves around player characters quest defeat alduin worldeater dragon prophesied destroy world'],
      dtype='<U157')

# **UNIGRAM**

##TF

In [8]:
norm_sentences

array(['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks',
       'fifth main installment elder scrolls series following elder scrolls iv oblivion',
       'games main story revolves around player characters quest defeat alduin worldeater dragon prophesied destroy world',
       'game set years events oblivion takes place fictional province skyrim',
       'course game player completes quests develops character improving skills',
       'game continues openworld tradition predecessors allowing player travel anywhere game world time ignore postpone main storyline indefinitely',
       'team opted unique diverse open world oblivions imperial province cyrodiil game director executive producer todd howard considered less interesting comparison',
       'game released critical acclaim reviewers particularly mentioning character advancement setting considered one greatest video games time'],
      dtype='<U157')

In [9]:
len(norm_sentences)

8

In [10]:
all = " ".join(norm_sentences)
words = nltk.word_tokenize(all)
words = set(words)
words = list(words)
len(words)

87

In [11]:
def computeTF(doc):
  valTF = []
  for each in doc:
    wordDict = dict.fromkeys(words, 0)
    sentence = each.split(" ")

    for word in sentence:
      wordDict[word]+=1

    res = []
    for i in wordDict:
      comp = float(wordDict[i] / len(each))
      res.append(round(comp, 4))
    
    valTF.append(res)
  return(valTF)
    
TF = computeTF(norm_sentences)

In [12]:
TF = np.array(TF)
TF = TF.T
type(TF)

numpy.ndarray

In [13]:
df = pd.DataFrame(TF, index=words)
df.sort_index(ascending=True).head(25)

Unnamed: 0,0,1,2,3,4,5,6,7
acclaim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0074
action,0.0088,0.0,0.0,0.0,0.0,0.0,0.0,0.0
advancement,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0074
alduin,0.0,0.0,0.0088,0.0,0.0,0.0,0.0,0.0
allowing,0.0,0.0,0.0,0.0,0.0,0.0072,0.0,0.0
anywhere,0.0,0.0,0.0,0.0,0.0,0.0072,0.0,0.0
around,0.0,0.0,0.0088,0.0,0.0,0.0,0.0,0.0
bethesda,0.0177,0.0,0.0,0.0,0.0,0.0,0.0,0.0
character,0.0,0.0,0.0,0.0,0.0141,0.0,0.0,0.0074
characters,0.0,0.0,0.0088,0.0,0.0,0.0,0.0,0.0


##TF-IDF

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray() 
dt_matrix

<class 'scipy.sparse.csr.csr_matrix'>


array([[0.        , 0.26910962, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.53821924, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26910962,
        0.        , 0.        , 0.        , 0.        , 0.22553486,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.26895395, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.26910962, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.26910962,
        0.22553486, 0.        , 0.        , 0.        , 0.        ,
        0.22553486, 0.26910962, 0.        , 0.  

In [15]:
td_matrix = dt_matrix.T #Transpose Matrix
td_matrix
print(td_matrix.shape)

(86, 8)


In [16]:
vocab = tv.get_feature_names()
len(vocab)



86

In [17]:
pd.DataFrame(np.round(td_matrix, 2), index=vocab)

Unnamed: 0,0,1,2,3,4,5,6,7
acclaim,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.27
action,0.27,0.0,0.00,0.00,0.0,0.00,0.00,0.00
advancement,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.27
alduin,0.00,0.0,0.27,0.00,0.0,0.00,0.00,0.00
allowing,0.00,0.0,0.00,0.00,0.0,0.26,0.00,0.00
...,...,...,...,...,...,...,...,...
unique,0.00,0.0,0.00,0.00,0.0,0.00,0.23,0.00
video,0.23,0.0,0.00,0.00,0.0,0.00,0.00,0.23
world,0.00,0.0,0.20,0.00,0.0,0.19,0.17,0.00
worldeater,0.00,0.0,0.27,0.00,0.0,0.00,0.00,0.00
