---
#**EXTRACTION METHOD**
---

# Install necessary dependencies

In [1]:
import nltk
import numpy as np
import pandas as pd
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get Text Document

We use the description of a very popular role-playing game (RPG) Skyrim from
Bethesda Softworks for summarization. 

In [2]:
DOCUMENT = """
The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.

It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.
"""

In [3]:
import re
DOCUMENT = re.sub(r'\n|\r', ' ', DOCUMENT) #Combining all the paragraphs
DOCUMENT = re.sub(r' +', ' ', DOCUMENT)
DOCUMENT = DOCUMENT.strip()

In [4]:
print(DOCUMENT)

The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks. It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.


Sentences Collection

In [5]:
sentences = nltk.sent_tokenize(DOCUMENT)
len(sentences)

2

In [6]:
sentences

['The Elder Scrolls V: Skyrim is an action role-playing video game developed by Bethesda Game Studios and published by Bethesda Softworks.',
 'It is the fifth main installment in The Elder Scrolls series, following The Elder Scrolls IV: Oblivion.']

# Basic Text pre-processing

In [7]:
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document) #Function Def Vectorize
norm_sentences = normalize_corpus(sentences)
norm_sentences[:3]

array(['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks',
       'fifth main installment elder scrolls series following elder scrolls iv oblivion'],
      dtype='<U113')

# *I - UNIGRAM*

##TF

In [8]:
print("Norm Sentences:", norm_sentences)
print("Len of Norm Sentences:", len(norm_sentences))

Norm Sentences: ['elder scrolls v skyrim action roleplaying video game developed bethesda game studios published bethesda softworks'
 'fifth main installment elder scrolls series following elder scrolls iv oblivion']
Len of Norm Sentences: 2


In [10]:
# Unique words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range =(1, 1))
X1 = vectorizer.fit_transform(norm_sentences)
words = (vectorizer.get_feature_names())
print(words)

['action', 'bethesda', 'developed', 'elder', 'fifth', 'following', 'game', 'installment', 'iv', 'main', 'oblivion', 'published', 'roleplaying', 'scrolls', 'series', 'skyrim', 'softworks', 'studios', 'video']




In [11]:
def computeTF(doc):
  valTF = []
  for each in doc:
    wordDict = dict.fromkeys(words, 0)
    sentence = each.split(" ")

    for word in sentence:
      wordDict[word]+=1

    res = []
    for i in wordDict:
      comp = float(wordDict[i] / len(each))
      res.append(round(comp, 4))
    
    valTF.append(res)
  return(valTF)
    
TF = computeTF(norm_sentences)

KeyError: ignored

In [None]:
TF = np.array(TF)
TF = TF.T
type(TF)

In [None]:
df = pd.DataFrame(TF, index=words)
df.sort_index(ascending=True).head(10)

##TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray() 

In [None]:
td_matrix = dt_matrix.T #Transpose Matrix
td_matrix
print(td_matrix.shape)

In [None]:
vocab = tv.get_feature_names()
len(vocab)

In [None]:
pd.DataFrame(np.round(td_matrix, 2), index=vocab)

# *II - BIGRAM* 

In [None]:
# Forming Bigrams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range = (2,2))
X1 = vectorizer.fit_transform(norm_sentences) 
features = (vectorizer.get_feature_names())
print("\n\nFeatures : \n", features)
print("\n\nX1 : \n", X1.toarray())

## TF

## TF-IDF

In [None]:
# Applying TFIDF
vectorizer = TfidfVectorizer(ngram_range = (2, 2))
X2 = vectorizer.fit_transform(norm_sentences)
scores = (X2.toarray())
print("\n\nScores : \n", scores)