### STEPS:

#### Pipeline - 1
1. Tokenization
1. Remove StopWords and Punctuation
1. Stemming

#### Pipeline - 2
1. Tokenization
1. POS Tagger
1. Lemmatization

***Remember to Deal With Everything in Lower Cases***

In [1]:
import nltk
nltk.download('punkt')           # For Tokenizing
nltk.download('stopwords')       # For Stopwords
nltk.download('wordnet')         # For Lemmatization
nltk.download('averaged_perceptron_tagger') # For POS Tagging
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from nltk import pos_tag         # POS Tagger
from nltk.corpus import wordnet as wn
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kapilkalra04\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kapilkalra04\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kapilkalra04\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kapilkalra04\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
input = [
    'Industrial Disease',
'Private Investigations',
'So Far Away',
'Twisting by the Pool',
'Skateaway',
'Walk of Life',
'Romeo and Juliet',
'Tunnel of Love',
'Money for Nothing',
'Sultans of Swing',
'Stairway To Heaven',
'Kashmir',
'Achilles Last Stand',
'Whole Lotta Love',
'Immigrant Song',
'Black Dog',
'When The Levee Breaks',
'Since I\'ve Been Lovin\' You',
'Since I\'ve Been Loving You',
'Over the Hills and Far Away',
'Dazed and Confused' 
]

In [3]:
len(input)

21

#### TOKENIZER

In [4]:
def tokenize(sentence):
     return nltk.tokenize.word_tokenize(sentence)

In [5]:
print input[17]
print tokenize(input[17])             # there a "(\')" being tokenized we will remove it later

Since I've Been Lovin' You
['Since', 'I', "'ve", 'Been', 'Lovin', "'", 'You']


#### POS TAGGER CONVERTER

In [6]:
def penn_to_wn(tag):
    """
    Convert between the PennTreebank (pos_tag) tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

#### LEMMATIZER

In [7]:
wnl = WordNetLemmatizer()

In [8]:
def lemmatize(word, pos=wn.NOUN):
    return wnl.lemmatize(word,pos=pos)

In [9]:
print input[3]
print tokenize(input[3])
for w in tokenize(input[3]):
    w.lower()
    print lemmatize(w)

Twisting by the Pool
['Twisting', 'by', 'the', 'Pool']
Twisting
by
the
Pool


#### STEMMER

In [10]:
def stem(word):
    stemmer = EnglishStemmer()
    return stemmer.stem(word)

In [11]:
print input[3]
print tokenize(input[3])
for w in tokenize(input[3]):
    w.lower()
    print stem(w)

Twisting by the Pool
['Twisting', 'by', 'the', 'Pool']
twist
by
the
pool


#### STOPWORDS

In [12]:
stopwords = nltk.corpus.stopwords.words('English')
print stopwords

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're", u"you've", u"you'll", u"you'd", u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u"she's", u'her', u'hers', u'herself', u'it', u"it's", u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u"that'll", u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'eac

#### INDEXED DATABASES

#### Pipeline 1

In [13]:
db = {}

In [14]:
for sentence in input:
    words = tokenize(sentence)
    for word in words:
        word = word.lower()
        if word not in stopwords and word not in string.punctuation:
            root = stem(word)
            if db.has_key(root):
                db[root].append(sentence)
            else :
                db[root] = [sentence]

In [15]:
db

{u'achill': ['Achilles Last Stand'],
 u'away': ['So Far Away', 'Over the Hills and Far Away'],
 u'black': ['Black Dog'],
 u'break': ['When The Levee Breaks'],
 u'confus': ['Dazed and Confused'],
 u'daze': ['Dazed and Confused'],
 u'diseas': ['Industrial Disease'],
 u'dog': ['Black Dog'],
 u'far': ['So Far Away', 'Over the Hills and Far Away'],
 u'heaven': ['Stairway To Heaven'],
 u'hill': ['Over the Hills and Far Away'],
 u'immigr': ['Immigrant Song'],
 u'industri': ['Industrial Disease'],
 u'investig': ['Private Investigations'],
 u'juliet': ['Romeo and Juliet'],
 u'kashmir': ['Kashmir'],
 u'last': ['Achilles Last Stand'],
 u'leve': ['When The Levee Breaks'],
 u'life': ['Walk of Life'],
 u'lotta': ['Whole Lotta Love'],
 u'love': ['Tunnel of Love', 'Whole Lotta Love', "Since I've Been Loving You"],
 u'lovin': ["Since I've Been Lovin' You"],
 u'money': ['Money for Nothing'],
 u'noth': ['Money for Nothing'],
 u'pool': ['Twisting by the Pool'],
 u'privat': ['Private Investigations'],
 u'r

#### Pipeline 2

In [16]:
db2 = {}

In [17]:
for sentence in input:
    words = tokenize(sentence)
    tagged_sentence = pos_tag(words)
    for word, tag in tagged_sentence:
        word = word.lower()
        tag = penn_to_wn(tag)
        if tag in (wn.NOUN,wn.ADJ,wn.VERB,wn.ADV):
            root = lemmatize(word,tag)
            if db2.has_key(root):
                db2[root].append(sentence)
            else :
                db2[root] = [sentence]

In [18]:
db2

{"'ve": ["Since I've Been Lovin' You", "Since I've Been Loving You"],
 'achilles': ['Achilles Last Stand'],
 'away': ['So Far Away', 'Over the Hills and Far Away'],
 'been': ["Since I've Been Lovin' You", "Since I've Been Loving You"],
 'black': ['Black Dog'],
 u'break': ['When The Levee Breaks'],
 u'confuse': ['Dazed and Confused'],
 u'daze': ['Dazed and Confused'],
 'disease': ['Industrial Disease'],
 'dog': ['Black Dog'],
 'far': ['So Far Away', 'Over the Hills and Far Away'],
 'heaven': ['Stairway To Heaven'],
 u'hill': ['Over the Hills and Far Away'],
 'immigrant': ['Immigrant Song'],
 'industrial': ['Industrial Disease'],
 u'investigation': ['Private Investigations'],
 'juliet': ['Romeo and Juliet'],
 'kashmir': ['Kashmir'],
 'last': ['Achilles Last Stand'],
 'levee': ['When The Levee Breaks'],
 'life': ['Walk of Life'],
 'lotta': ['Whole Lotta Love'],
 'love': ['Tunnel of Love', 'Whole Lotta Love'],
 'lovin': ["Since I've Been Lovin' You"],
 'loving': ["Since I've Been Loving Yo

#### WE CAN OBSERVE THAT BY USING LEMMATIZATION WE PRESERVED THE MORPHOLOGY