# ESE Test - NLP
# Jerin Mathew
# Roll No: 2139455

In [6]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [3]:
# Read the whole corpus into memory as a single string.
# A context manager guarantees the file handle is closed once the text is read
# (the bare open(...).read() form leaves the handle open until garbage collection).
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm
# the file's encoding before pinning one.
with open("C:/ClassesMSC/DataSets/ESE-NLP.txt") as corpus_file:
    data = corpus_file.read()
data

'"In 1940, Ferrari produced a race car - the Tipo 815, based on a Fiat platform. It was the first Ferrari car and debuted at the 1940 Mille, but due to World War Il it saw little competition. In 1943, the Ferrari factory moved to Maranello, where it has remained ever since. The early mid-engined Ferrari. Ferrari cars typically featured bodywork designed and customised by independent coachbuilders such as Pininfarina, Scaglietti, Zagato, Vignale and Bertone"'

In [4]:
# Sentence tokenization: split the raw corpus string into a list of sentences.
sentence_tokens = nltk.sent_tokenize(text=data)
sentence_tokens

['"In 1940, Ferrari produced a race car - the Tipo 815, based on a Fiat platform.',
 'It was the first Ferrari car and debuted at the 1940 Mille, but due to World War Il it saw little competition.',
 'In 1943, the Ferrari factory moved to Maranello, where it has remained ever since.',
 'The early mid-engined Ferrari.',
 'Ferrari cars typically featured bodywork designed and customised by independent coachbuilders such as Pininfarina, Scaglietti, Zagato, Vignale and Bertone"']

In [5]:
# Word tokenization: split the raw corpus string into word and punctuation tokens.
word_tokens = nltk.word_tokenize(text=data)
word_tokens

['``',
 'In',
 '1940',
 ',',
 'Ferrari',
 'produced',
 'a',
 'race',
 'car',
 '-',
 'the',
 'Tipo',
 '815',
 ',',
 'based',
 'on',
 'a',
 'Fiat',
 'platform',
 '.',
 'It',
 'was',
 'the',
 'first',
 'Ferrari',
 'car',
 'and',
 'debuted',
 'at',
 'the',
 '1940',
 'Mille',
 ',',
 'but',
 'due',
 'to',
 'World',
 'War',
 'Il',
 'it',
 'saw',
 'little',
 'competition',
 '.',
 'In',
 '1943',
 ',',
 'the',
 'Ferrari',
 'factory',
 'moved',
 'to',
 'Maranello',
 ',',
 'where',
 'it',
 'has',
 'remained',
 'ever',
 'since',
 '.',
 'The',
 'early',
 'mid-engined',
 'Ferrari',
 '.',
 'Ferrari',
 'cars',
 'typically',
 'featured',
 'bodywork',
 'designed',
 'and',
 'customised',
 'by',
 'independent',
 'coachbuilders',
 'such',
 'as',
 'Pininfarina',
 ',',
 'Scaglietti',
 ',',
 'Zagato',
 ',',
 'Vignale',
 'and',
 'Bertone',
 "''"]

In [8]:
# Stop-word removal
# Build the English stop-word collection as a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Keep every token that is not in the stop-word set.
# The comparison is case-sensitive and applied to the raw tokens, so capitalised
# tokens such as 'In', 'It' and 'The' (and punctuation tokens) pass through unchanged.
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence)

['``', 'In', '1940', ',', 'Ferrari', 'produced', 'race', 'car', '-', 'Tipo', '815', ',', 'based', 'Fiat', 'platform', '.', 'It', 'first', 'Ferrari', 'car', 'debuted', '1940', 'Mille', ',', 'due', 'World', 'War', 'Il', 'saw', 'little', 'competition', '.', 'In', '1943', ',', 'Ferrari', 'factory', 'moved', 'Maranello', ',', 'remained', 'ever', 'since', '.', 'The', 'early', 'mid-engined', 'Ferrari', '.', 'Ferrari', 'cars', 'typically', 'featured', 'bodywork', 'designed', 'customised', 'independent', 'coachbuilders', 'Pininfarina', ',', 'Scaglietti', ',', 'Zagato', ',', 'Vignale', 'Bertone', "''"]


In [13]:
# Lemmatization: print each sample word next to its WordNet lemma
# (lemmatize() is called with its default arguments).
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for sample in ("coachbuilders", "typically", "cars"):
    print(f"{sample} :", lemmatizer.lemmatize(sample))

coachbuilders : coachbuilder
typically : typically
cars : car


In [15]:
# Stemming: print each sample word next to its Porter stem.
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
words = ["featured", "remained", "coachbuilders"]

for term in words:
    # The separator reproduces the "word  :  stem" layout (two spaces on each side of the colon).
    print(term, stemmer.stem(term), sep="  :  ")

featured  :  featur
remained  :  remain
coachbuilders  :  coachbuild


In [16]:
# Removing stop words, then vectorizing (unigram bag of words)

from sklearn.feature_extraction.text import CountVectorizer

# The whole corpus is passed as a single document, so the matrix has exactly one row.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([data])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); fall back to the old name on older installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
df_bow_sklearn = pd.DataFrame(X.toarray(), columns=feature_names)
df_bow_sklearn.head()



Unnamed: 0,1940,1943,815,based,bertone,bodywork,car,cars,coachbuilders,competition,...,race,remained,saw,scaglietti,tipo,typically,vignale,war,world,zagato
0,2,1,1,1,1,1,2,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [18]:
# Removing stop words, then vectorizing bigrams only (ngram_range=(2, 2))

# The whole corpus is passed as a single document, so the matrix has exactly one row.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 2))
X = vectorizer.fit_transform([data])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); fall back to the old name on older installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
df_bow_sklearn = pd.DataFrame(X.toarray(), columns=feature_names)
df_bow_sklearn.head()



Unnamed: 0,1940 ferrari,1940 mille,1943 ferrari,815 based,based fiat,bodywork designed,car debuted,car tipo,cars typically,coachbuilders pininfarina,...,race car,remained early,saw little,scaglietti zagato,tipo 815,typically featured,vignale bertone,war il,world war,zagato vignale
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [20]:
# Vectorization (bag of words) with no stop-word filtering
from sklearn.feature_extraction.text import CountVectorizer

# The whole corpus is passed as a single document, so the matrix has exactly one row.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([data])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); fall back to the old name on older installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
df_bow_sklearn = pd.DataFrame(X.toarray(), columns=feature_names)
df_bow_sklearn.head()



Unnamed: 0,1940,1943,815,and,as,at,based,bertone,bodywork,but,...,the,tipo,to,typically,vignale,war,was,where,world,zagato
0,2,1,1,3,1,1,1,1,1,1,...,5,1,2,1,1,1,1,1,1,1


In [22]:
# N-grams: unigrams (n=1) over the word-tokenized corpus, printed one tuple per line.
from nltk.util import ngrams

NGRAMS = ngrams(nltk.word_tokenize(data), 1)
for gram in NGRAMS:
    print(gram)

('``',)
('In',)
('1940',)
(',',)
('Ferrari',)
('produced',)
('a',)
('race',)
('car',)
('-',)
('the',)
('Tipo',)
('815',)
(',',)
('based',)
('on',)
('a',)
('Fiat',)
('platform',)
('.',)
('It',)
('was',)
('the',)
('first',)
('Ferrari',)
('car',)
('and',)
('debuted',)
('at',)
('the',)
('1940',)
('Mille',)
(',',)
('but',)
('due',)
('to',)
('World',)
('War',)
('Il',)
('it',)
('saw',)
('little',)
('competition',)
('.',)
('In',)
('1943',)
(',',)
('the',)
('Ferrari',)
('factory',)
('moved',)
('to',)
('Maranello',)
(',',)
('where',)
('it',)
('has',)
('remained',)
('ever',)
('since',)
('.',)
('The',)
('early',)
('mid-engined',)
('Ferrari',)
('.',)
('Ferrari',)
('cars',)
('typically',)
('featured',)
('bodywork',)
('designed',)
('and',)
('customised',)
('by',)
('independent',)
('coachbuilders',)
('such',)
('as',)
('Pininfarina',)
(',',)
('Scaglietti',)
(',',)
('Zagato',)
(',',)
('Vignale',)
('and',)
('Bertone',)
("''",)


In [24]:
# N-grams: bigrams (n=2) over the word-tokenized corpus, printed one tuple per line.
from nltk.util import ngrams

NGRAMS = ngrams(nltk.word_tokenize(data), 2)
for gram in NGRAMS:
    print(gram)

('``', 'In')
('In', '1940')
('1940', ',')
(',', 'Ferrari')
('Ferrari', 'produced')
('produced', 'a')
('a', 'race')
('race', 'car')
('car', '-')
('-', 'the')
('the', 'Tipo')
('Tipo', '815')
('815', ',')
(',', 'based')
('based', 'on')
('on', 'a')
('a', 'Fiat')
('Fiat', 'platform')
('platform', '.')
('.', 'It')
('It', 'was')
('was', 'the')
('the', 'first')
('first', 'Ferrari')
('Ferrari', 'car')
('car', 'and')
('and', 'debuted')
('debuted', 'at')
('at', 'the')
('the', '1940')
('1940', 'Mille')
('Mille', ',')
(',', 'but')
('but', 'due')
('due', 'to')
('to', 'World')
('World', 'War')
('War', 'Il')
('Il', 'it')
('it', 'saw')
('saw', 'little')
('little', 'competition')
('competition', '.')
('.', 'In')
('In', '1943')
('1943', ',')
(',', 'the')
('the', 'Ferrari')
('Ferrari', 'factory')
('factory', 'moved')
('moved', 'to')
('to', 'Maranello')
('Maranello', ',')
(',', 'where')
('where', 'it')
('it', 'has')
('has', 'remained')
('remained', 'ever')
('ever', 'since')
('since', '.')
('.', 'The')
('Th

In [25]:
# N-grams: trigrams (n=3) over the word-tokenized corpus, printed one tuple per line.
from nltk.util import ngrams

NGRAMS = ngrams(nltk.word_tokenize(data), 3)
for gram in NGRAMS:
    print(gram)

('``', 'In', '1940')
('In', '1940', ',')
('1940', ',', 'Ferrari')
(',', 'Ferrari', 'produced')
('Ferrari', 'produced', 'a')
('produced', 'a', 'race')
('a', 'race', 'car')
('race', 'car', '-')
('car', '-', 'the')
('-', 'the', 'Tipo')
('the', 'Tipo', '815')
('Tipo', '815', ',')
('815', ',', 'based')
(',', 'based', 'on')
('based', 'on', 'a')
('on', 'a', 'Fiat')
('a', 'Fiat', 'platform')
('Fiat', 'platform', '.')
('platform', '.', 'It')
('.', 'It', 'was')
('It', 'was', 'the')
('was', 'the', 'first')
('the', 'first', 'Ferrari')
('first', 'Ferrari', 'car')
('Ferrari', 'car', 'and')
('car', 'and', 'debuted')
('and', 'debuted', 'at')
('debuted', 'at', 'the')
('at', 'the', '1940')
('the', '1940', 'Mille')
('1940', 'Mille', ',')
('Mille', ',', 'but')
(',', 'but', 'due')
('but', 'due', 'to')
('due', 'to', 'World')
('to', 'World', 'War')
('World', 'War', 'Il')
('War', 'Il', 'it')
('Il', 'it', 'saw')
('it', 'saw', 'little')
('saw', 'little', 'competition')
('little', 'competition', '.')
('competiti

In [26]:
# N-grams: 4-grams (n=4) over the word-tokenized corpus, printed one tuple per line.
from nltk.util import ngrams

NGRAMS = ngrams(nltk.word_tokenize(data), 4)
for gram in NGRAMS:
    print(gram)

('``', 'In', '1940', ',')
('In', '1940', ',', 'Ferrari')
('1940', ',', 'Ferrari', 'produced')
(',', 'Ferrari', 'produced', 'a')
('Ferrari', 'produced', 'a', 'race')
('produced', 'a', 'race', 'car')
('a', 'race', 'car', '-')
('race', 'car', '-', 'the')
('car', '-', 'the', 'Tipo')
('-', 'the', 'Tipo', '815')
('the', 'Tipo', '815', ',')
('Tipo', '815', ',', 'based')
('815', ',', 'based', 'on')
(',', 'based', 'on', 'a')
('based', 'on', 'a', 'Fiat')
('on', 'a', 'Fiat', 'platform')
('a', 'Fiat', 'platform', '.')
('Fiat', 'platform', '.', 'It')
('platform', '.', 'It', 'was')
('.', 'It', 'was', 'the')
('It', 'was', 'the', 'first')
('was', 'the', 'first', 'Ferrari')
('the', 'first', 'Ferrari', 'car')
('first', 'Ferrari', 'car', 'and')
('Ferrari', 'car', 'and', 'debuted')
('car', 'and', 'debuted', 'at')
('and', 'debuted', 'at', 'the')
('debuted', 'at', 'the', '1940')
('at', 'the', '1940', 'Mille')
('the', '1940', 'Mille', ',')
('1940', 'Mille', ',', 'but')
('Mille', ',', 'but', 'due')
(',', 'but

In [29]:
# Rebuild the stop-word set and the sentence list, then print each sentence
# followed by a blank line.
from nltk.tokenize import word_tokenize, sent_tokenize

stop_words = set(stopwords.words('english'))
sentence_tokens = sent_tokenize(data)

for sentence in sentence_tokens:
    print(sentence, "\n")

"In 1940, Ferrari produced a race car - the Tipo 815, based on a Fiat platform. 

It was the first Ferrari car and debuted at the 1940 Mille, but due to World War Il it saw little competition. 

In 1943, the Ferrari factory moved to Maranello, where it has remained ever since. 

The early mid-engined Ferrari. 

Ferrari cars typically featured bodywork designed and customised by independent coachbuilders such as Pininfarina, Scaglietti, Zagato, Vignale and Bertone" 



In [30]:
# Part-of-speech tagging, one sentence at a time.
# The tagger model is downloaded first (a no-op message is logged when it is already present).
nltk.download('averaged_perceptron_tagger')

for sentence in sentence_tokens:
    # Stop words are dropped before tagging, so the tagger sees only the filtered tokens.
    words = [token for token in word_tokenize(sentence) if token not in stop_words]
    tagged = nltk.pos_tag(words)

    print()
    print(words, "\n")
    print()
    print(tagged)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JERIN\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!



['``', 'In', '1940', ',', 'Ferrari', 'produced', 'race', 'car', '-', 'Tipo', '815', ',', 'based', 'Fiat', 'platform', '.'] 


[('``', '``'), ('In', 'IN'), ('1940', 'CD'), (',', ','), ('Ferrari', 'NNP'), ('produced', 'VBD'), ('race', 'NN'), ('car', 'NN'), ('-', ':'), ('Tipo', 'NN'), ('815', 'CD'), (',', ','), ('based', 'VBN'), ('Fiat', 'NNP'), ('platform', 'NN'), ('.', '.')]

['It', 'first', 'Ferrari', 'car', 'debuted', '1940', 'Mille', ',', 'due', 'World', 'War', 'Il', 'saw', 'little', 'competition', '.'] 


[('It', 'PRP'), ('first', 'RB'), ('Ferrari', 'NNP'), ('car', 'NN'), ('debuted', 'VBD'), ('1940', 'CD'), ('Mille', 'NNP'), (',', ','), ('due', 'JJ'), ('World', 'NNP'), ('War', 'NNP'), ('Il', 'NNP'), ('saw', 'VBD'), ('little', 'JJ'), ('competition', 'NN'), ('.', '.')]

['In', '1943', ',', 'Ferrari', 'factory', 'moved', 'Maranello', ',', 'remained', 'ever', 'since', '.'] 


[('In', 'IN'), ('1943', 'CD'), (',', ','), ('Ferrari', 'NNP'), ('factory', 'NN'), ('moved', 'VBD'), ('Maranello