In [1]:
import nltk

In [2]:
# Sample paragraph used as the raw input text for the tokenization, stemming,
# lemmatization and bag-of-words cells that follow.
my_review = 'The Bharat Jodo Yatra ended ironically with a divisive public rant against “outsiders” doing business in J&K. This lack of understanding of India’s Constitution (there are no outsiders within our borders) and lack of appreciation for the hard-fought recent gains to integrate J&K — India’s Constitution fully applies there only since 2019 — makes us nostalgic for the time when President Giani Zail Singh openly wept at Srinagar in 1982 after Sheikh Abdullah’s death. “I have lost my leader,” he said. Gianiji was referring to Sheikh Sahib’s presidentship of the All India State Peoples’ Conference that agitated against the undemocratic royalty allied with the undemocratic British. Since Sheikh Sahib’s considerable legacy is being diminished by his party let’s take a closer look at his nationalism, pragmatism, and imperfections, as we imagine a Naya Kashmir.'

In [3]:
from nltk.stem import PorterStemmer

In [4]:
from nltk.corpus import stopwords

In [5]:
# Split the paragraph into a list of sentence strings. The stemming loop
# further down overwrites the entries of this list in place.
sentence = nltk.sent_tokenize(my_review)

In [6]:
# Single Porter stemmer instance, reused for every token in the stemming loop.
stemmer = PorterStemmer()

In [7]:
# Stem every sentence in place: tokenize it, drop English stop words, stem the
# remaining tokens, and store the space-joined result back into `sentence`.
# Because `sentence` is overwritten, re-running this cell stems already-stemmed
# text; re-run the tokenization cell first to start from the raw sentences.

# Build the stop-word set once. The original rebuilt it inside the
# comprehension filter, i.e. once for every token of every sentence.
stop_words = set(stopwords.words('english'))

for i in range(len(sentence)):
    words = nltk.word_tokenize(sentence[i])
    # NOTE(review): the membership test is case-sensitive, so capitalised
    # tokens such as "The" or "This" pass the filter; they only show up in
    # lowercase in the result because the stemmer lowercases its output.
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    sentence[i] = ' '.join(words)

In [8]:
# Display the stemmed sentences (one string per original sentence).
sentence

['the bharat jodo yatra end iron divis public rant “ outsid ” busi j & k .',
 'thi lack understand india ’ constitut ( outsid within border ) lack appreci hard-fought recent gain integr j & k — india ’ constitut fulli appli sinc 2019 — make us nostalg time presid giani zail singh openli wept srinagar 1982 sheikh abdullah ’ death .',
 '“ i lost leader , ” said .',
 'gianiji refer sheikh sahib ’ presidentship all india state peopl ’ confer agit undemocrat royalti alli undemocrat british .',
 'sinc sheikh sahib ’ consider legaci diminish parti let ’ take closer look nation , pragmat , imperfect , imagin naya kashmir .']

# Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer

In [10]:
# Tokenize the original paragraph again so that lemmatization starts from the
# raw sentences rather than from the stemmed strings held in `sentence`.
sentence1 = nltk.sent_tokenize(my_review)
# WordNet lemmatizer instance, reused for every token in the loop below.
lemmatizer = WordNetLemmatizer()

In [11]:
# Lemmatize every sentence in place: tokenize it, drop English stop words,
# lemmatize the remaining tokens, and store the space-joined result back into
# `sentence1`.

# Build the stop-word set once. The original rebuilt it inside the
# comprehension filter, i.e. once for every token of every sentence.
stop_words = set(stopwords.words('english'))

for i in range(len(sentence1)):
    words = nltk.word_tokenize(sentence1[i])
    # NOTE(review): the membership test is case-sensitive and the lemmatizer
    # keeps the token's case, so capitalised stop words such as "The" and
    # "This" remain in the result.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    sentence1[i] = ' '.join(words)

In [12]:
# Display the lemmatized sentences (one string per original sentence).
sentence1

['The Bharat Jodo Yatra ended ironically divisive public rant “ outsider ” business J & K .',
 'This lack understanding India ’ Constitution ( outsider within border ) lack appreciation hard-fought recent gain integrate J & K — India ’ Constitution fully applies since 2019 — make u nostalgic time President Giani Zail Singh openly wept Srinagar 1982 Sheikh Abdullah ’ death .',
 '“ I lost leader , ” said .',
 'Gianiji referring Sheikh Sahib ’ presidentship All India State Peoples ’ Conference agitated undemocratic royalty allied undemocratic British .',
 'Since Sheikh Sahib ’ considerable legacy diminished party let ’ take closer look nationalism , pragmatism , imperfection , imagine Naya Kashmir .']

# Bag Of Words: 
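Bag of Words represents text as vectors over the corpus vocabulary: every distinct word in the corpus becomes one position in the vector. For each sentence, each position is assigned a value — either a binary flag (word present or absent) or, as with the `CountVectorizer` used below, the number of times the word occurs in that sentence. From the assigned values, every sentence becomes a fixed-length numeric vector that can be passed to a model.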

In [13]:
import re 

In [16]:
# Build `corpus`: one cleaned, stemmed string per sentence of `my_review`.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()  # not used in this cell; kept in case another cell refers to it
sentence = nltk.sent_tokenize(my_review)
corpus = []

# Build the stop-word set once. The original rebuilt it inside the
# comprehension filter, i.e. once for every word of every sentence.
stop_words = set(stopwords.words('english'))

for i in range(len(sentence)):
    # Replace every character that is not an ASCII letter with a space; this
    # removes punctuation, digits and the typographic quotes in the text.
    review = re.sub('[^a-zA-Z]', ' ', sentence[i])
    # Lowercase before filtering so capitalised stop words are removed too.
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)
    

In [17]:
# Display the cleaned corpus (one lowercase, stemmed string per sentence).
corpus

['bharat jodo yatra end iron divis public rant outsid busi j k',
 'lack understand india constitut outsid within border lack appreci hard fought recent gain integr j k india constitut fulli appli sinc make us nostalg time presid giani zail singh openli wept srinagar sheikh abdullah death',
 'lost leader said',
 'gianiji refer sheikh sahib presidentship india state peopl confer agit undemocrat royalti alli undemocrat british',
 'sinc sheikh sahib consider legaci diminish parti let take closer look nation pragmat imperfect imagin naya kashmir']

# Model Building 

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; the vocabulary is capped at 500 terms.
cv = CountVectorizer(max_features=500)
# Learn the vocabulary from `corpus` and count term occurrences per sentence.
term_counts = cv.fit_transform(corpus)
# Convert the vectorizer's result to a dense array for display.
x = term_counts.toarray()

In [22]:
# Display the count matrix produced by the vectorizer (one row per corpus entry).
x

array([[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0],
       [1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 1, 0,
        1, 0, 0, 2, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,