<a href="https://colab.research.google.com/github/jenbam/python_nltk_data_preprocessing/blob/main/Sent_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
# Toy corpus: four positive reviews followed by three negative ones.
X_train = [
    "This was awesome an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surely a Disappointing movie",
]

# Sentences kept aside for prediction only.
X_test = [
    "I was happy & happy and i loved the acting in the movie",
    "The movie I saw was bad",
]

# Labels aligned with X_train by position: 1 = positive, 0 = negative.
y_train = [1, 1, 1, 1, 0, 0, 0]

In [74]:
X_train

['This was awesome an awesome movie',
 'Great movie! I liked it a lot',
 'Happy Ending! awesome acting by the hero',
 'loved it! truly great',
 'bad not upto the mark',
 'could have been better',
 'Surely a Disappointing movie']

Let's clean our data. Each sentence is lowercased and then goes through:


*   Tokenization
*   Stop word removal
*   Stemming

In [75]:
from nltk.tokenize import RegexpTokenizer

In [76]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [77]:
# Download NLTK's stop-word corpus; stopwords.words('english') reads from it.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
# Build the shared preprocessing objects used by getCleanedText.
tokenizer = RegexpTokenizer(r'\w+')    # one token per run of word characters, so punctuation is dropped
# English stop-word list, stored as a set for fast membership tests.
en_stopwords = set(stopwords.words('english'))
# Porter stemmer, e.g. "awesome" -> "awesom".
ps = PorterStemmer()

In [79]:
def getCleanedText(text):
  """Return `text` normalised to a space-separated string of word stems.

  The text is lowercased, split into tokens with the module-level
  `tokenizer`, stripped of tokens found in `en_stopwords`, and every
  surviving token is stemmed with `ps`.

  Args:
    text: Raw sentence as a string.

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no
    token survives the stop-word filter.
  """
  words = tokenizer.tokenize(text.lower())

  # Filter on the raw token first, then stem only what is kept.
  stems = [ps.stem(word) for word in words if word not in en_stopwords]

  return " ".join(stems)

In [80]:
# Run the same cleaning pipeline over the training and the test sentences.
X_clean = [getCleanedText(sentence) for sentence in X_train]
Xt_clean = [getCleanedText(sentence) for sentence in X_test]

In [81]:
X_clean 

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'sure disappoint movi']

# Vectorization 

In [82]:
from sklearn.feature_extraction.text import CountVectorizer


In [83]:
# ngram_range=(1,2): count single tokens (unigrams) and adjacent token pairs (bigrams).
cv = CountVectorizer(ngram_range=(1,2))

In [84]:
# Learn the vocabulary from the cleaned training text and build its dense count matrix.
X_vec = cv.fit_transform(X_clean).toarray()

In [85]:
X_vec 


array([[0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]])

In [86]:
print(cv.get_feature_names_out())   # vocabulary (unigrams and bigrams) of the bag-of-words model

['act' 'act hero' 'awesom' 'awesom act' 'awesom awesom' 'awesom movi'
 'bad' 'bad upto' 'better' 'could' 'could better' 'disappoint'
 'disappoint movi' 'end' 'end awesom' 'great' 'great movi' 'happi'
 'happi end' 'hero' 'like' 'like lot' 'lot' 'love' 'love truli' 'mark'
 'movi' 'movi like' 'sure' 'sure disappoint' 'truli' 'truli great' 'upto'
 'upto mark']


In [87]:
# transform only (no fit): reuse the vocabulary already learned from the training text.
Xt_vec = cv.transform(Xt_clean).toarray()

### Classification - using Multinomial Naive Bayes





In [88]:
## Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

In [89]:
# Multinomial Naive Bayes classifier, left at its default settings.
mn = MultinomialNB()

In [93]:
# Train on the n-gram counts; labels are 1 = positive, 0 = negative.
mn.fit(X_vec, y_train)

MultinomialNB()

In [94]:
# Predict a sentiment label for each vectorised test sentence.
y_pred = mn.predict(Xt_vec)

In [95]:
y_pred 

array([1, 0])