# Implement text preprocessing steps such as tokenization, stopword removal, stemming, and lemmatization. Use a subset of the NLTK's movie reviews dataset

In [2]:
import nltk

In [3]:
from nltk.corpus import movie_reviews
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Step 1: Download necessary NLTK resources

In [5]:
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package movie_reviews to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

nltk.download('movie_reviews') => Dataset

nltk.download('stopwords') => This downloads the Stopwords Corpus from NLTK

nltk.download('punkt') => Punkt is a pre-trained tokenizer that can split text into sentences and words. The tokenizer is language-agnostic and can be used to separate punctuation and handle abbreviations properly.

nltk.download('wordnet') => WordNet is used for tasks like lemmatization and semantic analysis. Lemmatization involves reducing words to their base form (lemma), which requires understanding the context and part of speech of the word. WordNet helps in finding the base forms of words and their meanings.

# Step 2: Load a subset of the movie reviews dataset

In [8]:
#documents = movie_reviews.fileids()
documents = movie_reviews.fileids()[:5] 

In [9]:
documents

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

# Step 3: Initialize tools

In [11]:
stop_words = set(stopwords.words('english'))
stemming = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Step 4: Preprocess each document

In [13]:
for filied in documents:
    review_text = movie_reviews.raw(documents) #Raw review data
    
    #Tokens
    tokens = word_tokenize(review_text)
    print("Tokens: ",tokens,"\n")
    
    #StopWord removal
    stopWord_removal = [word for word in tokens if word.lower() not in stop_words]
    print("Stop words: ",stopWord_removal,"\n")
    
    #Stemming
    word_stemming = [stemming.stem(token) for token in stopWord_removal]
    print("Stemming: ",word_stemming,"\n")
    
    #Lemmatization
    word_lemma = [lemmatizer.lemmatize(token) for token in stopWord_removal]
    print("Lemmatization: ",word_lemma,"\n")
    
    print('-'*50)

Tokens:  ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'s", 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '``', 'sorta', '``', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind-fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad

# First 10 reviews

In [15]:
for felieds in documents:
    review_data = movie_reviews.raw(felieds)
    
    tokens = word_tokenize(review_data)
    print("Tokens: ",tokens[:10],"\n")
    
    filtered_data = [word for word in tokens if word.lower() not in stop_words]
    print("StopWords: ",filtered_data[:10],"\n")
    
    stem_data = [stemming.stem(token) for token in filtered_data]
    print("Stemming: ",stem_data[:10],"\n")
    
    lemma_data = [lemmatizer.lemmatize(token) for token in filtered_data]
    print("lemmatization: ",lemma_data[:10],"\n")
    
    print('-'*100)

Tokens:  ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party'] 

StopWords:  ['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink'] 

Stemming:  ['plot', ':', 'two', 'teen', 'coupl', 'go', 'church', 'parti', ',', 'drink'] 

lemmatization:  ['plot', ':', 'two', 'teen', 'couple', 'go', 'church', 'party', ',', 'drink'] 

----------------------------------------------------------------------------------------------------
Tokens:  ['the', 'happy', 'bastard', "'s", 'quick', 'movie', 'review', 'damn', 'that', 'y2k'] 

StopWords:  ['happy', 'bastard', "'s", 'quick', 'movie', 'review', 'damn', 'y2k', 'bug', '.'] 

Stemming:  ['happi', 'bastard', "'s", 'quick', 'movi', 'review', 'damn', 'y2k', 'bug', '.'] 

lemmatization:  ['happy', 'bastard', "'s", 'quick', 'movie', 'review', 'damn', 'y2k', 'bug', '.'] 

----------------------------------------------------------------------------------------------------
Tokens:  ['it', 'is', 'movies', 'like', 'these

# Q2: Implement a part-of-speech tagger using the NLTK library and evaluate its performance. Use the Penn Treebank dataset

In [17]:
import nltk
from nltk.corpus import treebank
from nltk.tag import pos_tag
from nltk.classify import accuracy
from nltk.tag import UnigramTagger

In [18]:
nltk.download('treebank')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package treebank to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kuldeep Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [19]:
sentence = treebank.tagged_sents() #loading of dataset

In [20]:
# splitting train and test data
train_len = int(len(sentence)*0.8)
train_sentence = sentence[:train_len]
test_sentence = sentence[train_len:]

In [21]:
# Extracting training and testing data
#train_data = [(nltk.pos_tag(nltk.word_tokenize(' '.join([word for word,_ in sentence]))),[tag for _, tag in sentence]) for sentence in train_sentence]
#test_data = [(nltk.pos_tag(nltk.word_tokenize(' '.join([word for word, _ in sentence]))),[tag for _,tag in sentence])for sentence in test_sentence]

In [22]:
#Training model
tagger = UnigramTagger(train_sentence)

In [23]:
accuracy_Score = tagger.evaluate(test_sentence)

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy_Score = tagger.evaluate(test_sentence)


In [24]:
print(accuracy_Score)

0.8608213982733669


# Q5. Implement a sentiment analysis model using machine learning techniques and evaluate its performance. Use the IMDB movie reviews dataset.

In [26]:
import nltk
from nltk.corpus import brown

In [27]:
nltk.download('brown')

[nltk_data] Downloading package brown to C:\Users\Kuldeep
[nltk_data]     Solanki\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

# Build a text classification model to classify news articles into different categories. Use the 20 Newsgroups dataset. (Libraries like scikit-learn make this task simple, especially with a well-known dataset like 20 Newsgroups.)


In [29]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [39]:
news_data = fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes'))

In [41]:
print("Sample text: ",news_data.data[0])
print("Label: ",news_data.target_names[0])

Sample text:  

I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


Label:  alt.atheism


In [43]:
x_train,x_test,y_train,y_test = train_test_split(news_data.data, news_data.target, test_size=0.2,random_state=42)

In [45]:
#Convert text data into TF-IDF vectors
vectorize = TfidfVectorizer(stop_words='english',max_features=5000)
x_train_vec = vectorize.fit_transform(x_train)
x_test_vec = vectorize.transform(x_test)

In [46]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train_vec, y_train)

In [47]:
y_pred_train = model.predict(x_train_vec)
y_pred_test = model.predict(x_test_vec)

In [48]:
accuracy_y_train = accuracy_score(y_train, y_pred_train)
accuracy_y_train

0.8463783496948792

In [49]:
accuracy_y_test = accuracy_score(y_test, y_pred_test)
accuracy_y_test

0.6885941644562334

# Implement a tokenizer that splits a given text into sentences and words. Use a collection of news articles from the Reuters-21578 dataset. (Libraries like NLTK offer tokenizer functionality.)


In [None]:
import nltk
from nltk.corpus import reuters
from nltk.tokenize import sent_tokenize , word_tokenize

In [None]:
nltk.download('reuters')
nltk.download('punkt')

In [None]:
document = reuters.fileids()

In [None]:
sample_article = reuters.raw(document[0])

In [None]:
sample_article

In [None]:
sentences = sent_tokenize(sample_article)
sentences

In [None]:
word_token = [word_tokenize(sentence) for sentence in sentences]
word_token

In [None]:
#applying this on whole dataset
sentence_tokens = []
word_tokens = []
for raw_data in document:
    raw = reuters.raw(document)
    
    sent = sent_tokenize(raw)
    sentence_tokens.extend(sent)
    
    words = [word_tokenize(sentence) for sentence in sent]
    word_tokens.extend(words)

print(sentence_tokens[:10])
print(word_tokens[:10])

# Create a text preprocessing pipeline that includes tokenization, normalization, and vectorization. Use the Amazon Reviews dataset. (Combining several preprocessing steps in a pipeline is a typical practice and can be done easily using libraries like NLTK or scikit-learn.)

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import re
from sklearn.metrics import accuracy_score , classification_report

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def preprocessed_data(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]',' ',text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ''.join(tokens)

In [None]:
data = fetch_20newsgroups(subset='all', remove=('footers','headers','quotes'))

In [None]:
x = data.data
y = data.target

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
pipeline = Pipeline([
    ('vectorize',TfidfVectorizer(stop_words='english',max_features=5000,preprocessor=preprocessed_data)),
    ('classifier',LogisticRegression(max_iter=1000))
])

In [None]:
pipeline.fit(x_train,y_train)

In [None]:
y_pred_train = pipeline.predict(x_train)
y_pred_test = pipeline.predict(x_test)

In [None]:
accuracy_train = accuracy_score(y_train,y_pred_train)
accuracy_train

In [None]:
accuracy_test = accuracy_score(y_test,y_pred_test)
accuracy_test

In [None]:
print(classification_report(y_train, y_pred_train,target_names=data.target_names))