1. Pre-processing using NLTK 

In [11]:
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer, PorterStemmer 
import string 
# Download necessary NLTK resources 
nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('wordnet') 
def preprocess_text(text): 
# Lower casing 
    text = text.lower() 
# Tokenization 
    tokens = nltk.word_tokenize(text) 
# Remove punctuation 
    tokens = [word for word in tokens if word.isalnum()] 
# Remove stop words 
    tokens = [word for word in tokens if word not in stopwords.words('english')] 
# Lemmatization 
    lemmatizer = WordNetLemmatizer() 
    tokens = [lemmatizer.lemmatize(word) for word in tokens] 
# Stemming 
    stemmer = PorterStemmer() 
    tokens = [stemmer.stem(word) for word in tokens] 
    return tokens 
# Example usage 
text = "NLTK is a leading platform for building Python programs to work with human language data." 
processed_text = preprocess_text(text) 
print(processed_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data']


2. N-grams Generation 

In [31]:
from nltk import ngrams 
def generate_ngrams(text, n): 
    tokens = nltk.word_tokenize(text) 
    return list(ngrams(tokens, n)) 
# Example usage 
text = "I love natural language processing"
bigrams = generate_ngrams(text, 2) 
print(bigrams)

[('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]


3. Synonyms and Antonyms Identification 

In [36]:
from nltk.corpus import wordnet 
 
nltk.download('wordnet') 
 
def get_synonyms_antonyms(word): 
    synonyms = [] 
    antonyms = [] 
    for syn in wordnet.synsets(word): 
        for lemma in syn.lemmas(): 
            synonyms.append(lemma.name()) 
            if lemma.antonyms(): 
                antonyms.append(lemma.antonyms()[0].name()) 
    return set(synonyms), set(antonyms) 
 
# Example usage 
synonyms, antonyms = get_synonyms_antonyms("happy") 
print("Synonyms:", synonyms) 
print("Antonyms:", antonyms)

Synonyms: {'felicitous', 'glad', 'happy', 'well-chosen'}
Antonyms: {'unhappy'}


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


4. TF-IDF Implementation

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer 

# Sample corpus 
corpus = [ 
    "This is the first document.", 
    "This document is the second document.", 
    "And this is the third one.", 
    "Is this the first document?" 
] 

def compute_tfidf(corpus): 
    vectorizer = TfidfVectorizer()  # Corrected capitalization
    tfidf_matrix = vectorizer.fit_transform(corpus) 
    return tfidf_matrix, vectorizer.get_feature_names_out() 

# Example usage 
tfidf_matrix, feature_names = compute_tfidf(corpus) 
print("TF-IDF Matrix:\n", tfidf_matrix.toarray()) 
print("Feature Names:", feature_names)

TF-IDF Matrix:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
Feature Names: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


5. Part-of-Speech (PoS) Tagging

In [64]:
import nltk

# Download the required NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')  # Also download the punkt tokenizer if not already downloaded

def pos_tagging(text): 
    tokens = nltk.word_tokenize(text) 
    return nltk.pos_tag(tokens)

# Example usage 
text = "Python is an amazing programming language." 
pos_tags = pos_tagging(text) 
print(pos_tags)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...


[('Python', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('programming', 'NN'), ('language', 'NN'), ('.', '.')]


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


6. Named Entity Recognition (NER) 

In [71]:
import nltk

# Download the required NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def named_entity_recognition(text): 
    tokens = nltk.word_tokenize(text)  
    tagged = nltk.pos_tag(tokens)       
    return nltk.ne_chunk(tagged)        

# Example usage 
text = "Barack Obama was the 44th President of the United States."  
ner_tree = named_entity_recognition(text) 
print(ner_tree)  

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  the/DT
  44th/JJ
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jaskaran\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


7. Sentiment Analysis 

In [78]:

from textblob import TextBlob 

def sentiment_analysis(text): 
    analysis = TextBlob(text) 
    return analysis.sentiment.polarity  # Returns a value between -1 and 1 

# Example usage 
text = "I love programming in Python!" 
sentiment_score = sentiment_analysis(text) 
print("Sentiment Score:", sentiment_score)

Sentiment Score: 0.625


8. Spam Filter Development

In [81]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import make_pipeline 
from sklearn.model_selection import train_test_split 

# Sample dataset 
data = [  # Changed 'Data' to 'data' to follow naming conventions
    ("Free money now!!!", 1), 
    ("Hi, how are you?", 0), 
    ("Get paid to work from home.", 1), 
    ("Hello, I wanted to check in.", 0), 
] 

# Split data into features and labels 
X, y = zip(*data) 

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) 

def train_spam_filter(X_train, y_train):  # Changed 'Def' to 'def'
    model = make_pipeline(CountVectorizer(), MultinomialNB())  # Changed 'Model' to 'model'
    model.fit(X_train, y_train) 
    return model  # Changed 'Return' to 'return'

# Train the model 
spam_model = train_spam_filter(X_train, y_train)  # Changed 'Spam_model' to 'spam_model'

# Example usage 
test_message = ["Congratulations! You’ve won a free ticket!", "Can we meet tomorrow?"]  # Changed quotes
predictions = spam_model.predict(test_message)  # Changed 'Predictions' to 'predictions'
print("Predictions (1: Spam, 0: Not Spam):", predictions)  # Changed 'Print' to 'print'

Predictions (1: Spam, 0: Not Spam): [1 1]


9. Fake News Detection 

In [84]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression 
from sklearn.pipeline import make_pipeline 

# Sample dataset (you can replace this with a real dataset) 
data = { 
    'text': [ 
        "Breaking: New study shows that eating chocolate can help you lose weight.", 
        "Local man wins lottery and donates to charity.", 
        "Scientists discover a new planet that could support life.", 
        "New study reveals that drinking coffee can lead to heart disease.", 
        "The moon landing was staged." 
    ], 
    'label': [1, 0, 1, 0, 0]  # 1: Fake, 0: Real 
} 

df = pd.DataFrame(data) 

# Split data into features and labels 
X = df['text'] 
y = df['label'] 

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) 

def train_fake_news_detector(X_train, y_train): 
    model = make_pipeline(TfidfVectorizer(), LogisticRegression())  # Indented correctly
    model.fit(X_train, y_train) 
    return model 

# Train the model 
fake_news_model = train_fake_news_detector(X_train, y_train) 

# Example usage 
test_articles = [ 
    "New evidence suggests that climate change is a hoax.", 
    "Local community comes together to support homeless shelter." 
] 

predictions = fake_news_model.predict(test_articles) 
print("Predictions (1: Fake News, 0: Real News):", predictions)

Predictions (1: Fake News, 0: Real News): [0 0]
