***1.Python program that demonstrate how to use regular expressions to validate and extract email addresses from a given text.***

In [None]:
import re

def extract_emails(text):
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    matches = re.findall(email_pattern, text)

    return matches

def validate_email(email):
    email_pattern = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}$'
    if re.match(email_pattern, email):
        return True
    else:
        return False

input_text = "Please contact support@example.com for assistance or info@company.com for more information."

email_addresses = extract_emails(input_text)
print("Extracted email addresses:", email_addresses)

for email in email_addresses:
    is_valid = validate_email(email)
    print(f"Email: {email}, Valid: {is_valid}")


Extracted email addresses: ['support@example.com', 'info@company.com']
Email: support@example.com, Valid: True
Email: info@company.com, Valid: True


***2.Implement a python program that defines a finite state automation to recognize strings with an equal number of 0's and 1's***

In [None]:
def is_even(num):
    return num % 2 == 0

def finite_state_machine(input_string):
    states = {
        'q0': {'0': 'q1', '1': 'q2'},
        'q1': {'0': 'q3', '1': 'q0'},
        'q2': {'0': 'q0', '1': 'q3'},
        'q3': {'0': 'q2', '1': 'q1'}
    }

    current_state = 'q0'

    for symbol in input_string:
        if symbol not in states[current_state]:
            return False
        current_state = states[current_state][symbol]


    return is_even(input_string.count('0')) and is_even(input_string.count('1'))

input_str1 = "0011"  # Accepted
input_str2 = "10101"  # Accepted
input_str3 = "110"    # Not accepted

print(f"{input_str1}: {finite_state_machine(input_str1)}")
print(f"{input_str2}: {finite_state_machine(input_str2)}")
print(f"{input_str3}: {finite_state_machine(input_str3)}")


0011: True
10101: False
110: False


***3.Write a python program that uses the NLTK library to perform morphological analysis on sentence"Unhappily ,she ran quickly".***

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def morphological_analysis(sentence):

    words = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(words)

    return pos_tags
sentence = "Unhappily she ran quickly"

analysis_result = morphological_analysis(sentence)

print("Morphological Analysis:")
for word, pos_tag in analysis_result:
    print(f"{word}: {pos_tag}")


Morphological Analysis:
Unhappily: RB
she: PRP
ran: VBD
quickly: RB


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


***4.Implement a python program that creates a finite-state machine for parsing and generating the past tense forms of english verbs using the sentences"She walked to the park yesterday","He jumped over the fence".***

In [None]:
class VerbStateMachine:
    def __init__(self):
        self.state = 'base'
        self.past_tense = {
            'walk': 'walked',
            'jump': 'jumped'
        }

    def generate_past_tense(self, verb):
        return self.past_tense.get(verb, f"{verb}ed")

    def parse_sentence(self, sentence):
        words = sentence.split()
        parsed_sentence = []

        for word in words:
            if self.state == 'base':
                if word.lower() in self.past_tense:
                    parsed_sentence.append(self.generate_past_tense(word.lower()))
                else:
                    parsed_sentence.append(word)
            else:
                parsed_sentence.append(word)

        return ' '.join(parsed_sentence)

verb_fsm = VerbStateMachine()

sentence1 = "She walked to the park yesterday."
sentence2 = "He jumped over the fence."

parsed_sentence1 = verb_fsm.parse_sentence(sentence1)
parsed_sentence2 = verb_fsm.parse_sentence(sentence2)

print("Original Sentences:")
print(sentence1)
print(sentence2)
print("\nParsed Sentences:")
print(parsed_sentence1)
print(parsed_sentence2)


Original Sentences:
She walked to the park yesterday.
He jumped over the fence.

Parsed Sentences:
She walked to the park yesterday.
He jumped over the fence.


***5.Develope a python program that applies the porter stemmer algorithm to a list of words,which stemming process and provide examples of words such as ["jumps","jumping","jumper","jumped","easily","running","files","flying"]before and after stemming.***

In [None]:
from nltk.stem import PorterStemmer

def apply_porter_stemmer(words):
    porter = PorterStemmer()
    stemmed_words = [porter.stem(word) for word in words]
    return stemmed_words
input_words = ["jumps", "jumping", "jumper", "easily", "running", "files", "flying", "files"]


output_words = apply_porter_stemmer(input_words)

print("Original words:")
print(input_words)
print("\nAfter stemming:")
print(output_words)


Original words:
['jumps', 'jumping', 'jumper', 'easily', 'running', 'files', 'flying', 'files']

After stemming:
['jump', 'jump', 'jumper', 'easili', 'run', 'file', 'fli', 'file']


***6.Write a python program that uses NLTK to perform parts-of-speech tagging on a given text"The sun is shinning brightly","I love reading intresting books".***

In [None]:

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tagging(text):

    words = word_tokenize(text)


    pos_tags = pos_tag(words)

    return pos_tags


text1 = "The sun is shining brightly."
text2 = "I love reading interesting books."

pos_tags1 = pos_tagging(text1)
pos_tags2 = pos_tagging(text2)

print(f"POS tagging for '{text1}': {pos_tags1}")
print(f"POS tagging for '{text2}': {pos_tags2}")

POS tagging for 'The sun is shining brightly.': [('The', 'DT'), ('sun', 'NN'), ('is', 'VBZ'), ('shining', 'VBG'), ('brightly', 'RB'), ('.', '.')]
POS tagging for 'I love reading interesting books.': [('I', 'PRP'), ('love', 'VBP'), ('reading', 'VBG'), ('interesting', 'VBG'), ('books', 'NNS'), ('.', '.')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


***7.Create a Python program that demonstrates stochastic parts-of-speech tagging for the given sentences"The red car stopped at the traffic light","She quickly ran to catch the bus".***

In [None]:
pip install nltk




In [None]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.tag import hmm

sentences = [
    "The red car stopped at the traffic light.",
    "She quickly ran to catch the bus."
]


tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [pos_tag(tokens) for tokens in tokenized_sentences]

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train(tagged_sentences)

new_sentences = [
    "A black cat crossed the street.",
    "They were waiting for the train at the station."
]

for sentence in new_sentences:
    tokens = word_tokenize(sentence)
    tagged_tokens = tagger.tag(tokens)
    print(tagged_tokens)


[('A', 'DT'), ('black', 'DT'), ('cat', 'DT'), ('crossed', 'DT'), ('the', 'DT'), ('street', 'DT'), ('.', 'DT')]
[('They', 'DT'), ('were', 'DT'), ('waiting', 'DT'), ('for', 'DT'), ('the', 'DT'), ('train', 'DT'), ('at', 'DT'), ('the', 'DT'), ('station', 'DT'), ('.', 'DT')]


***8.Implement a Python program that performs rule based parts-of-speech tagging using regular expressions using the given rules.***

In [None]:
import nltk
from nltk import word_tokenize

patterns = [
    (r'\b(?:The|the)\b', 'DET'),
    (r'\b(?:cat|dog)\b', 'NOUN'),
    (r'\b(?:is|am|are)\b', 'VERB'),
    (r'\b(?:quickly|brightly)\b', 'ADV'),
    (r'\b(?:[A-Za-z]+)\b', 'NOUN')
]

regexp_tagger = nltk.RegexpTagger(patterns)

sentences = [
    "The cat is quick.",
    "The dog is bright.",
    "I am running quickly."
]

for sentence in sentences:
    tokens = word_tokenize(sentence)
    tagged_tokens = regexp_tagger.tag(tokens)
    print(tagged_tokens)


[('The', 'DET'), ('cat', 'NOUN'), ('is', 'VERB'), ('quick', 'NOUN'), ('.', None)]
[('The', 'DET'), ('dog', 'NOUN'), ('is', 'VERB'), ('bright', 'NOUN'), ('.', None)]
[('I', 'NOUN'), ('am', 'VERB'), ('running', 'NOUN'), ('quickly', 'ADV'), ('.', None)]


***9.Develop a python program that uses a PCFG to parse a sentence using"The cat chased the mouse",the probabilities in square brackets indicate the likelihood of each rule need to apply.***

In [None]:
pip install nltk



In [32]:
import nltk

nltk.download('punkt')

grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det N
Det -> 'the'
N -> 'cat' | 'mouse'
VP -> V NP
V -> 'chased'
""")

parser = nltk.ChartParser(grammar)
sentence = ["the", "cat", "chased", "the", "mouse"]

for tree in parser.parse(sentence):
    tree.pretty_print()


              S                 
      ________|_____             
     |              VP          
     |         _____|___         
     NP       |         NP      
  ___|___     |      ___|____    
Det      N    V    Det       N  
 |       |    |     |        |   
the     cat chased the     mouse



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


***10.Write a python program that performs information retrival using the TF-IDF score.***

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "Natural language processing (NLP) is a field of study in artificial intelligence.",
    "NLP techniques are used in various applications like machine translation and sentiment analysis.",
    "The development of NLP tools and libraries has made text analysis easier."
]

query = "What is natural language processing?"

all_texts = docs + [query]

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(all_texts)

cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

most_similar_index = cosine_similarities.argmax()

print(f"Query: {query}")
print(f"Most Similar Document:\n{docs[most_similar_index]}")


Query: What is natural language processing?
Most Similar Document:
Natural language processing (NLP) is a field of study in artificial intelligence.
