## 1. Create Regular expressions in Python for detecting word patterns and tokenizing text


In [2]:
import re

def detect_word_patterns(text):
    # Regular expression pattern for detecting words
    word_pattern = re.compile(r'\b\w+\b')

    # Find all matches for the word pattern in the text
    words = word_pattern.findall(text)

    return words

def tokenize_text(text):
    # Regular expression pattern for tokenizing text
    token_pattern = re.compile(r'\b\w+\b|\s|[^\w\s]')

    # Find all matches for the token pattern in the text
    tokens = token_pattern.findall(text)

    return tokens

def main():
    # Example text
    text = "Rajalakshmi Institute of Technology was established in 2008. RIT is accredited with highest grade of A++ by NAAC. RIT is affiliated with Anna University Chennai. "

    # Detect word patterns
    words = detect_word_patterns(text)
    print("Words:", words)

    # Tokenize text
    tokens = tokenize_text(text)
    print("Tokens:", tokens)

if __name__ == "__main__":
    main()


Words: ['Rajalakshmi', 'Institute', 'of', 'Technology', 'was', 'established', 'in', '2008', 'RIT', 'is', 'accredited', 'with', 'highest', 'grade', 'of', 'A', 'by', 'NAAC', 'RIT', 'is', 'affiliated', 'with', 'Anna', 'University', 'Chennai']
Tokens: ['Rajalakshmi', ' ', 'Institute', ' ', 'of', ' ', 'Technology', ' ', 'was', ' ', 'established', ' ', 'in', ' ', '2008', '.', ' ', 'RIT', ' ', 'is', ' ', 'accredited', ' ', 'with', ' ', 'highest', ' ', 'grade', ' ', 'of', ' ', 'A', '+', '+', ' ', 'by', ' ', 'NAAC', '.', ' ', 'RIT', ' ', 'is', ' ', 'affiliated', ' ', 'with', ' ', 'Anna', ' ', 'University', ' ', 'Chennai', '.', ' ']


## 2. Getting started with Python and NLTK-Searching Text, Counting Vocabulary, Frequency Distribution, Collocations, Bigrams

### a) Searching Text

In [None]:
# !pip install nltk
import nltk

text = "This is a sample text to demonstrate text searching in NLTK."

# Find all occurrences of the word "text"
text_occurrences = nltk.re.findall(r"\btext\b", text)
print(text_occurrences)  # Output: ['text', 'text']

# Find similar words using a different approach
tokenized_text = nltk.word_tokenize(text.lower())
text_index = nltk.Text(tokenized_text)
similar_words = text_index.similar("text")
print(similar_words)


### b) Counting Vocabulary

In [None]:
from nltk.tokenize import word_tokenize

text = "This is a sample text with some repeated words."

# Tokenize the text into words
tokens = word_tokenize(text.lower())

# Count the unique words (vocabulary)
vocabulary = set(tokens)
print(len(vocabulary))  # Output: 9


### c) Frequency Distribution

In [None]:
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

text = "This is a sample text with some repeated words."
tokens = word_tokenize(text.lower())

# Create a frequency distribution for the words
fdist = FreqDist(tokens)

# Print the most frequent words
print(fdist.most_common(3))  # Output: [('is', 2), ('this', 1), ('a', 1)]

# Plot the frequency distribution
fdist.plot(cumulative=False)


### d) Collocations

In [None]:
from nltk.collocations import *
import nltk
from nltk.tokenize import word_tokenize

text = "Natural language processing is an exciting field with many applications."
tokens = word_tokenize(text.lower())
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)

# Find the top 5 bigrams with the highest pointwise mutual information (PMI)
print(finder.nbest(bigram_measures.pmi, 5))


### e) Bigrams

In [None]:
# !pip install nltk

from nltk.util import ngrams
from nltk.tokenize import word_tokenize

text = "This is a sample text to demonstrate bigrams."
tokens = word_tokenize(text.lower())

# Generate bigrams (sequences of two consecutive words)
bigrams = ngrams(tokens, 2)
print(list(bigrams))


## 3. Accessing Text Corpora using NLTK in Python


In [4]:
# !pip install nltk

import nltk
from nltk.corpus import gutenberg

# Download the Gutenberg corpus (if not already downloaded)
nltk.download('gutenberg')

def access_gutenberg_corpus():
    # List available files in the Gutenberg corpus
    print("Available files in Gutenberg Corpus:")
    print(gutenberg.fileids())

    # Access and print the text of a specific document in the corpus
    document_name = '/content/shakespeare-hamlet.txt.txt'
    document_text = gutenberg.raw(document_name)
    print(f"\nText of '{document_name}':\n{document_text[:500]}...")

def main():
    # Access the Gutenberg corpus
    access_gutenberg_corpus()

if __name__ == "__main__":
    main()


Available files in Gutenberg Corpus:
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Text of '/content/shakespeare-hamlet.txt.txt':
THE TRAGEDY OF HAMLET, PRINCE OF DENMARK


by William Shakespeare



Dramatis Personae

  Claudius, King of Denmark.
  Marcellus, Officer.
  Hamlet, son to the former, and nephew to the present king.
  Polonius, Lord Chamberlain.
  Horatio, friend to Hamlet.
  Laertes, son to Polonius.
  Voltemand, courtier.
  Cornelius, courtier.
  Rosencrantz, courtier.
  Guildenstern, courtier.
  Osric, courtier.
  A Gentleman, courtier.
  A Priest.
  Marcellus, officer.
  Bernardo

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


## 4. Write a function that finds the 50 most frequently occurring words of a text words. are not stop

In [None]:
# !pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

def find_frequent_words(text, num_words=50):

    stop_words = set(stopwords.words('english'))  # Load English stop words
    words = nltk.word_tokenize(text.lower())  # Tokenize text and lowercase words
    filtered_words = [word for word in words if word not in stop_words]  # Filter stop words
    fdist = FreqDist(filtered_words)  # Create frequency distribution
    return fdist.most_common(num_words)  # Return the most common words

# Example usage:
text = "This is a sample text with some common words and some less common words."
frequent_words = find_frequent_words(text)
print(frequent_words)

## 5. Implement the Word2Vec mode

In [8]:
!pip install gensim
# !pip install nltk

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # Download the Punkt tokenizer

# Sample sentences
sentences = [
    "Rajalakshmi Institute of Technology (An Autonomous Institution) is one of the best engineering colleges in Chennai and is part of Rajalakshmi Institution.",
    "Rajalakshmi Institute of Technology was established in 2008.",
    "RIT is accredited with the highest grade of A++ by NAAC. RIT is affiliated with Anna University Chennai.",
    "It is one of the AICTE-approved colleges in Chennai New Delhi and also offers NBA-approved courses."
]

# Tokenize the sentences into words
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Set up and train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the trained model to a file
model.save("word2vec_model_sentences.bin")

# Load the saved model
loaded_model = Word2Vec.load("word2vec_model_sentences.bin")

# Example of accessing word embeddings
word_embedding = loaded_model.wv['engineering']
print("Word embedding for 'engineering':", word_embedding)


Word embedding for 'engineering': [ 0.00180023  0.00704609  0.0029447  -0.00698085  0.00771268 -0.00598893
  0.00899771  0.0029592  -0.00401529 -0.00468899 -0.00441672 -0.00614646
  0.00937874 -0.0026496   0.00777244 -0.00968034  0.00210879 -0.00123361
  0.00754423 -0.0090546   0.00743756 -0.0051058  -0.00601377 -0.00564916
 -0.00337917 -0.0034111  -0.00319566 -0.0074922   0.00070878 -0.00057607
 -0.001684    0.00375713 -0.00762019 -0.00322142  0.00515534  0.00854386
 -0.00980994  0.00719534  0.00530949 -0.0038797   0.00857616 -0.00922199
  0.00724868  0.00536383  0.00129359 -0.00519975 -0.00417865 -0.00335678
  0.00160829  0.0015867   0.00738824  0.00997759  0.00886734 -0.00400645
  0.00964539 -0.00062954  0.00486543  0.00254902 -0.00062981  0.00366745
 -0.00531941 -0.00575668 -0.00760464  0.00190643  0.00652587  0.00088213
  0.00125695  0.0031716   0.00813467 -0.00770006  0.00226075 -0.00747411
  0.00370981  0.00951055  0.00752026  0.00642603  0.00801478  0.00655115
  0.00685668  0.0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 6. Use a transformer for implementing classification

In [11]:
!pip install torch
!pip install transformers
!pip install sklearn
!pip install tqdm

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Sample data for text classification
texts = ["This is a positive example.", "This is a negative example.", "Another positive one.", "Negative text here."]
labels = [1, 0, 1, 0]

# Split data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize and encode the training data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
train_labels = torch.tensor(train_labels)

# Tokenize and encode the testing data
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')
test_labels = torch.tensor(test_labels)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

for epoch in range(3):
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()

        predictions.extend(predicted_labels)
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2/2 [00:10<00:00,  5.35s/it]
Epoch 2: 100%|██████████| 2/2 [00:05<00:00,  2.61s/it]
Epoch 3: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.59it/s]

Accuracy: 0.00%





## 7. Design a chatbot with a simple dialog system

In [10]:
import random

class SimpleChatbot:
    def __init__(self):
        self.greetings = ['hello', 'hi', 'hey', 'greetings', 'howdy']
        self.goodbyes = ['bye', 'goodbye', 'see you', 'farewell']
        self.responses = {
            'tell me a joke': 'Why did the chicken cross the road? To get to the other side!',
            'how are you': 'I am just a computer program, but thanks for asking!',
'default': 'I\'m sorry, I don\'t understand that. Can you ask me something else?'

        }

    def get_response(self, user_input):
        user_input = user_input.lower()

        if any(greeting in user_input for greeting in self.greetings):
            return 'Hello! How can I help you today?'

        elif any(goodbye in user_input for goodbye in self.goodbyes):
            return 'Goodbye! Have a great day.'

        else:
            for key in self.responses:
                if key in user_input:
                    return self.responses[key]

            return self.responses['default']

def main():
    chatbot = SimpleChatbot()

    print("Simple Chatbot: Hello! Ask me anything or say goodbye to end the conversation.")

    while True:
        user_input = input("You: ")
        if user_input.lower() in ['bye', 'goodbye', 'exit']:
            print("Simple Chatbot: Goodbye! Have a great day.")
            break

        response = chatbot.get_response(user_input)
        print("Simple Chatbot:", response)

if __name__ == "__main__":
    main()


Simple Chatbot: Hello! Ask me anything or say goodbye to end the conversation.
You: hi
Simple Chatbot: Hello! How can I help you today?
You: how are you!?
Simple Chatbot: I am just a computer program, but thanks for asking!
You: bye
Simple Chatbot: Goodbye! Have a great day.


## 8. Convert text to speech and find accuracy

In [1]:
!pip install SpeechRecognition
!pip install gTTS
!pip install pyaudio


import speech_recognition as sr
from gtts import gTTS
import os

def text_to_speech(text, language='en'):
    tts = gTTS(text=text, lang=language, slow=False)
    tts.save("output.mp3")
    os.system("start output.mp3")  # This opens the file using the default media player

def speech_to_text():
    recognizer = sr.Recognizer()

    with sr.Microphone() as source:
        print("Say something:")
        audio = recognizer.listen(source)

    try:
        text = recognizer.recognize_google(audio)
        return text
    except sr.UnknownValueError:
        print("Sorry, could not understand audio.")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None

def evaluate_accuracy(original_text, recognized_text):
    if recognized_text:
        print(f"Original Text: {original_text}")
        print(f"Recognized Text: {recognized_text}")

        original_words = set(original_text.lower().split())
        recognized_words = set(recognized_text.lower().split())

        common_words = original_words.intersection(recognized_words)
        accuracy = len(common_words) / len(original_words)
        print(f"Accuracy: {accuracy * 100:.2f}%")
    else:
        print("No text recognized. Accuracy cannot be calculated.")

if __name__ == "__main__":
    original_text = "Hello, how are you today?"

    # Convert text to speech
    text_to_speech(original_text)

    # Speech to text
    recognized_text = speech_to_text()

    # Evaluate accuracy
    evaluate_accuracy(original_text, recognized_text)





[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting gTTS
  Downloading gTTS-2.5.0-py3-none-any.whl (29 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
                                              0.0/97.9 kB ? eta -:--:--
     -------------------------                61.4/97.9 kB 1.1 MB/s eta 0:00:01
     -------------------------------------- 97.9/97.9 kB 933.0 kB/s eta 0:00:00
Installing collected packages: click, gTTS
Successfully installed click-8.1.7 gTTS-2.5.0



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pyaudio
  Downloading PyAudio-0.2.14-cp311-cp311-win_amd64.whl (164 kB)
                                              0.0/164.1 kB ? eta -:--:--
     -------------------------------------  163.8/164.1 kB 5.0 MB/s eta 0:00:01
     -------------------------------------- 164.1/164.1 kB 3.3 MB/s eta 0:00:00
Installing collected packages: pyaudio
Successfully installed pyaudio-0.2.14
Say something:
Original Text: Hello, how are you today?
Recognized Text: hello how are you today
Accuracy: 60.00%


## 9. Design a speech recognition system and find the error rate

In [None]:
!pip install jiwer

import speech_recognition as sr
import jiwer

def recognize_speech(audio_file, language='en-US'):
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    try:
        recognized_text = recognizer.recognize_google(audio, language=language)
        return recognized_text
    except sr.UnknownValueError:
        print("Speech recognition could not understand the audio.")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service; {e}")
        return None

def calculate_word_error_rate(reference_text, recognized_text):
    wer = jiwer.wer(reference_text, recognized_text)
    return wer

if __name__ == "__main__":
    # Simulating a reference text
    reference_text = "hello how are you"

    # Simulating a recognized text (replace 'audio_file.wav' with the path to your actual audio file)
    audio_file_path = 'audio_file.wav'
    recognized_text = recognize_speech(audio_file_path)

    if recognized_text:
        print(f"Reference Text: {reference_text}")
        print(f"Recognized Text: {recognized_text}")

        # Calculate Word Error Rate (WER)
        wer = calculate_word_error_rate(reference_text, recognized_text)
        print(f"Word Error Rate (WER): {wer * 100:.2f}%")
    else:
        print("No text recognized.")