In [None]:
#Question No.1
import nltk
from nltk.corpus import gutenberg
import re

In [None]:
def show_regex_matches(pattern, text):
  """Print all non-overlapping matches of a regex in ``text`` and their count.

  Args:
    pattern: Regular expression handed to ``re.findall``.
    text: String to search.

  Returns:
    None. Three lines go to stdout: a header naming the pattern, the
    matches joined by single spaces, and the number of matches.
  """
  matches = re.findall(pattern, text)
  print(f"\nMatches for pattern {pattern}")
  # re.findall returns tuples (one item per group) when the pattern has two
  # or more capture groups; flatten those so str.join does not raise TypeError.
  print(' '.join(m if isinstance(m, str) else ' '.join(m) for m in matches))
  print("Length: ", len(matches))

In [None]:
#Testing with a custom text
text = "Cat cat Hello coat cut cute Dog cart"
# In order: runs of lowercase letters, an uppercase letter followed by
# lowercase letters, and "c" + one or two vowels + "t".
for regex in (r'[a-z]+', r'[A-Z][a-z]+', r'c[aeiou]{1,2}t'):
  show_regex_matches(regex, text)

In [None]:
#Testing with gutenberg text
nltk.download('gutenberg')
text = gutenberg.raw('austen-emma.txt')
# Run each pattern over the raw text of the corpus file loaded above.
for corpus_regex in (r'[a-z]+', r'[A-Z][a-z]+', r'c[aeiou]{1,2}t'):
  show_regex_matches(corpus_regex, text)


In [None]:
#Question No.2
from urllib import request
from bs4 import BeautifulSoup

In [None]:
def get_text_from_url(url):
  """Download a page and return its text with the HTML markup removed.

  Args:
    url: Address passed to ``urllib.request.urlopen``.

  Returns:
    The string produced by ``BeautifulSoup(...).get_text()`` for the page.

  Raises:
    Whatever ``urlopen`` raises on network/HTTP failure, and
    ``UnicodeDecodeError`` if the body does not match the charset used.
  """
  # The context manager closes the HTTP response even if read/decode fails.
  with request.urlopen(url) as response:
    # Prefer the charset declared in the Content-Type header; keep UTF-8
    # as the fallback when the server does not declare one.
    charset = response.headers.get_content_charset() or 'utf8'
    html = response.read().decode(charset)
  return BeautifulSoup(html, 'html.parser').get_text()

In [None]:
# Fetch the page at `url`, keep its extracted text in `text` for the cells
# that follow, and print it without leading/trailing whitespace.
url = "https://www.dsu.edu.pk/contact-us/"
text = get_text_from_url(url)
print(text.strip())

In [None]:
#Question No.3
from nltk.tokenize import word_tokenize
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource,
# while older ones load 'punkt'. Fetch both so tokenisation works on either.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def get_word_tokens(text):
  """Split ``text`` into word tokens using NLTK's ``word_tokenize``.

  Args:
    text: String to tokenise.

  Returns:
    The token list returned by ``word_tokenize(text)``.
  """
  word_tokens = word_tokenize(text)
  return word_tokens

In [None]:
# Tokenise whatever `text` currently holds and print the tokens.
# NOTE(review): presumably the page text fetched for Question No.2 — this
# depends on cell execution order; confirm.
tokens = get_word_tokens(text)
print("Word Tokens: ", tokens)

In [None]:
#Extracting Phone Numbers from Text
# Pattern, left to right: optional "(", 3-4 digits, optional ")", an optional
# whitespace or hyphen, 7-8 digits, another optional whitespace or hyphen,
# then up to two more digits. The single capture group wraps the whole
# pattern, so re.findall returns the full matched string.
pattern = r'(\(?\d{3,4}\)?[\s\-]?\d{7,8}[\s\-]?\d?\d?)'
print("Phone Numbers")
show_regex_matches(pattern, text)




In [None]:
#Extracting Emails from Text
# Pattern: a local part of letters, digits and . _ % + -, then "@", a domain
# of letters, digits, dots and hyphens, a literal dot, and a final label of
# at least two letters.
pattern = r'[a-zA-Z0-9._%+-]+\@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
print("Emails")
show_regex_matches(pattern, text)

In [None]:
#Question No.4
from nltk.stem import PorterStemmer, LancasterStemmer

In [None]:
# Sample sentence for the stemming comparison; note this rebinds the shared
# `text` and `tokens` names used by earlier cells.
text = "The runner was running and the cats were chasing mice."
tokens = get_word_tokens(text)
print("Word Tokens: ", tokens)

In [None]:
# Stem every token in `tokens` with the Porter stemmer.
porter = PorterStemmer()
porter_stemmed_tokens = list(map(porter.stem, tokens))
print("Porter Stemmed Tokens: ", porter_stemmed_tokens)

In [None]:
# Stem every token in `tokens` with the Lancaster stemmer.
lancaster = LancasterStemmer()
lancaster_stemmed_tokens = list(map(lancaster.stem, tokens))
print("Lancaster Stemmed Tokens: ", lancaster_stemmed_tokens)

In [None]:
#Question No.5
import pandas as pd
# Set each of these display options to None (rows, columns, total width,
# per-column width).
for display_option in ('display.max_rows', 'display.max_columns',
                       'display.width', 'display.max_colwidth'):
  pd.set_option(display_option, None)


In [None]:
# Load the sentiment dataset as UTF-8 and preview the first rows.
# NOTE(review): the path is absolute; presumably a Colab upload location —
# adjust it when running anywhere else.
df = pd.read_csv('/content/Sentiment Dataset Urdu - Sentiment Dataset Urdu.csv', encoding='utf-8')
df.head()

In [None]:
# Disabled alternative: render the whole column as one string (no index or
# header), stripped, with newlines replaced by spaces.
#text = df['Text'].to_string(index=False, header=False).strip().replace('\n', ' ')
# Active version: keep the 'Text' column as-is; this rebinds the shared `text` name.
text = df['Text']
print(text)

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [None]:
# Build a BPE tokenizer. unk_token ties the "[UNK]" special token registered
# with the trainer below to the model; without it the model has no
# unknown-token fallback for symbols it never saw in training.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Split the input on whitespace/punctuation before BPE merges are learned.
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# `text` is iterated item by item as the training corpus.
# NOTE(review): assumes every item is a string (no missing values) — confirm.
tokenizer.train_from_iterator(text, trainer)
vocab = tokenizer.get_vocab()
print("Vocab: ", vocab)
print("Vocab Size: ", len(vocab))

In [None]:
# Encode a sample Urdu passage with the trained tokenizer, print the token
# strings and the length of the encoding, and keep the tokens in `bpe_tokens`.
output_tokens = tokenizer.encode("پچھلی بارشوں کے باعث شہر کی سڑکیں تالاب میں تبدیل ہو گئیں اور حکومتیں کچھ نہ کر سکیں۔ شہریوں کو اپنی املاک اور جانوں کا نقصان اٹھانا پڑا۔ زندگی مفلوج ہو گئی اور روزمرہ کی مشکلات میں اضافہ ہوا۔")
print("Output Tokens: ", output_tokens.tokens)
print("Output Tokens Size: ", len(output_tokens))
bpe_tokens = output_tokens.tokens

In [None]:
# The segmentation dictionary is the set of token strings (the keys) of the
# tokenizer vocabulary; the ids (the values) are not needed.
dictionary = set(vocab.keys())

In [None]:
def max_match_segment(text, dictionary):
    """Segment ``text`` greedily with forward maximum matching.

    At each position the longest substring found in ``dictionary`` is taken
    as the next segment. If no substring starting there is in the dictionary,
    the single character at that position is emitted instead.

    Args:
        text: String to segment.
        dictionary: Re-iterable collection of known words (set, list, or a
            dict whose keys are the words).

    Returns:
        List of segments; concatenating them reproduces ``text``.
    """
    # Hash-based membership instead of scanning the whole dictionary per candidate.
    words = set(dictionary)
    # No candidate longer than the longest known word can match.
    longest = max(map(len, words), default=0)

    segmented = []
    i = 0
    n = len(text)
    while i < n:
        # Longest candidate first. The break must leave THIS loop so that the
        # else branch (single-character fallback) runs only when nothing matched.
        for j in range(min(n, i + longest), i, -1):
            word = text[i:j]
            if word in words:
                segmented.append(word)
                i = j
                break
        else:
            segmented.append(text[i])
            i += 1
    return segmented


In [None]:
# Segment the sample Urdu passage with max_match_segment against `dictionary`,
# print the segments and their count, and keep them in `mm_tokens`.
# Note: this rebinds the shared `text` name to a plain string.
text = "پچھلی بارشوں کے باعث شہر کی سڑکیں تالاب میں تبدیل ہو گئیں اور حکومتیں کچھ نہ کر سکیں۔ شہریوں کو اپنی املاک اور جانوں کا نقصان اٹھانا پڑا۔ زندگی مفلوج ہو گئی اور روزمرہ کی مشکلات میں اضافہ ہوا۔"
segmented_words = max_match_segment(text, dictionary)
print("Output Tokens:", segmented_words)
print("Output Tokens Size: ", len(segmented_words))
mm_tokens = segmented_words

In [None]:
import numpy as np
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd

def train_lm(train_tokens, n):
    """Fit an NLTK maximum-likelihood n-gram model of order ``n``.

    Args:
        train_tokens: Sequence of tokens, treated as one training sentence.
        n: Highest n-gram order of the model.

    Returns:
        The fitted ``MLE`` model.
    """
    train_data, padded_vocab = padded_everygram_pipeline(n, [train_tokens])
    # Both results are one-shot lazy iterators. Materialise them first so the
    # debug prints below do not consume the data before model.fit() sees it.
    train_data = [list(sentence_ngrams) for sentence_ngrams in train_data]
    padded_vocab = list(padded_vocab)

    print("Training Padded Vocabulary:", padded_vocab)
    for data in train_data:
        print("Training Data: ", data)

    model = MLE(n)
    model.fit(train_data, padded_vocab)
    return model

def compute_perplexity(model, test_tokens, n):
    """Compute the perplexity of ``model`` on one tokenised test sentence.

    The sentence is padded and expanded with ``padded_everygram_pipeline``,
    so every n-gram of order 1..n is scored. Perplexity is
    ``2 ** (-mean(log2 P(word | context)))`` over those n-grams.

    Args:
        model: Fitted language model exposing ``score(word, context)``.
        test_tokens: Sequence of tokens forming the test sentence.
        n: Highest n-gram order to generate.

    Returns:
        The perplexity as a float; ``inf`` if any n-gram has probability 0.

    Raises:
        ValueError: If the test sentence yields no n-grams to score.
    """
    test_data, _ = padded_everygram_pipeline(n, [test_tokens])

    total_log_prob = 0.0
    num_tokens = 0
    for sentence_ngrams in test_data:
        # Each item is a one-shot generator: walk it exactly once and count
        # while scoring, rather than exhausting it with len(list(...)) first.
        for ngram in sentence_ngrams:
            # score() takes the context as a single tuple, not unpacked words.
            prob = model.score(ngram[-1], ngram[:-1])
            if prob == 0:
                # One impossible event makes the perplexity infinite;
                # return early instead of taking log2(0).
                return np.inf
            # Base-2 log to match the exp2 below.
            total_log_prob += np.log2(prob)
            num_tokens += 1

    if num_tokens == 0:
        raise ValueError("test_tokens produced no n-grams to score")

    # Calculate perplexity
    entropy = -total_log_prob / num_tokens
    perplexity = np.exp2(entropy)

    return perplexity

# `dictionary` is a set of strings, whose iteration order changes between
# interpreter runs (string hash randomisation). Sorting fixes the token order
# so the bigram/trigram counts, and therefore the results, are reproducible.
train_tokens = sorted(dictionary)
print("Train Tokens: ", train_tokens)

# NOTE(review): the training tokens are dictionary entries while the test
# tokens come from word_tokenize — confirm the two tokenisations are meant
# to be compared.
test_text = "دو دن کی بارش میں سارا” بھرم“ بہہ گیا۔"
test_tokens = get_word_tokens(test_text)
print("Test Tokens: ", test_tokens)

# Train unigram model
unigram_model = train_lm(train_tokens, 1)
unigram_perplexity = compute_perplexity(unigram_model, test_tokens, 1)
print("Unigram Model Perplexity:", unigram_perplexity)

# Train bigram model
bigram_model = train_lm(train_tokens, 2)
bigram_perplexity = compute_perplexity(bigram_model, test_tokens, 2)
print("Bigram Model Perplexity:", bigram_perplexity)

# Train trigram model
trigram_model = train_lm(train_tokens, 3)
trigram_perplexity = compute_perplexity(trigram_model, test_tokens, 3)
print("Trigram Model Perplexity:", trigram_perplexity)
