# Find word tokens and frequency

In [None]:
%%writefile eng.txt
Hello world! This is a sample text file.
This text file is used to test the word frequency counter.
Hello again! Let's count how many times each word appears in this file.
The word 'Hello' appears multiple times in this text file.

Writing eng.txt


In [None]:
file_path = '/content/eng.txt'

In [None]:

def count_word_frequency(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().lower()


        words = text.split()
        total_tokens = len(words)

        word_count = {}
        for word in words:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

        total_types = len(word_count)

        print(f"\n{'Word':<20} {'Frequency':<10}")
        print('-' * 30)
        for word, count in word_count.items():
            print(f"{word:<20} {count:<10}")

        print("\n\n")
        print(f"Total Tokens (all words): {total_tokens}")
        print(f"Total Types (unique words): {total_types}")

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


In [None]:

# Run the function
count_word_frequency(file_path)



Word                 Frequency 
------------------------------
hello                2         
world!               1         
this                 4         
is                   2         
a                    1         
sample               1         
text                 3         
file.                3         
file                 1         
used                 1         
to                   1         
test                 1         
the                  2         
word                 3         
frequency            1         
counter.             1         
again!               1         
let's                1         
count                1         
how                  1         
many                 1         
times                2         
each                 1         
appears              2         
in                   2         
'hello'              1         
multiple             1         



Total Tokens (all words): 42
Total Types (unique words): 27


# For Urdu

In [None]:
%%writefile urdu.txt
یہ ایک مثال ہے۔ یہ مثال الفاظ کی گنتی کے لیے ہے۔
یہ الفاظ کو گننے کا ایک آسان طریقہ ہے۔

Writing urdu.txt


In [None]:
import re

file_path = '/content/urdu.txt'

def count_word_frequency(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read().strip()


        text = re.sub(r'[^\w\s\u0600-\u06FF]', '', text)
        words = text.split()

        total_tokens = len(words)


        word_count = {}
        for word in words:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1

        total_types = len(word_count)

        # Print results
        print(f"\n{'Word':<30} {'Frequency':<10}")
        print('-' * 50)

        print(f"Total Tokens (all words): {total_tokens}\n")
        print(f"Total Types (unique words): {total_types}\n")

        with open("urdu-output.txt", 'w', encoding='utf-8') as out:
            out.write(f"{'Word':<30} {'Frequency':<10}\n")
            out.write('-' * 50 + '\n')
            for word, count in word_count.items():
                out.write(f"{word:<30} {count:<10}\n")

            print("\nStatistics:\n")
            print(f"{'Word':<30} {'Frequency':<10}\n")
            print('-' * 50 + '\n')
            for word, count in word_count.items():
                print(f"{word:<30} {count:<10}\n")
            out.write(f"Total Tokens (all words): {total_tokens}\n")
            out.write(f"Total Types (unique words): {total_types}\n")

        print(f"Results saved to urdu-output.txt")

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the function
count_word_frequency(file_path)



Word                           Frequency 
--------------------------------------------------
Total Tokens (all words): 21

Total Types (unique words): 14


Statistics:

Word                           Frequency 

--------------------------------------------------

یہ                             3         

ایک                            2         

مثال                           2         

ہے۔                            3         

الفاظ                          2         

کی                             1         

گنتی                           1         

کے                             1         

لیے                            1         

کو                             1         

گننے                           1         

کا                             1         

آسان                           1         

طریقہ                          1         

Results saved to urdu-output.txt


# Joiners & Non-Joiners

In [4]:
joiners = [
    "ب", "پ", "ت", "ث", "ج", "چ", "ح", "خ", "س", "ش", "ص", "ض",
    "ط", "ظ", "ع", "غ", "ف", "ق", "ک", "گ", "ل", "م", "ن", "ہ", "ی"
]

print(joiners)


['ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'ہ', 'ی']


In [3]:
non_joiners = [
    "ا", "د", "ڈ", "ذ", "ر", "ڑ", "ز", "ژ", "و"
]

print(non_joiners)


['ا', 'د', 'ڈ', 'ذ', 'ر', 'ڑ', 'ز', 'ژ', 'و']
