In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from tabulate import tabulate

def preprocess_text(text: str) -> list[str]:
    """Preprocesses the text by tokenizing, removing punctuation and stopwords."""
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:
    """Calculates the mean probability (μ-value) of each bigram."""
    mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return mean_probabilities

def main():
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open(r"E:\126156072\NLP\sample.txt", 'r') as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Calculate word frequency distribution
    fdist = FreqDist(words)

    # Calculate bigrams and their frequencies
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Calculate mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Sort collocations by mean probability
    collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)

    # Calculate mean of bigram frequencies
    total_bigram_freq = sum(bigram_freq.values())
    mean_bigram_freq = total_bigram_freq / len(bigram_freq)

    # Print mean bigram frequency
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Probability (μ-value)"]
    table = []
    for i, (bigram, mean_prob) in enumerate(collocations[:N]):
        table.append([i+1, bigram, bigram_freq[bigram], f"{mean_prob:.6f}"])
    print(tabulate(table, headers, tablefmt="orgtbl"))

if __name__ == "__main__":
    main()

Mean Bigram Frequency: 1.03

|   Rank | Bigram                 |   Frequency |   Mean Probability (μ-value) |
|--------+------------------------+-------------+------------------------------|
|      1 | ('clever', 'fox')      |           2 |                     0.011429 |
|      2 | ('forest', 'full')     |           2 |                     0.011429 |
|      3 | ('forest', 'fox')      |           2 |                     0.011429 |
|      4 | ('realized', 'forest') |           2 |                     0.011429 |
|      5 | ('new', 'friends')     |           2 |                     0.011429 |
|      6 | ('upon', 'time')       |           1 |                     0.005714 |
|      7 | ('time', 'lush')       |           1 |                     0.005714 |
|      8 | ('lush', 'green')      |           1 |                     0.005714 |
|      9 | ('green', 'forest')    |           1 |                     0.005714 |
|     10 | ('forest', 'lived')    |           1 |                     0.005714 |

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from tabulate import tabulate
from scipy.stats import ttest_1samp, chi2_contingency
import numpy as np

def preprocess_text(text: str) -> list[str]:
    """Preprocesses the text by tokenizing, removing punctuation and stopwords."""
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:
    """Calculates the mean probability (μ-value) of each bigram."""
    mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return mean_probabilities

def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):
    """Perform t-test and chi-square test for each bigram."""
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)
        
        # Expected frequency for the bigram assuming independence
        expected_freq = (freq_w1 * freq_w2) / total_bigrams
        
        # Chi-square test
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]
        ])
        
        try:
            chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)
        except ValueError:
            chi2_stat, p_value_chi2 = np.nan, np.nan
        
        # Generate a list of observed frequencies to simulate multiple observations for t-test
        observed_frequencies = [observed_freq] * 10  # Simulating 10 observations
        
        # Perform one-sample t-test
        t_stat, p_value_t = ttest_1samp(observed_frequencies, expected_freq)
        
        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))
    
    return results

def main():
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open(r"E:\126156072\NLP\text1.txt", 'r') as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Calculate word frequency distribution
    word_freq = FreqDist(words)

    # Calculate bigrams and their frequencies
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Calculate mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Sort collocations by mean probability
    collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)

    # Calculate mean of bigram frequencies
    total_bigram_freq = sum(bigram_freq.values())
    mean_bigram_freq = total_bigram_freq / len(bigram_freq)

    # Print mean bigram frequency
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Perform statistical tests for each bigram
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Probability (μ-value)", "t-Statistic", "p-Value (t-Test)", "Chi2 Statistic", "p-Value (Chi-Square)"]
    table = []
    for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):
        table.append([
            i+1, 
            bigram, 
            observed_freq, 
            f"{mean_probabilities.get(bigram, 0):.6f}", 
            f"{t_stat:.4f}", 
            f"{p_value_t:.4f}", 
            f"{chi2_stat:.4f}", 
            f"{p_value_chi2:.4f}"
        ])
    
    print(tabulate(table, headers, tablefmt="orgtbl"))

if __name__ == "__main__":
    main()


Mean Bigram Frequency: 1.17

|   Rank | Bigram                         |   Frequency |   Mean Probability (μ-value) |   t-Statistic |   p-Value (t-Test) |   Chi2 Statistic |   p-Value (Chi-Square) |
|--------+--------------------------------+-------------+------------------------------+---------------+--------------------+------------------+------------------------|
|      1 | ('impact', 'artificial')       |           2 |                     0.002699 |           inf |                  0 |         183.747  |                 0      |
|      2 | ('artificial', 'intelligence') |           3 |                     0.004049 |           inf |                  0 |         307.077  |                 0      |
|      3 | ('intelligence', 'data')       |           3 |                     0.004049 |           inf |                  0 |          12.4097 |                 0.0004 |
|      4 | ('data', 'science')            |          15 |                     0.020243 |           inf |                 

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import bigrams
from collections import Counter
import numpy as np
from scipy.stats import ttest_1samp, chi2_contingency
from tabulate import tabulate

# Download the necessary datasets if you haven't already

# Define the set of English stopwords
english_stops = set(stopwords.words('english'))

# Function to process the text file
def process_text_file(file_path):
    """Reads the text from the file, tokenizes, and filters stopwords."""
    with open(file_path, 'r') as file:
        text = file.read()
    words = word_tokenize(text)
    filtered_words = [word.lower() for word in words if word.lower() not in english_stops and word.isalpha()]
    return filtered_words

# Function to calculate word and bigram frequencies
def calculate_frequencies(words):
    """Calculates word and bigram frequencies."""
    word_freq = Counter(words)
    bigram_list = list(bigrams(words))
    bigram_freq = Counter(bigram_list)
    return word_freq, bigram_freq

# Function to calculate the mean frequency of each bigram
def calculate_bigram_means(bigram_freq, total_bigrams):
    """Calculates the mean frequency of each bigram."""
    bigram_means = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return bigram_means

# Function to perform t-Test and Chi-Square test for each bigram
def perform_statistical_tests(filtered_words):
    """Performs t-Test and Chi-Square test for each bigram."""
    bigram_list = list(bigrams(filtered_words))
    bigram_freq = Counter(bigram_list)
    word_freq = Counter(filtered_words)
    corpus_size = len(filtered_words)
    
    results = []
    
    for bigram, freq_bi in bigram_freq.items():
        freq_w1 = word_freq[bigram[0]]
        freq_w2 = word_freq[bigram[1]]
        expected_freq = (freq_w1 * freq_w2) / corpus_size
        
        if freq_bi == 0:
            freq_bi = 1
        
        observed = np.array([
            [freq_bi, freq_w1 - freq_bi],
            [freq_w2 - freq_bi, corpus_size - (freq_w1 + freq_w2 - freq_bi)]
        ])
        
        try:
            chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)
        except ValueError:
            chi2_stat, p_value_chi2 = np.nan, np.nan
        
        observed_frequencies = [freq_bi] * 10
        t_stat, p_value_t = ttest_1samp(observed_frequencies, expected_freq)
        
        results.append((bigram, freq_bi, t_stat, p_value_t, chi2_stat, p_value_chi2))
    
    return results

def main():
    # Path to the text file
    file_path = r"E:\126156072\NLP\text1.txt"
    
    # Process the text file
    filtered_words = process_text_file(file_path)
    
    # Calculate frequencies
    word_freq, bigram_freq = calculate_frequencies(filtered_words)
    
    # Calculate total number of bigrams
    total_bigrams = sum(bigram_freq.values())
    
    # Calculate mean frequency of each bigram
    bigram_means = calculate_bigram_means(bigram_freq, total_bigrams)
    
    # Sort bigrams by mean probability and get the top 5
    top_bigrams = sorted(bigram_means.items(), key=lambda x: x[1], reverse=True)[:10]
    
    # Perform statistical tests
    results = perform_statistical_tests(filtered_words)
    
    # Filter results to include only the top 5 bigrams
    results_dict = {bigram: (freq_bi, t_stat, p_value_t, chi2_stat, p_value_chi2) for bigram, freq_bi, t_stat, p_value_t, chi2_stat, p_value_chi2 in results}
    
    filtered_results = [(bigram, results_dict[bigram][0], results_dict[bigram][1], results_dict[bigram][2], results_dict[bigram][3], results_dict[bigram][4]) for bigram, _ in top_bigrams]
    
    # Print results
    headers = [ "Bigram", "Frequency", "Mean Probability (μ-value)", "t-Statistic", "p-Value (t-Test)", "Chi2 Statistic", "p-Value (Chi-Square)"]
    table = []
    for i, (bigram, freq_bi, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(filtered_results):
        table.append([
          
            bigram, 
            freq_bi, 
            f"{bigram_means.get(bigram, 0):.6f}", 
            f"{t_stat:.4f}", 
            f"{p_value_t:.4f}", 
            f"{chi2_stat:.4f}", 
            f"{p_value_chi2:.4f}"
        ])
    
    print(tabulate(table, headers, tablefmt="orgtbl"))

if __name__ == "__main__":
    main()


| Bigram                      |   Frequency |   Mean Probability (μ-value) |   t-Statistic |   p-Value (t-Test) |   Chi2 Statistic |   p-Value (Chi-Square) |
|-----------------------------+-------------+------------------------------+---------------+--------------------+------------------+------------------------|
| ('data', 'science')         |          15 |                     0.020243 |           inf |                  0 |         167.725  |                 0      |
| ('data', 'processing')      |           7 |                     0.009447 |           inf |                  0 |          32.6757 |                 0      |
| ('predictive', 'analytics') |           5 |                     0.006748 |           inf |                  0 |         499.265  |                 0      |
| ('data', 'visualization')   |           5 |                     0.006748 |           inf |                  0 |          37.8899 |                 0      |
| ('ai', 'data')              |           4 |       

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from tabulate import tabulate
from scipy.stats import chi2_contingency, ttest_1samp
import numpy as np

def preprocess_text(text: str) -> list[str]:
    """Preprocesses the text by tokenizing, removing punctuation and stopwords."""
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    return [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

def calculate_mean_probability(bigram_freq: FreqDist, total_bigrams: int) -> dict:
    """Calculates the mean probability (μ-value) of each bigram."""
    mean_probabilities = {bigram: freq / total_bigrams for bigram, freq in bigram_freq.items()}
    return mean_probabilities

def perform_statistical_tests(bigram_freq: FreqDist, word_freq: FreqDist, total_bigrams: int):
    """Perform t-test and chi-square test for each bigram."""
    results = []

    for bigram, observed_freq in bigram_freq.items():
        word1, word2 = bigram
        freq_w1 = word_freq.get(word1, 0)
        freq_w2 = word_freq.get(word2, 0)
        
        # Expected frequency for the bigram assuming independence
        expected_freq = (freq_w1 * freq_w2) / total_bigrams
        
        # Chi-square test
        observed = np.array([
            [observed_freq, freq_w1 - observed_freq],
            [freq_w2 - observed_freq, total_bigrams - (freq_w1 + freq_w2 - observed_freq)]
        ])
        
        try:
            chi2_stat, p_value_chi2, dof, ex = chi2_contingency(observed)
        except ValueError:
            chi2_stat, p_value_chi2 = np.nan, np.nan
        
        # Generate sample data to perform t-test
        sample_data = [observed_freq] * observed_freq + [expected_freq] * (total_bigrams - observed_freq)
        
        # Perform one-sample t-test
        t_stat, p_value_t = ttest_1samp(sample_data, expected_freq)
        
        results.append((bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2))
    
    return results

def main():
    # Download required NLTK data
    # nltk.download('punkt')
    # nltk.download('stopwords')

    # Load text
    with open(r"E:\126156072\NLP\sample.txt", 'r') as file:
        text = file.read()

    # Preprocess text
    words = preprocess_text(text)

    # Calculate word frequency distribution
    word_freq = FreqDist(words)

    # Calculate bigrams and their frequencies
    bigrams = list(nltk.bigrams(words))
    bigram_freq = FreqDist(bigrams)

    # Calculate mean probability (μ-value) for each bigram
    mean_probabilities = calculate_mean_probability(bigram_freq, len(bigrams))

    # Sort collocations by mean probability
    collocations = sorted(mean_probabilities.items(), key=lambda x: x[1], reverse=True)

    # Calculate mean of bigram frequencies
    total_bigram_freq = sum(bigram_freq.values())
    mean_bigram_freq = total_bigram_freq / len(bigram_freq)

    # Print mean bigram frequency
    print(f"Mean Bigram Frequency: {mean_bigram_freq:.2f}\n")

    # Perform statistical tests for each bigram
    results = perform_statistical_tests(bigram_freq, word_freq, len(bigrams))

    # Print top N collocations with their frequencies and mean probabilities
    N = 10
    headers = ["Rank", "Bigram", "Frequency", "Mean Prob(μ)", "t-Statistic", "p-Value(t-Test)", "Chi Square", "p-Value(Chi-Square)"]
    table = []
    for i, (bigram, observed_freq, t_stat, p_value_t, chi2_stat, p_value_chi2) in enumerate(results[:N]):
        table.append([
            i + 1,
            bigram,
            observed_freq,
            f"{mean_probabilities.get(bigram, 0):.6f}",
            f"{t_stat:.4f}" if not np.isnan(t_stat) else "NaN",
            f"{p_value_t:.4f}" if not np.isnan(p_value_t) else "NaN",
            f"{chi2_stat:.4f}" if not np.isnan(chi2_stat) else "NaN",
            f"{p_value_chi2:.4f}" if not np.isnan(p_value_chi2) else "NaN"
        ])
    print(tabulate(table, headers, tablefmt="grid"))

if __name__ == "__main__":
    main()

Mean Bigram Frequency: 1.03

+--------+------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|   Rank | Bigram                 |   Frequency |   Mean Prob(μ) |   t-Statistic |   p-Value(t-Test) |   Chi Square |   p-Value(Chi-Square) |
|      1 | ('upon', 'time')       |           1 |       0.005714 |        1      |            0.3187 |      43.2486 |                0      |
+--------+------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|      2 | ('time', 'lush')       |           1 |       0.005714 |        1      |            0.3187 |      43.2486 |                0      |
+--------+------------------------+-------------+----------------+---------------+-------------------+--------------+-----------------------+
|      3 | ('lush', 'green')      |           1 |       0.005714 |        1      |            0.3187 |      43.2486 |  