In [9]:
pip install lexical-diversity

Collecting lexical-diversity
  Downloading lexical_diversity-0.1.1-py3-none-any.whl.metadata (4.1 kB)
Downloading lexical_diversity-0.1.1-py3-none-any.whl (117 kB)
Installing collected packages: lexical-diversity
Successfully installed lexical-diversity-0.1.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode the file with tried encodings.")

# Load the dataset
data = load_data('df_all_group3.csv')

# Extract the 'sentence' column
sentences = data['sentence'].dropna().tolist()

# Tokenize each sentence into words
words = [word_tokenize(sent) for sent in sentences]
words = [word for sublist in words for word in sublist]

# Calculate the number of sentences
num_sentences = len(sentences)

# Calculate the average sentence length
avg_sentence_length = np.mean([len(word_tokenize(sent)) for sent in sentences])

# Calculate the number of word tokens
num_word_tokens = len(words)

# Calculate the number of word types
num_word_types = len(set(words))

# Calculate Yule’s characteristic K
word_counts = Counter(words)
V_m = Counter(word_counts.values())
K = 10**4 * (sum(m**2 * V_m[m] for m in V_m) - num_word_tokens) / num_word_tokens**2

# Display results
results = {
    'Number of Sentences': num_sentences,
    'Average Sentence Length': avg_sentence_length,
    'Number of Word Tokens': num_word_tokens,
    'Number of Word Types': num_word_types,
    'Yule’s Characteristic K': K
}

# Print results as a DataFrame
results_df = pd.DataFrame([results])
print(results_df)


   Number of Sentences  Average Sentence Length  Number of Word Tokens   
0                 4024                 9.628728                  38746  \

   Number of Word Types  Yule’s Characteristic K  
0                  6156               105.434295  


In [2]:
results_df

Unnamed: 0,Number of Sentences,Average Sentence Length,Number of Word Tokens,Number of Word Types,Yule’s Characteristic K
0,4024,9.628728,38746,6156,105.434295


In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode the file with tried encodings.")

# Function to analyze text data for a specific label
def analyze_text(sentences):
    words = [word_tokenize(sent) for sent in sentences]
    words = [word for sublist in words for word in sublist]

    num_sentences = len(sentences)
    avg_sentence_length = np.mean([len(word_tokenize(sent)) for sent in sentences])
    num_word_tokens = len(words)
    num_word_types = len(set(words))

    word_counts = Counter(words)
    V_m = Counter(word_counts.values())
    K = 10**4 * (sum(m**2 * V_m[m] for m in V_m) - num_word_tokens) / num_word_tokens**2

    return {
        'Number of Sentences': num_sentences,
        'Average Sentence Length': avg_sentence_length,
        'Number of Word Tokens': num_word_tokens,
        'Number of Word Types': num_word_types,
        'Yule’s Characteristic K': K
    }

# Load the dataset
data = load_data('df_all_group3.csv')

# # Ensure 'label' column is treated as string
data['label'] = data['label'].astype(str)

# # Label mapping
label_mapping = {'n': 0, 'na': 1, 'k': 2, 'ka': 3}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Map textual labels to numerical labels
# data['numerical_label'] = data['class'].map(label_mapping)
data['numerical_label'] = data['label']

# Analyze text data for each label
results = []
for numerical_label, text_label in inverse_label_mapping.items():
    group = data[data['numerical_label'] == numerical_label]
    sentences = group['sentence'].dropna().tolist()
    if sentences:  # Ensure there are sentences to analyze
        analysis = analyze_text(sentences)
        analysis['Numerical Label'] = numerical_label
        analysis['Text Label'] = text_label
        results.append(analysis)

# Perform analysis on the entire dataset
all_sentences = data['sentence'].dropna().tolist()
total_analysis = analyze_text(all_sentences)
total_analysis['Numerical Label'] = 'Total'
total_analysis['Text Label'] = 'Total'

# Append the total analysis to the results
results.append(total_analysis)

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)

# Reorder columns to have 'Numerical Label' and 'Text Label' first
results_df = results_df[['Numerical Label', 'Text Label', 'Number of Sentences', 'Average Sentence Length', 'Number of Word Tokens', 'Number of Word Types', 'Yule’s Characteristic K']]

print(results_df)


  Numerical Label Text Label  Number of Sentences  Average Sentence Length   
0           Total      Total                 4024                 9.628728  \

   Number of Word Tokens  Number of Word Types  Yule’s Characteristic K  
0                  38746                  6156               105.434295  


In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode the file with tried encodings.")

# Function to analyze text data for a specific label
def analyze_text(sentences):
    words = [word_tokenize(sent) for sent in sentences]
    words = [word for sublist in words for word in sublist]

    num_sentences = len(sentences)
    avg_sentence_length = np.mean([len(word_tokenize(sent)) for sent in sentences])
    num_word_tokens = len(words)
    num_word_types = len(set(words))

    word_counts = Counter(words)
    V_m = Counter(word_counts.values())
    K = 10**4 * (sum(m**2 * V_m[m] for m in V_m) - num_word_tokens) / num_word_tokens**2

    return {
        'Number of Sentences': num_sentences,
        'Average Sentence Length': avg_sentence_length,
        'Number of Word Tokens': num_word_tokens,
        'Number of Word Types': num_word_types,
        'Yule’s Characteristic K': K
    }

# Load the dataset
data = load_data('df_all_group3.csv')

# Ensure 'label' column is treated as string
data['label'] = data['label'].astype(str)

# Label mapping based on the table (assuming labels 0, 1, 2, 3 match with ngoko, ngoko alus, krama, krama alus)
label_mapping = {0: 'Ngoko', 1: 'Ngoko Alus', 2: 'Krama', 3: 'Krama Alus'}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# Map textual labels to numerical labels
data['numerical_label'] = data['label']

# Analyze text data for each label
results = []
for numerical_label, text_label in label_mapping.items():
    group = data[data['numerical_label'] == str(numerical_label)]
    sentences = group['sentence'].dropna().tolist()
    if sentences:  # Ensure there are sentences to analyze
        analysis = analyze_text(sentences)
        analysis['Numerical Label'] = numerical_label
        analysis['Text Label'] = text_label
        results.append(analysis)

# Perform analysis on the entire dataset
all_sentences = data['sentence'].dropna().tolist()
total_analysis = analyze_text(all_sentences)
total_analysis['Numerical Label'] = 'Total'
total_analysis['Text Label'] = 'Total'

# Append the total analysis to the results
results.append(total_analysis)

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)

# Reorder columns to match the format in the image
results_df = results_df[['Text Label', 'Number of Sentences', 'Average Sentence Length', 'Number of Word Tokens', 'Number of Word Types', 'Yule’s Characteristic K']]

# Format the numerical columns to two decimal places
results_df['Average Sentence Length'] = results_df['Average Sentence Length'].apply(lambda x: f"{x:.2f}")
results_df['Yule’s Characteristic K'] = results_df['Yule’s Characteristic K'].apply(lambda x: f"{x:.2f}")


In [5]:
results_df

Unnamed: 0,Text Label,Number of Sentences,Average Sentence Length,Number of Word Tokens,Number of Word Types,Yule’s Characteristic K
0,Ngoko,1419,9.26,13142,3486,118.8
1,Ngoko Alus,590,10.07,5944,1527,108.13
2,Krama,1414,9.6,13572,3280,124.61
3,Krama Alus,601,10.13,6088,1530,115.14
4,Total,4024,9.63,38746,6156,105.43


In [17]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter
from lexical_diversity import lex_div as ld

# Function to load data with different encodings
def load_data(file_path):
    encodings = ['utf-8', 'latin1', 'ISO-8859-1']
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            continue
    raise ValueError("Failed to decode the file with tried encodings.")

# Function to analyze text data for a specific label
def analyze_text(sentences):
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]
    words = [word for sentence in tokenized_sentences for word in sentence]

    num_sentences = len(sentences)
    avg_sentence_length = np.mean([len(sentence) for sentence in tokenized_sentences])
    num_word_tokens = len(words)
    num_word_types = len(set(words))

    word_counts = Counter(words)
    V_m = Counter(word_counts.values())
    K = 10**4 * (sum(m**2 * V_m[m] for m in V_m) - num_word_tokens) / num_word_tokens**2

    # Calculate MATTR, MSTTR, and MTLD
    mattr = ld.mattr(words)
    msttr = ld.msttr(words)
    mtld = ld.mtld(words)

    return {
        'Number of Sentences': num_sentences,
        'Average Sentence Length': avg_sentence_length,
        'Number of Word Tokens': num_word_tokens,
        'Number of Word Types': num_word_types,
        'Yule’s Characteristic K': K,
        'MATTR': mattr,
        'MSTTR': msttr,
        'MTLD': mtld
    }

# Function to analyze and display results for a given dataset
def analyze_dataset(data, label_mapping):
    data['label'] = data['label'].astype(str)

    results = []
    mattr_values = []
    msttr_values = []
    mtld_values = []
    for numerical_label, text_label in label_mapping.items():
        group = data[data['label'] == str(numerical_label)]
        sentences = group['sentence'].dropna().tolist()
        if sentences:  # Ensure there are sentences to analyze
            analysis = analyze_text(sentences)
            analysis['Numerical Label'] = numerical_label
            analysis['Text Label'] = text_label
            results.append(analysis)
            mattr_values.append(analysis['MATTR'])
            msttr_values.append(analysis['MSTTR'])
            mtld_values.append(analysis['MTLD'])

    # Perform analysis on the entire dataset
    all_sentences = data['sentence'].dropna().tolist()
    total_analysis = analyze_text(all_sentences)
    total_analysis['Numerical Label'] = 'Total'
    total_analysis['Text Label'] = 'Total'
    total_analysis['MATTR'] = np.mean(mattr_values)  # Calculate total MATTR as the average of all labels
    total_analysis['MSTTR'] = np.mean(msttr_values)  # Calculate total MSTTR as the average of all labels
    total_analysis['MTLD'] = np.mean(mtld_values)  # Calculate total MTLD as the average of all labels

    # Append the total analysis to the results
    results.append(total_analysis)

    # Convert results to DataFrame and return
    results_df = pd.DataFrame(results)

    # Reorder columns to match the format in the image
    results_df = results_df[['Text Label', 'Number of Sentences', 'Average Sentence Length', 'Number of Word Tokens', 'Number of Word Types', 'Yule’s Characteristic K', 'MATTR', 'MSTTR', 'MTLD']]

    # Format the numerical columns to two decimal places
    for col in ['Average Sentence Length', 'Yule’s Characteristic K', 'MATTR', 'MSTTR', 'MTLD']:
        results_df[col] = results_df[col].astype(float).round(2)

    return results_df

# Label mapping based on the table (assuming labels 0, 1, 2, 3 match with ngoko, ngoko alus, krama, krama alus)
label_mapping = {0: 'Ngoko', 1: 'Ngoko Alus', 2: 'Krama', 3: 'Krama Alus'}

# Analyze and display results for in-domain dataset (df_all_group3.csv)
results_df_all_group3 = analyze_dataset(load_data('df_all_group3.csv'), label_mapping)
print("In-Domain Results for df_all_group3.csv:")
display(results_df_all_group3)

# Combine and analyze out-of-domain datasets (news.csv and magz.csv)
out_of_domain_data = pd.concat([load_data('news.csv'), load_data('magz.csv')], ignore_index=True)

# Analyze out-of-domain dataset
out_of_domain_results = analyze_dataset(out_of_domain_data, label_mapping)
print("Out-of-Domain Results for news.csv and magz.csv:")
display(out_of_domain_results)

In-Domain Results for df_all_group3.csv:


Unnamed: 0,Text Label,Number of Sentences,Average Sentence Length,Number of Word Tokens,Number of Word Types,Yule’s Characteristic K,MATTR,MSTTR,MTLD
0,Ngoko,1419,9.26,13142,3486,118.8,0.81,0.81,124.26
1,Ngoko Alus,590,10.07,5944,1527,108.13,0.81,0.81,106.08
2,Krama,1414,9.6,13572,3280,124.61,0.8,0.79,93.65
3,Krama Alus,601,10.13,6088,1530,115.14,0.8,0.8,94.81
4,Total,4024,9.63,38746,6156,105.43,0.8,0.8,104.7


Out-of-Domain Results for news.csv and magz.csv:


Unnamed: 0,Text Label,Number of Sentences,Average Sentence Length,Number of Word Tokens,Number of Word Types,Yule’s Characteristic K,MATTR,MSTTR,MTLD
0,Ngoko,885,15.86,14033,3402,98.79,0.79,0.79,83.72
1,Krama Alus,369,15.48,5712,1824,110.34,0.77,0.77,71.17
2,Total,1254,15.75,19745,4916,94.45,0.78,0.78,77.45
