In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path, encodings=('utf-8', 'latin1')):
    """Read a CSV file into a DataFrame, trying several text encodings in order.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        encodings: Encoding names to try, in order. The default tries UTF-8
            first and falls back to Latin-1.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        ValueError: If every candidate encoding raises ``UnicodeDecodeError``.
            The last decoding error is attached as the cause.
    """
    # NOTE: 'ISO-8859-1' is just another name for the 'latin1' codec, so the
    # previous third attempt could never behave differently from the second.
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember the failure and move on to the next candidate.
            last_error = err
    raise ValueError("Failed to decode the file with tried encodings.") from last_error

# Load the dataset
data = load_data('df_all_group3.csv')

# Extract the non-missing entries of the 'sentence' column
sentences = data['sentence'].dropna().tolist()

# Tokenize each sentence exactly once; the per-sentence token lists feed both
# the average sentence length and the flat, corpus-wide token list.
tokenized_sentences = [word_tokenize(sent) for sent in sentences]
words = [word for tokens in tokenized_sentences for word in tokens]

# Number of sentences
num_sentences = len(sentences)

# Average sentence length in tokens (NaN when there are no sentences)
if tokenized_sentences:
    avg_sentence_length = np.mean([len(tokens) for tokens in tokenized_sentences])
else:
    avg_sentence_length = np.nan

# Number of word tokens (every occurrence counts)
num_word_tokens = len(words)

# Number of word types (distinct tokens)
num_word_types = len(set(words))

# Yule's characteristic K = 10^4 * (sum_m m^2 * V_m - N) / N^2, where V_m is
# the number of word types occurring exactly m times and N the token count.
word_counts = Counter(words)
V_m = Counter(word_counts.values())
if num_word_tokens:
    K = 10**4 * (sum(m**2 * V_m[m] for m in V_m) - num_word_tokens) / num_word_tokens**2
else:
    # K is undefined for an empty corpus; avoid dividing by zero.
    K = np.nan

# Collect results
results = {
    'Number of Sentences': num_sentences,
    'Average Sentence Length': avg_sentence_length,
    'Number of Word Tokens': num_word_tokens,
    'Number of Word Types': num_word_types,
    'Yule’s Characteristic K': K
}

# Print results as a single-row DataFrame
results_df = pd.DataFrame([results])
print(results_df)


In [None]:
# Bare expression: in a notebook the value of a cell's last expression is
# rendered as the cell output, so this shows the summary table built above.
results_df

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path, encodings=('utf-8', 'latin1')):
    """Read a CSV file into a DataFrame, trying several text encodings in order.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        encodings: Encoding names to try, in order. The default tries UTF-8
            first and falls back to Latin-1.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        ValueError: If every candidate encoding raises ``UnicodeDecodeError``.
            The last decoding error is attached as the cause.
    """
    # NOTE: 'ISO-8859-1' is just another name for the 'latin1' codec, so the
    # previous third attempt could never behave differently from the second.
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember the failure and move on to the next candidate.
            last_error = err
    raise ValueError("Failed to decode the file with tried encodings.") from last_error

# Function to analyze text data for a specific label
def analyze_text(sentences):
    """Compute basic corpus statistics for a list of sentences.

    Each sentence is split with ``word_tokenize``; all counts are taken over
    the tokens exactly as the tokenizer returns them.

    Args:
        sentences: List of sentence strings.

    Returns:
        A dict with the keys 'Number of Sentences', 'Average Sentence Length'
        (mean tokens per sentence), 'Number of Word Tokens', 'Number of Word
        Types' (distinct tokens) and 'Yule’s Characteristic K'. The average
        is NaN when there are no sentences and K is NaN when there are no
        tokens, since both are undefined in those cases.
    """
    # Tokenize once and reuse the result for both the per-sentence lengths
    # and the flat token list.
    tokenized = [word_tokenize(sent) for sent in sentences]
    words = [word for tokens in tokenized for word in tokens]

    num_sentences = len(sentences)
    num_word_tokens = len(words)

    word_counts = Counter(words)
    num_word_types = len(word_counts)

    if tokenized:
        avg_sentence_length = np.mean([len(tokens) for tokens in tokenized])
    else:
        avg_sentence_length = np.nan

    if num_word_tokens:
        # Yule's K = 10^4 * (sum_m m^2 * V_m - N) / N^2, with V_m the number
        # of word types that occur exactly m times and N the token count.
        V_m = Counter(word_counts.values())
        weighted_squares = sum(m**2 * n_types for m, n_types in V_m.items())
        K = 10**4 * (weighted_squares - num_word_tokens) / num_word_tokens**2
    else:
        # No tokens: the formula would divide by zero.
        K = np.nan

    return {
        'Number of Sentences': num_sentences,
        'Average Sentence Length': avg_sentence_length,
        'Number of Word Tokens': num_word_tokens,
        'Number of Word Types': num_word_types,
        'Yule’s Characteristic K': K
    }

# Load the dataset
data = load_data('df_all_group3.csv')

# Treat the 'label' column as strings so the comparisons below are
# type-consistent
data['label'] = data['label'].astype(str)

# Label mapping: short text label -> numeric code, plus the reverse lookup
label_mapping = {'n': 0, 'na': 1, 'k': 2, 'ka': 3}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# The 'label' column is used directly as the numeric code; after the cast
# above it holds the codes as text.
# NOTE(review): assumes the CSV stores the codes 0-3 in 'label' -- confirm.
data['numerical_label'] = data['label']

# Analyze text data for each label
results = []
for numerical_label, text_label in inverse_label_mapping.items():
    # 'numerical_label' holds strings, so compare against the string form of
    # the code; comparing with the bare int never matches any row.
    group = data[data['numerical_label'] == str(numerical_label)]
    sentences = group['sentence'].dropna().tolist()
    if sentences:  # Skip labels that have no sentences
        analysis = analyze_text(sentences)
        analysis['Numerical Label'] = numerical_label
        analysis['Text Label'] = text_label
        results.append(analysis)

# Perform analysis on the entire dataset
all_sentences = data['sentence'].dropna().tolist()
total_analysis = analyze_text(all_sentences)
total_analysis['Numerical Label'] = 'Total'
total_analysis['Text Label'] = 'Total'

# Append the total analysis to the results
results.append(total_analysis)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Reorder columns to have 'Numerical Label' and 'Text Label' first
results_df = results_df[['Numerical Label', 'Text Label', 'Number of Sentences', 'Average Sentence Length', 'Number of Word Tokens', 'Number of Word Types', 'Yule’s Characteristic K']]

print(results_df)


In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

# Function to load data with different encodings
def load_data(file_path, encodings=('utf-8', 'latin1')):
    """Read a CSV file into a DataFrame, trying several text encodings in order.

    Args:
        file_path: Location of the CSV file; passed straight to ``pd.read_csv``.
        encodings: Encoding names to try, in order. The default tries UTF-8
            first and falls back to Latin-1.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        ValueError: If every candidate encoding raises ``UnicodeDecodeError``.
            The last decoding error is attached as the cause.
    """
    # NOTE: 'ISO-8859-1' is just another name for the 'latin1' codec, so the
    # previous third attempt could never behave differently from the second.
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember the failure and move on to the next candidate.
            last_error = err
    raise ValueError("Failed to decode the file with tried encodings.") from last_error

# Function to analyze text data for a specific label
def analyze_text(sentences):
    """Compute basic corpus statistics for a list of sentences.

    Each sentence is split with ``word_tokenize``; all counts are taken over
    the tokens exactly as the tokenizer returns them.

    Args:
        sentences: List of sentence strings.

    Returns:
        A dict with the keys 'Number of Sentences', 'Average Sentence Length'
        (mean tokens per sentence), 'Number of Word Tokens', 'Number of Word
        Types' (distinct tokens) and 'Yule’s Characteristic K'. The average
        is NaN when there are no sentences and K is NaN when there are no
        tokens, since both are undefined in those cases.
    """
    # Tokenize once and reuse the result for both the per-sentence lengths
    # and the flat token list.
    tokenized = [word_tokenize(sent) for sent in sentences]
    words = [word for tokens in tokenized for word in tokens]

    num_sentences = len(sentences)
    num_word_tokens = len(words)

    word_counts = Counter(words)
    num_word_types = len(word_counts)

    if tokenized:
        avg_sentence_length = np.mean([len(tokens) for tokens in tokenized])
    else:
        avg_sentence_length = np.nan

    if num_word_tokens:
        # Yule's K = 10^4 * (sum_m m^2 * V_m - N) / N^2, with V_m the number
        # of word types that occur exactly m times and N the token count.
        V_m = Counter(word_counts.values())
        weighted_squares = sum(m**2 * n_types for m, n_types in V_m.items())
        K = 10**4 * (weighted_squares - num_word_tokens) / num_word_tokens**2
    else:
        # No tokens: the formula would divide by zero.
        K = np.nan

    return {
        'Number of Sentences': num_sentences,
        'Average Sentence Length': avg_sentence_length,
        'Number of Word Tokens': num_word_tokens,
        'Number of Word Types': num_word_types,
        'Yule’s Characteristic K': K
    }

# Load the dataset
data = load_data('df_all_group3.csv')

# Ensure 'label' column is treated as string
data['label'] = data['label'].astype(str)

# Label mapping based on the table (assuming labels 0, 1, 2, 3 match with ngoko, ngoko alus, krama, krama alus)
label_mapping = {0: 'Ngoko', 1: 'Ngoko Alus', 2: 'Krama', 3: 'Krama Alus'}
inverse_label_mapping = {v: k for k, v in label_mapping.items()}

# The 'label' column already carries the numeric codes (as text after the
# cast above), so it is copied as-is rather than mapped.
data['numerical_label'] = data['label']

# Analyze text data for each label
results = []
for numerical_label, text_label in label_mapping.items():
    # The column holds strings, so compare against the code's string form.
    group = data[data['numerical_label'] == str(numerical_label)]
    sentences = group['sentence'].dropna().tolist()
    if sentences:  # Skip labels that have no sentences
        analysis = analyze_text(sentences)
        analysis['Numerical Label'] = numerical_label
        analysis['Text Label'] = text_label
        results.append(analysis)

# Perform analysis on the entire dataset
all_sentences = data['sentence'].dropna().tolist()
total_analysis = analyze_text(all_sentences)
total_analysis['Numerical Label'] = 'Total'
total_analysis['Text Label'] = 'Total'

# Append the total analysis to the results
results.append(total_analysis)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Reorder columns to match the format in the image. The explicit copy makes
# the column subset an independent frame, so the assignments below do not
# trigger pandas' SettingWithCopyWarning.
results_df = results_df[['Text Label', 'Number of Sentences', 'Average Sentence Length', 'Number of Word Tokens', 'Number of Word Types', 'Yule’s Characteristic K']].copy()

# Format the numerical columns to two decimal places (values become strings)
results_df['Average Sentence Length'] = results_df['Average Sentence Length'].apply(lambda x: f"{x:.2f}")
results_df['Yule’s Characteristic K'] = results_df['Yule’s Characteristic K'].apply(lambda x: f"{x:.2f}")
