In [2]:
# Count the total number of tokens in a CSV, TXT, or PDF file.

import pandas as pd
from tokenizer.token_counter import TokenCounter
from PyPDF2 import PdfReader

# Initialize the token counter once at module level so it can be shared by
# every call that counts tokens in this notebook.
# NOTE(review): "gpt-3.5-turbo" is presumably the model whose tokenization is
# used for counting -- confirm against TokenCounter's constructor.
counter = TokenCounter("gpt-3.5-turbo")

def count_tokens_from_file(file_path: str, text_column: str = None) -> int:
    """
    Count tokens in a file. Supports CSV, TXT, and PDF files.

    The file type is selected from the file extension, compared
    case-insensitively (so "report.PDF" is handled like "report.pdf").

    Args:
        file_path (str): Path to the file.
        text_column (str): Column name for text data (required for CSV files,
            unused for TXT and PDF files).

    Returns:
        int: Total token count in the file, as reported by
            ``counter.total_tokens`` for the extracted pieces of text.

    Raises:
        ValueError: If the extension is not .csv, .txt, or .pdf; if a CSV file
            is given without ``text_column``; or if ``text_column`` is not a
            column of the CSV file.
    """
    # Normalize once so the extension checks below ignore letter case.
    lowered_path = str(file_path).lower()

    if lowered_path.endswith(".csv"):
        # Validate the argument before paying for the file read.
        if text_column is None:
            raise ValueError("For CSV files, you must specify the 'text_column' argument.")
        df = pd.read_csv(file_path)
        if text_column not in df.columns:
            raise ValueError(f"Column '{text_column}' not found in the CSV file.")
        # Missing cells are dropped rather than counted.
        texts = df[text_column].dropna().tolist()
    elif lowered_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            # Strip each line once; blank lines are skipped.
            stripped_lines = (line.strip() for line in f)
            texts = [line for line in stripped_lines if line]
    elif lowered_path.endswith(".pdf"):
        reader = PdfReader(file_path)
        # Extract each page's text exactly once (extraction is the expensive
        # step); pages that yield no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        texts = [page_text for page_text in page_texts if page_text]
    else:
        raise ValueError("Unsupported file type. Only CSV, TXT, and PDF files are supported.")

    # Delegate the actual counting to the module-level token counter.
    return counter.total_tokens(texts)

# Example Usage
# Only the PDF example is active; the CSV and TXT examples are kept commented
# out as templates. Uncomment the matching path and call to use them.
# file_path_csv = "/path/to/your/file.csv"
# file_path_txt = "/path/to/your/file.txt"
# NOTE: absolute, machine-specific path -- change it to a PDF on your machine.
file_path_pdf = "/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/src/Extending the Linear Model with R.pdf"

# For CSV files, specify the text column
# csv_token_count = count_tokens_from_file(file_path_csv, text_column="content")  # Replace 'content' with your column name
# print(f"Total tokens in CSV file: {csv_token_count}")

# For TXT files
# txt_token_count = count_tokens_from_file(file_path_txt)
# print(f"Total tokens in TXT file: {txt_token_count}")

# For PDF files (no text_column needed)
pdf_token_count = count_tokens_from_file(file_path_pdf)
print(f"Total tokens in PDF file: {pdf_token_count}")

Total tokens in PDF file: 173288


In [None]:
import pandas as pd
from transformers import AutoTokenizer

# Load the CSV file into a DataFrame
# NOTE: absolute, machine-specific path -- change it to your own copy of the data.
file_path = "/Users/haigbedros/Desktop/MSDS/Capstone/CODE/ml-models-information-filtering/notebooks/lda_bert_processed_fake.csv"  # Replace with your file path
df = pd.read_csv(file_path)

# Load Hugging Face's tokenizer by its pretrained name; count_tokens below
# reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Replace with your tokenizer

# Count tokens in the 'text' column
def count_tokens(text):
    """Return the number of tokens the module-level tokenizer produces for *text*.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    NaN) count as zero tokens and are never passed to the tokenizer.
    """
    if not pd.isna(text):
        pieces = tokenizer.tokenize(text)
        return len(pieces)
    # Missing cell: nothing to tokenize.
    return 0

# Store the per-row token count of the 'text' column in a new 'token_count'
# column (missing cells contribute 0 via count_tokens).
df['token_count'] = df['text'].apply(count_tokens)

# Combine token counts (sum them up)
total_tokens = df['token_count'].sum()

# Print the total token count
print(f"Total Tokens in the 'text' column: {total_tokens}")

  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: [Errno 2] No such file or directory: 'your_file.csv'