In [1]:
import sys
# Show which interpreter and Python version this kernel is running on,
# one value per line.
print(sys.executable, sys.version, sep="\n")

D:\Anaconda\python.exe
3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


In [2]:
import os
import re
import pandas as pd

def _split_sentences(text):
    """Split one cell of text into a list of sentences.

    Missing values (None / NaN / NA) and blank text return an empty list
    instead of being stringified, so they never produce a bogus 'nan' or
    empty-string sentence.
    """
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return []
    cleaned = str(text).strip()
    if not cleaned:
        return []
    # Break after sentence-ending punctuation that is followed by whitespace.
    return re.split(r'(?<=[.!?])\s+', cleaned)


def _read_table(file_path):
    """Load file_path as a DataFrame, or return None when it holds no table.

    Some '.xls' exports are really HTML tables saved under an Excel
    extension, so a failed Excel read falls back to an HTML read.
    Errors raised by the HTML read propagate to the caller.
    """
    try:
        # First, try reading as true Excel
        df = pd.read_excel(file_path, engine='xlrd')
    except Exception as e:
        print(f"⚠️ Excel read failed: {e}. Trying HTML read...")
    else:
        print("Read as Excel.")
        return df

    # If that failed, try reading as an HTML table
    html_tables = pd.read_html(file_path)
    if len(html_tables) == 0:
        # Defensive guard kept from the original implementation.
        print(f"No table found in {file_path}, skipping.")
        return None
    print("Read as HTML.")
    return html_tables[0]


def tokenize_and_save(input_folder, output_folder, input_month):
    """Split the 'Content' column of matching exports into sentences and save one CSV.

    Every file in ``input_folder`` whose name ends with '.xls' and contains
    'cyberlib_raw_<input_month>' is read. Its 'Content' column is replaced by
    a 'Sentences' column holding one sentence per row; the other columns are
    repeated for each sentence. All files are concatenated and written to
    '<output_folder>/tokenized_<input_month with "-" replaced by "_">.csv'.

    Args:
        input_folder: Directory containing the raw '.xls' exports.
        output_folder: Directory for the combined CSV (created if missing).
        input_month: Month tag that must appear in the file names, e.g. '2020-02'.

    Returns:
        None. Progress and problems are reported with print().

    Notes:
        * Files are processed in sorted name order so the output is
          reproducible across runs and platforms.
        * A file that cannot be read or lacks a 'Content' column is reported
          and skipped; it does not abort the run.
        * A row whose 'Content' is missing or blank is kept, with an empty
          'Sentences' value.
    """
    name_prefix = f'cyberlib_raw_{input_month}'
    # os.listdir() order is arbitrary, so sort for a deterministic row order.
    all_files = sorted(
        os.path.join(input_folder, f)
        for f in os.listdir(input_folder)
        if f.endswith('.xls') and name_prefix in f
    )

    # Print found files
    print(f"Detected files: {all_files}")

    # Collect one exploded DataFrame per successfully processed file
    list_of_dfs = []

    for file_path in all_files:
        print(f"Processing: {file_path}")
        try:
            df = _read_table(file_path)
            if df is None:
                continue

            if 'Content' not in df.columns:
                print(f"'Content' column not found in {file_path}, skipping.")
                continue

            # Tokenize, then emit one row per sentence
            sentences = df['Content'].apply(_split_sentences)
            df_exploded = (
                df.drop(columns=['Content'])
                .assign(Sentences=sentences)
                .explode('Sentences')
                .reset_index(drop=True)
            )
            list_of_dfs.append(df_exploded)

        except Exception as e:
            # Best effort: report the bad file and carry on with the rest.
            print(f"⚠️ Failed to process {file_path}: {e}")
            continue

    # === Save the combined result ===
    if not list_of_dfs:
        print(f"⚠️ No data to save for {input_month}.")
        return

    combined_df = pd.concat(list_of_dfs, ignore_index=True)

    os.makedirs(output_folder, exist_ok=True)
    # Build the '.csv' name directly; computed outside the f-string because
    # reusing the same quote type inside an f-string needs Python 3.12+.
    month_tag = input_month.replace('-', '_')
    output_file = os.path.join(output_folder, f'tokenized_{month_tag}.csv')
    combined_df.to_csv(output_file, index=False)

    print(f"✅ Tokenized combined file saved successfully to: {output_file}")

In [4]:
# Example usage: tokenize the February 2020 cyberlib exports.
input_month = '2020-02'
input_folder = os.path.join('input', 'cyberlib', '2020')
output_folder = os.path.join('output', '2020')
tokenize_and_save(
    input_folder=input_folder,
    output_folder=output_folder,
    input_month=input_month,
)

Detected files: ['input\\cyberlib\\2020\\cyberlib_raw_2020-02-01.xls', 'input\\cyberlib\\2020\\cyberlib_raw_2020-02-18.xls']
Processing: input\cyberlib\2020\cyberlib_raw_2020-02-01.xls
⚠️ Excel read failed: Unsupported format, or corrupt file: Expected BOF record; found b'<table b'. Trying HTML read...
Read as HTML.
Processing: input\cyberlib\2020\cyberlib_raw_2020-02-18.xls
⚠️ Excel read failed: Unsupported format, or corrupt file: Expected BOF record; found b'<table b'. Trying HTML read...
Read as HTML.
✅ Tokenized combined file saved successfully to: output\2020\tokenized_2020_02.csv
