# Save the ten-year data and download it to Google Drive:

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from google.colab import drive
from datasets import load_dataset, concatenate_datasets

# Mount Google Drive at /content/drive so the following cells can read and
# write files below /content/drive/MyDrive.
# NOTE(review): load_dataset and concatenate_datasets are not used by any cell
# visible in this notebook — confirm whether they are still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Root folder of the mounted Drive; printing its contents is a quick check that
# the mount succeeded and shows which folders are available.
path = "/content/drive/MyDrive/"
print(os.listdir(path))  # List all files & folders in MyDrive

In [None]:
# Folder on Drive that holds the saved dataset
load_path = "/content/drive/MyDrive/edgar_corpus_filtered"

files = os.listdir(load_path)
print("Files in dataset folder:", files)

# Report the on-disk size of every entry so empty Arrow shards are easy to spot
for entry in files:
    size_in_bytes = os.path.getsize(os.path.join(load_path, entry))
    print(f"{entry}: {size_in_bytes} bytes")

In [None]:
from datasets import load_from_disk

# Load the Arrow dataset stored in the `load_path` folder (set in the previous cell)
dataset = load_from_disk(load_path)

# Convert to Pandas
# (materialises the whole dataset as one in-memory DataFrame)
df = dataset.to_pandas()

# Quick sanity check: first rows and total row count
print(df.head())
print(f"Dataset contains {len(df)} rows")

Loading dataset from disk:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# Check basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
df.info()

# View sample rows
print(df.head())

# Check for missing values (count of NaN per column)
print(df.isna().sum())

In [None]:
import re
import pandas as pd
import csv

# Keep only the required columns
if "section_8" not in df.columns:
    raise ValueError("Column 'section_8' not found. Check available columns above.")

# Only 'section_8' is carried forward; 'filename' is not needed by any later step.
# Missing values are dropped BEFORE the cast to str: astype(str) would otherwise
# turn None/NaN into the literal strings "None"/"nan", which dropna() cannot see.
section_text = df["section_8"].dropna().astype(str)

# Column-wise .str methods replace the element-wise DataFrame.applymap call
# (deprecated in recent pandas releases) and treat every pattern as literal text.
section_text = (
    section_text
    .str.replace(".txt", "", regex=False)   # Remove '.txt'
    .str.strip()
    .str.replace("\n", " ", regex=False)    # Remove newlines from 'section_8'
)
# NOTE: after this step each filing is a single line, so the line-based rules in
# clean_text (including its minimum-length filter) act on the filing as a whole.

# Remove leftover 'nan' placeholders and empty strings
is_placeholder = section_text.str.strip().str.lower().isin(["nan", ""])
section_text = section_text[~is_placeholder]

# Rename column: the result is a one-column frame called "Input text"
df = section_text.to_frame(name="Input text")

# Define phrases to remove
# Matching is exact and case-sensitive (plain str.replace), so upper-case
# variants such as "FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA" are not removed here.
phrases_to_remove = [
    "Financial Statements and Supplementary Data",
    "Report of Independent Public Accountants",
    "Report of Independent Accountants",
    "Notes to Consolidated Financial Statements",
    "See accompanying Notes to Consolidated Financial Statements",
    "The information required by this item is incorporated herein by reference",
    "Filed herein by the Registrant",
    "The financial statements of Registrant are attached hereto as Exhibit.",
    "Reference is made to the Registrant's Annual Report to Shareholders.",
    "Filed herein by the Registrant with the Commission pursuant to Regulation",
    "All other schedules are omitted because they are not applicable or the required information is shown in the Consolidated Financial Statements or the Notes thereto.",
    "See accompanying Notes to Consolidated Financial Statements.",
    "Amounts are summarized as follows:"
]

# Runs of dots: either consecutive ("....") or dot leaders separated by blanks
# (". . . ."). In the spaced form a dot directly followed by a digit is left
# alone, so a decimal such as ".52" after a dot leader keeps its decimal point.
_DOT_RUN_RE = re.compile(r'\.{2,}|\.(?:[ \t]+\.(?!\d))+')


# Function to clean text
def clean_text(text, min_line_chars=2000):
    """Strip boilerplate and filler characters from one filing's text.

    Args:
        text: Raw text of one filing; None/NaN is accepted.
        min_line_chars: Minimum length (after stripping) a line must have to be
            kept. Defaults to 2000, the threshold originally hard-coded here.
            When newlines were already flattened upstream, the whole filing is
            one line, so this acts as a minimum document length.

    Returns:
        The cleaned text with surviving lines joined by single spaces, or ""
        when the input is missing or no line reaches ``min_line_chars``.
    """
    if pd.isna(text):
        return ""

    # Remove exact phrases, longest first. Several phrases contain shorter ones
    # from the same list (e.g. "See accompanying Notes to ..." contains
    # "Notes to ..."); removing the short one first would leave fragments such
    # as "See accompanying ." behind and the long phrase could never match.
    for phrase in sorted(phrases_to_remove, key=len, reverse=True):
        text = text.replace(phrase, '')

    # Remove occurrences of "Item X." where X is any number
    text = re.sub(r'\bItem\s+\d+\.\s*', '', text, flags=re.IGNORECASE)

    # Remove multiple dots (e.g., ..., . . . . .)
    text = _DOT_RUN_RE.sub(' ', text)

    # Remove multiple dashes (e.g., --, ----)
    text = re.sub(r'[-–]{2,}', ' ', text)

    # Remove lines full of dots, dashes, or mixed symbols
    text = re.sub(r'^[\s\.\-=_]+$', '', text, flags=re.MULTILINE)

    # Remove "- The" or similar unnecessary leading dashes with words
    text = re.sub(r'^\s*[-–]+ The', 'The', text, flags=re.MULTILINE)

    # Drop lines shorter than min_line_chars and join the rest with spaces.
    # The joined text contains no newlines, so no blank-line clean-up is needed.
    stripped_lines = (line.strip() for line in text.split("\n"))
    text = " ".join(line for line in stripped_lines if len(line) >= min_line_chars)

    return text.strip()

# Run the cleaner over every filing and swap the raw column for the cleaned one
df = (
    df.assign(cleaned_input=df["Input text"].apply(clean_text))
      .drop(columns=["Input text"], errors="ignore")
)

# Keep only the rows that still contain text after cleaning
has_text = df["cleaned_input"].notna() & df["cleaned_input"].ne('')
df = df.loc[has_text]

# Write the result to the working directory as a fully quoted UTF-8 CSV
df.to_csv("clean_edgar_corpus_from_arrow.csv", index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

print(f"Cleaning complete! Saved {len(df)} rows to 'clean_edgar_corpus_from_arrow.csv'")


In [None]:
# Preview the first rows of the cleaned frame
df.head()

In [None]:
import os
import csv

# Destination on Google Drive (the same folder the dataset was loaded from)
csv_save_path = "/content/drive/MyDrive/edgar_corpus_filtered/clean_edgar_corpus_from_arrow.csv"

# Create the parent folder when it is missing
drive_folder = os.path.dirname(csv_save_path)
os.makedirs(drive_folder, exist_ok=True)

# Persist the cleaned frame as a fully quoted UTF-8 CSV, without the index
df.to_csv(csv_save_path, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

print(f"Cleaning complete! Saved {len(df)} rows to '{csv_save_path}'")


In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from google.colab import drive
from datasets import load_dataset, concatenate_datasets

# Mount Google Drive at /content/drive so the CSV saved earlier can be read back.
# NOTE(review): load_dataset and concatenate_datasets are not used by any cell
# visible in this notebook — confirm whether they are still needed.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# List the dataset folder to confirm that the cleaned CSV is present
path = "/content/drive/MyDrive/edgar_corpus_filtered"
print(os.listdir(path))

['data-00000-of-00030.arrow', 'data-00001-of-00030.arrow', 'data-00002-of-00030.arrow', 'data-00003-of-00030.arrow', 'data-00004-of-00030.arrow', 'data-00005-of-00030.arrow', 'data-00006-of-00030.arrow', 'data-00007-of-00030.arrow', 'data-00008-of-00030.arrow', 'data-00009-of-00030.arrow', 'data-00010-of-00030.arrow', 'data-00011-of-00030.arrow', 'data-00012-of-00030.arrow', 'data-00013-of-00030.arrow', 'data-00014-of-00030.arrow', 'data-00015-of-00030.arrow', 'data-00016-of-00030.arrow', 'data-00017-of-00030.arrow', 'data-00018-of-00030.arrow', 'data-00019-of-00030.arrow', 'data-00020-of-00030.arrow', 'data-00021-of-00030.arrow', 'data-00022-of-00030.arrow', 'data-00023-of-00030.arrow', 'data-00024-of-00030.arrow', 'data-00025-of-00030.arrow', 'data-00026-of-00030.arrow', 'data-00027-of-00030.arrow', 'data-00028-of-00030.arrow', 'data-00029-of-00030.arrow', 'state.json', 'dataset_info.json', 'clean_edgar_corpus_from_arrow.csv']


In [None]:
import pandas as pd

# Reload the cleaned corpus from Drive; the file has a single 'cleaned_input' column
df = pd.read_csv("/content/drive/MyDrive/edgar_corpus_filtered/clean_edgar_corpus_from_arrow.csv")

In [None]:
# Preview the first rows of the reloaded corpus
df.head()

Unnamed: 0,cleaned_input
0,FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Le...
1,FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Co...
2,Index to Consolidated Financial Statements Rep...
3,FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Fi...
4,"FINANCIAL STATEMENTS Our financial statements,..."


In [None]:
# Define the text to remove
text_to_remove = "FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA"

# regex=False makes the literal match explicit: the default of `regex` differs
# between pandas versions, so relying on it can change behaviour (or emit
# warnings) depending on the installed release.
df['cleaned_input'] = df['cleaned_input'].str.replace(text_to_remove, "", regex=False)

In [None]:
# Expose the text under the column name 'input' from here on
df = df.rename(columns={'cleaned_input': 'input'})

In [None]:
# Number of filings in the corpus
len(df)

32891

That is too much: processing every row would cost a lot of credits. Let's first select only the last 10,000 rows (the 2019 filings are mostly toward the bottom, so this gives us more recent data).

# first try:

Using the last rows because they contain the more recent data:

In [None]:
# Select the last 10,000 rows
# (tail keeps the original row labels, so the slice starts at label 22891 here)
last_10000_rows = df.tail(10000)

# Print the selected rows
# (pandas abbreviates the printout to the first and last few rows)
print(last_10000_rows)

                                                   input
22891   INDEX TO CONSOLIDATED FINANCIAL STATEMENTS RE...
22892   REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOU...
22893  . REPORT OF INDEPENDENT REGISTERED PUBLIC ACCO...
22894  . Report of Independent Registered Public Acco...
22895  . MGE Energy Management's Report on Internal C...
...                                                  ...
32886   Pismo Coast Village, Inc. is responsible for ...
32887   CURAEGIS TECHNOLOGIES, INC. Contents Financia...
32888  The following financial information is include...
32889   The Company’s unaudited quarterly results for...
32890   REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOU...

[10000 rows x 1 columns]


To address a likely cause of overfitting, remove the repeated boilerplate text:

In [None]:
# Define the texts to remove
# NOTE(review): matching is case-sensitive, so "item8" only matches that exact
# lower-case spelling without a space — confirm this is the intended form.
texts_to_remove = ["FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA", "item8", "ACCOUNTING FIRM"]

# Remove the specified texts from the 'input' column.
# regex=False states explicitly that each entry is literal text; the default of
# `regex` differs between pandas versions.
for text in texts_to_remove:
    df['input'] = df['input'].str.replace(text, "", regex=False)

In [None]:
# Select the last 10,000 rows
# (re-taken from df so the slice reflects the text removal done in the previous cell)
last_10000_rows = df.tail(10000)

# Print the selected rows (optional)
print(last_10000_rows)

                                                   input
22891   INDEX TO CONSOLIDATED FINANCIAL STATEMENTS RE...
22892   REPORT OF INDEPENDENT REGISTERED PUBLIC  To t...
22893  . REPORT OF INDEPENDENT REGISTERED PUBLIC  To ...
22894  . Report of Independent Registered Public Acco...
22895  . MGE Energy Management's Report on Internal C...
...                                                  ...
32886   Pismo Coast Village, Inc. is responsible for ...
32887   CURAEGIS TECHNOLOGIES, INC. Contents Financia...
32888  The following financial information is include...
32889   The Company’s unaudited quarterly results for...
32890   REPORT OF INDEPENDENT REGISTERED PUBLIC  To t...

[10000 rows x 1 columns]


In [None]:
# Define the text to remove
text_to_remove = "REPORT OF INDEPENDENT REGISTERED PUBLIC"

# regex=False: the heading is literal text, and the default of `regex` differs
# between pandas versions, so it is stated explicitly.
df['input'] = df['input'].str.replace(text_to_remove, "", regex=False)

In [None]:
# Select the last 10,000 rows
# (re-taken from df so the slice reflects the heading removal done in the previous cell)
last_10000_rows = df.tail(10000)

# Print the selected rows
print(last_10000_rows)

                                                   input
22891   INDEX TO CONSOLIDATED FINANCIAL STATEMENTS   ...
22892     To the Board of Directors and Shareowners o...
22893  .   To the Board of Directors and Stockholders...
22894  . Report of Independent Registered Public Acco...
22895  . MGE Energy Management's Report on Internal C...
...                                                  ...
32886   Pismo Coast Village, Inc. is responsible for ...
32887   CURAEGIS TECHNOLOGIES, INC. Contents Financia...
32888  The following financial information is include...
32889   The Company’s unaudited quarterly results for...
32890     To the Board of Directors and Stockholders ...

[10000 rows x 1 columns]


In [None]:
# Continue with an independent copy of the 10,000-row slice; from here on `df`
# no longer refers to the full corpus.
df = last_10000_rows.copy()

In [None]:
new_folder_path = "/content/drive/MyDrive/edgar_corpus_processed"
os.makedirs(new_folder_path, exist_ok=True)  # Ensure directory exists

# Collapse every run of whitespace into a single space and trim both ends.
# The vectorised .str form gives the same result as " ".join(x.split()) for
# strings, but leaves missing values (NaN) untouched instead of raising
# AttributeError on them.
df['input'] = df['input'].str.split().str.join(" ")

# Display final result
print(df.head())

                                                   input
22891  INDEX TO CONSOLIDATED FINANCIAL STATEMENTS To ...
22892  To the Board of Directors and Shareowners of A...
22893  . To the Board of Directors and Stockholders o...
22894  . Report of Independent Registered Public Acco...
22895  . MGE Energy Management's Report on Internal C...


In [None]:
# Save the result to a new CSV file
# (index=False: the original row labels are not written to the file)
new_csv_path = os.path.join(new_folder_path, "grouped_words.csv")
df.to_csv(new_csv_path, index=False)

print(f"The grouped words have been saved to {new_csv_path}")
# Preview what was written
df.head()

The grouped words have been saved to /content/drive/MyDrive/edgar_corpus_processed/grouped_words.csv


Unnamed: 0,input
22891,INDEX TO CONSOLIDATED FINANCIAL STATEMENTS To ...
22892,To the Board of Directors and Shareowners of A...
22893,. To the Board of Directors and Stockholders o...
22894,. Report of Independent Registered Public Acco...
22895,. MGE Energy Management's Report on Internal C...


# We want more data, because the first attempt contained a lot of duplicate data

In [None]:
# NOTE(review): the recorded output shows a 'cleaned_input' column with row
# labels 0-4, which does not match `df` as left by the preceding cells (an
# 'input' column starting at label 22891) — presumably this cell was run after
# reloading the CSV; confirm the execution order.
print(df.head())

                                       cleaned_input
0  FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Le...
1  FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Co...
2  Index to Consolidated Financial Statements Rep...
3  FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA Fi...
4  FINANCIAL STATEMENTS Our financial statements,...


Load the data and clean it the same way:

In [None]:
import os
import re
import pandas as pd
import csv

# Load the dataset
file_path = "/content/drive/MyDrive/edgar_corpus_filtered/clean_edgar_corpus_from_arrow.csv"
df = pd.read_csv(file_path)

# Select the last 15,000 rows
last_15000_rows = df.tail(15000)

# Take the 5,000 rows that come directly BEFORE the last 10,000.
# The first pass already used df.tail(10000); the previous slice
# (.iloc[5000:10000] of the last 15,000) lies entirely inside those 10,000 rows
# and therefore only produced duplicates of data that was already used.
additional_5000_rows = last_15000_rows.iloc[:5000].copy()

# Cleaning process
# Remove unwanted text patterns
texts_to_remove = [
    "FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA",
    "item8",
    "ACCOUNTING FIRM",
    "REPORT OF INDEPENDENT REGISTERED PUBLIC"
]

# Apply the removals cumulatively. Re-reading 'cleaned_input' on every iteration
# (as the loop did before) discards all but the last removal. The entries are
# literal text, hence regex=False.
stripped_text = additional_5000_rows["cleaned_input"]
for text in texts_to_remove:
    stripped_text = stripped_text.str.replace(text, "", regex=False)
additional_5000_rows["input"] = stripped_text

# Function to clean text
def clean_text(text, min_line_chars=2000):
    """Strip boilerplate and filler characters from one filing's text.

    This definition replaces the function of the same name defined in an
    earlier cell of this notebook.

    Args:
        text: Text of one filing; None/NaN is accepted.
        min_line_chars: Minimum length (after stripping) a line must have to be
            kept. Defaults to 2000, the threshold originally hard-coded here.
            For text without newlines this acts as a minimum document length.

    Returns:
        The cleaned text with surviving lines joined by single spaces, or ""
        when the input is missing or no line reaches ``min_line_chars``.
    """
    if pd.isna(text):
        return ""

    # Matching is exact and case-sensitive (plain str.replace).
    phrases_to_remove = [
        "Financial Statements and Supplementary Data",
        "Report of Independent Public Accountants",
        "Report of Independent Accountants",
        "Notes to Consolidated Financial Statements",
        "See accompanying Notes to Consolidated Financial Statements",
        "The information required by this item is incorporated herein by reference",
        "Filed herein by the Registrant",
        "The financial statements of Registrant are attached hereto as Exhibit.",
        "Reference is made to the Registrant's Annual Report to Shareholders.",
        "Filed herein by the Registrant with the Commission pursuant to Regulation",
        "All other schedules are omitted because they are not applicable or the required information is shown in the Consolidated Financial Statements or the Notes thereto.",
        "See accompanying Notes to Consolidated Financial Statements.",
        "Amounts are summarized as follows:"
    ]

    # Remove exact phrases, longest first. Several phrases contain shorter ones
    # from the same list; removing the short one first would leave fragments
    # such as "See accompanying ." behind and the long phrase could never match.
    for phrase in sorted(phrases_to_remove, key=len, reverse=True):
        text = text.replace(phrase, "")

    # Remove occurrences of "Item X." where X is any number
    text = re.sub(r'\bItem\s+\d+\.\s*', '', text, flags=re.IGNORECASE)

    # Remove multiple dots (e.g., ..., . . . . .). In the spaced form a dot
    # directly followed by a digit is kept, so a decimal such as ".52" after a
    # dot leader does not lose its decimal point.
    text = re.sub(r'\.{2,}|\.(?:[ \t]+\.(?!\d))+', ' ', text)

    # Remove multiple dashes (e.g., --, ----)
    text = re.sub(r'[-–]{2,}', ' ', text)

    # Remove lines full of dots, dashes, or mixed symbols
    text = re.sub(r'^[\s\.\-=_]+$', '', text, flags=re.MULTILINE)

    # Remove "- The" or similar unnecessary leading dashes with words
    text = re.sub(r'^\s*[-–]+ The', 'The', text, flags=re.MULTILINE)

    # Drop lines shorter than min_line_chars and join the rest with spaces.
    # The joined text contains no newlines, so no blank-line clean-up is needed.
    stripped_lines = (line.strip() for line in text.split("\n"))
    text = " ".join(line for line in stripped_lines if len(line) >= min_line_chars)

    return text.strip()

# Run the cleaner over the 'input' text, store the result as 'cleaned_input',
# and drop the intermediate 'input' column
cleaned_rows = additional_5000_rows.assign(
    cleaned_input=additional_5000_rows["input"].apply(clean_text)
).drop(columns=["input"], errors="ignore")

# Keep only the rows that still contain text after cleaning
keep_mask = cleaned_rows["cleaned_input"].notna() & cleaned_rows["cleaned_input"].ne('')
additional_5000_rows = cleaned_rows.loc[keep_mask]

# Output folder for processed data (created on first use)
new_folder_path = "/content/drive/MyDrive/edgar_corpus_processed"
os.makedirs(new_folder_path, exist_ok=True)

# Write the cleaned rows as a fully quoted UTF-8 CSV, without the index
new_csv_path = os.path.join(new_folder_path, "extra_5000_cleaned_fixed.csv")
additional_5000_rows.to_csv(new_csv_path, index=False, encoding="utf-8", quoting=csv.QUOTE_ALL)

print(f"Processing complete! Saved {len(additional_5000_rows)} rows to '{new_csv_path}'")

Processing complete! Saved 5000 rows to '/content/drive/MyDrive/edgar_corpus_processed/extra_5000_cleaned_fixed.csv'
