In [1]:
import pandas as pd
import os

# Relative location of the raw CIK / accession-number listing
csv_path = '../data/submisissions_cik_accn_list.csv'

# Load the entire listing into a DataFrame
print("Reading CSV file...")
df = pd.read_csv(filepath_or_buffer=csv_path)

# Report dimensions and column labels, then preview the first five rows
print(
    f"Original data shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
print(df.head(n=5))


Reading CSV file...
Original data shape: (1706075, 2)
Columns: ['cik', 'accession_number']

First 5 rows:
    cik      accession_number
0  1800  0000000000-05-060011
1  1800  0000000000-06-001379
2  1800  0000000000-07-023730
3  1800  0000000000-07-037625
4  1800  0000000000-08-022254


In [2]:
# Build df_cleaned: the loaded frame with every CIK rewritten as 'CIK' + 10-digit zero-padded id
print("Cleaning data...")

# Columns are picked by position: the first is taken to be the CIK and the
# second the accession number (compare with the column list printed on load).
cik_column, accn_column = df.columns[0], df.columns[1]

print(f"CIK column: {cik_column}")
print(f"Accession column: {accn_column}")

# Work on an independent copy so the frame loaded from disk stays untouched
df_cleaned = df.copy()
padded_cik = df_cleaned[cik_column].astype(str).str.zfill(10)
df_cleaned[cik_column] = 'CIK' + padded_cik

# Show the first ten rewritten rows and confirm the shape is unchanged
print("\nCleaned data sample:")
print(df_cleaned.head(n=10))
print(f"\nCleaned data shape: {df_cleaned.shape}")


Cleaning data...
CIK column: cik
Accession column: accession_number

Cleaned data sample:
             cik      accession_number
0  CIK0000001800  0000000000-05-060011
1  CIK0000001800  0000000000-06-001379
2  CIK0000001800  0000000000-07-023730
3  CIK0000001800  0000000000-07-037625
4  CIK0000001800  0000000000-08-022254
5  CIK0000001800  0000000000-08-024745
6  CIK0000001800  0000000000-10-022377
7  CIK0000001800  0000000000-10-035176
8  CIK0000001800  0000000000-10-042728
9  CIK0000001800  0000000000-10-045851

Cleaned data shape: (1706075, 2)


In [3]:
# Write the prefixed frame to disk as CSV, leaving out the row index
output_file = "../data/submisissions_cik_accn_list_cleaned.csv"
df_cleaned.to_csv(path_or_buf=output_file, index=False)

# Confirm the destination and how many rows were written
print(
    f"Cleaned data exported to: {output_file}",
    f"Exported {len(df_cleaned):,} records",
    sep="\n",
)


Cleaned data exported to: ../data/submisissions_cik_accn_list_cleaned.csv
Exported 1,706,075 records


In [20]:
import gc

# Export the rows of a single CIK to their own CSV file.
# The file name and every message are derived from target_cik, so what is
# printed always names the CIK that was actually filtered (previously the
# labels said "CIK 1800" while the filter selected CIK0000005272).
target_cik = 'CIK0000005272'

single_cik_data = df_cleaned[df_cleaned[cik_column] == target_cik].copy()

single_cik_output_file = f"../data/test_{target_cik}.csv"
# Legacy name kept as an alias in case another cell still refers to it
output_file_cik1800 = single_cik_output_file

single_cik_data.to_csv(single_cik_output_file, index=False)
print(f"{target_cik} data exported to: {single_cik_output_file}")
print(f"Exported {len(single_cik_data):,} records for {target_cik}")
print(f"\nSample {target_cik} data:")
print(single_cik_data.head())

# Drop the temporary subset; the trailing semicolon keeps the integer returned
# by gc.collect() from being displayed as the cell's result.
del single_cik_data
gc.collect();


CIK 1800 data exported to: ../data/test_CIK0000005272.csv
Exported 7,103 records for CIK 1800

Sample CIK 1800 data:
                 cik      accession_number
23993  CIK0000005272  0000000000-06-017230
23994  CIK0000005272  0000000000-07-058612
23995  CIK0000005272  0000000000-07-061228
23996  CIK0000005272  0000000000-08-004453
23997  CIK0000005272  0000000000-08-030919


0

In [5]:
# List every distinct CIK in df_cleaned, then the number of rows per CIK.
# The column is addressed through cik_column (set in the cleaning cell) rather
# than a hard-coded 'cik', so this cell follows the same column choice as the
# rest of the notebook.
print("All unique CIK values in the dataset:")
print("=" * 50)

unique_ciks = df_cleaned[cik_column].unique()
print(f"Total unique CIKs: {len(unique_ciks):,}")
print()

# Numbered listing in sorted order
for i, cik in enumerate(sorted(unique_ciks), 1):
    print(f"{i:3d}. {cik}")

print()
print("CIK counts:")
# value_counts() orders by frequency, most common first
cik_counts = df_cleaned[cik_column].value_counts()
print(cik_counts)


All unique CIK values in the dataset:
Total unique CIKs: 500

  1. CIK0000001800
  2. CIK0000002488
  3. CIK0000002969
  4. CIK0000004127
  5. CIK0000004281
  6. CIK0000004904
  7. CIK0000004962
  8. CIK0000004977
  9. CIK0000005272
 10. CIK0000006281
 11. CIK0000006951
 12. CIK0000007084
 13. CIK0000008670
 14. CIK0000008818
 15. CIK0000009389
 16. CIK0000010456
 17. CIK0000010795
 18. CIK0000011544
 19. CIK0000012927
 20. CIK0000014272
 21. CIK0000014693
 22. CIK0000016732
 23. CIK0000016918
 24. CIK0000018230
 25. CIK0000019617
 26. CIK0000020286
 27. CIK0000021076
 28. CIK0000021344
 29. CIK0000021665
 30. CIK0000023217
 31. CIK0000024545
 32. CIK0000024741
 33. CIK0000026172
 34. CIK0000027419
 35. CIK0000027904
 36. CIK0000029534
 37. CIK0000029905
 38. CIK0000029989
 39. CIK0000031462
 40. CIK0000031791
 41. CIK0000032604
 42. CIK0000033185
 43. CIK0000033213
 44. CIK0000034088
 45. CIK0000034903
 46. CIK0000035527
 47. CIK0000036104
 48. CIK0000036270
 49. CIK0000037996
 50. CI

In [21]:
# Write one CSV per CIK into ../data/submissionCIK as test_<CIK>.csv.
# Relies on df_cleaned and cik_column from the cleaning cell and on the
# `os` import in the notebook's first cell.

# exclude_ciks is optional input from an earlier cell; when it has not been
# defined in this kernel, nothing is excluded.
try:
    exclude_ciks
except NameError:
    exclude_ciks = set()

# Output directory (created on demand)
submission_cik_dir = '../data/submissionCIK'
os.makedirs(submission_cik_dir, exist_ok=True)

# CIKs still to be written, in sorted order.
# .difference() accepts any iterable, so exclude_ciks may be a set or a list.
all_ciks = set(df_cleaned[cik_column].unique())
remaining_ciks = sorted(all_ciks.difference(exclude_ciks))
print(f"To export: {len(remaining_ciks):,} CIKs -> {submission_cik_dir}")

# Split the frame in a single groupby pass instead of re-scanning every row
# with a boolean mask once per CIK. Groups come back in sorted key order and
# are never empty, so no emptiness check is needed.
rows_to_export = df_cleaned[df_cleaned[cik_column].isin(remaining_ciks)]

exported = 0
for cik, cik_df in rows_to_export.groupby(cik_column, sort=True):
    out_path = os.path.join(submission_cik_dir, f"test_{cik}.csv")
    cik_df.to_csv(out_path, index=False)
    exported += 1
    # Progress heartbeat every 25 files
    if exported % 25 == 0:
        print(f"Exported {exported} files so far...")

print(f"Done. Exported {exported} files to: {submission_cik_dir}")


To export: 500 CIKs -> ../data/submissionCIK
Exported 25 files so far...
Exported 50 files so far...
Exported 75 files so far...
Exported 100 files so far...
Exported 125 files so far...
Exported 150 files so far...
Exported 175 files so far...
Exported 200 files so far...
Exported 225 files so far...
Exported 250 files so far...
Exported 275 files so far...
Exported 300 files so far...
Exported 325 files so far...
Exported 350 files so far...
Exported 375 files so far...
Exported 400 files so far...
Exported 425 files so far...
Exported 450 files so far...
Exported 475 files so far...
Exported 500 files so far...
Done. Exported 500 files to: ../data/submissionCIK
