In [2]:
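# One-time setup: uncomment to install spaCy into the kernel's environment, then restart the kernel.
# %pip install spacy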

Collecting spacy
  Downloading spacy-3.7.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp39-cp39-macosx_10_9_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp39-cp39-macosx_10_9_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp39-cp39-macosx_10_9_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp39-cp39-macosx_10_9_x86_64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from sklearn.manifold import MDS

# Pipeline: read every text file in `dataset_folder`, count the named entities
# of the selected types in each one with spaCy, project the per-document count
# vectors to 2-D with Multidimensional Scaling, then print and save the result.

# Load the small English spaCy pipeline (provides the NER used below)
nlp = spacy.load("en_core_web_sm")

# Path to the dataset folder
dataset_folder = 'dataset'

# Entity types of interest (these become the columns of the count matrix)
entity_types = ['PERSON', 'GPE', 'DATE', 'ORG', 'CARDINAL']

# (filename, text) pairs for every document in the dataset folder.
# os.listdir() returns entries in arbitrary order, so sort them: the row order
# feeds the seeded MDS below, and a stable order keeps the output reproducible.
all_text_data = []
for filename in sorted(os.listdir(dataset_folder)):
    file_path = os.path.join(dataset_folder, filename)
    # Skip sub-directories and hidden files (e.g. .DS_Store); they are not
    # documents and would fail to open or decode as UTF-8 text.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        all_text_data.append((filename, file.read()))  # Store filename along with text

# Fail early with a clear message instead of an opaque error inside MDS
if not all_text_data:
    raise FileNotFoundError(f"No readable documents found in {dataset_folder!r}")

# Count the entities of interest per document, one row of counts per file
entity_rows = []
for filename, text in all_text_data:
    doc = nlp(text)
    label_counts = Counter(ent.label_ for ent in doc.ents)
    # A Counter returns 0 for labels that never occurred, so every row has
    # one value per entry in entity_types, in that order.
    entity_rows.append([label_counts[etype] for etype in entity_types])

# Build the document x entity-type count matrix in one step
entity_matrix = pd.DataFrame(
    entity_rows,
    index=[fname for fname, _ in all_text_data],
    columns=entity_types,
)

# Apply Multidimensional Scaling (fixed seed for a repeatable embedding).
# NOTE(review): the matrix holds raw counts, which scale with document length;
# in the recorded output NSA_docs.txt lands far from every other document shown.
# Consider normalising rows (or excluding aggregate files) - confirm intent.
mds = MDS(n_components=2, random_state=42)
mds_coords = mds.fit_transform(entity_matrix.values)

# Create a DataFrame for the MDS results, indexed by document filename
mds_results = pd.DataFrame(mds_coords, index=entity_matrix.index, columns=['MDS1', 'MDS2'])
print(mds_results)

# Save the MDS results to a CSV file
output_csv = "mds_results.csv"
mds_results.to_csv(output_csv)
print(f"MDS results saved to {output_csv}")


                      MDS1       MDS2
CIA_01            9.850102  -0.667158
CIA_02            6.570594   4.167886
CIA_03            7.158469  -1.316559
CIA_04            9.430058  -0.146963
CIA_05           12.158756  -0.418011
...                    ...        ...
NSA_22            8.143254  -1.916215
NSA_docs.txt   -109.672427 -54.611715
USCBP_01          9.556821   6.860964
USCBP_02         11.384500   1.291512
USCBP_docs.txt    1.205559   8.379009

[116 rows x 2 columns]
MDS results saved to mds_results.csv


In [3]:
import pandas as pd

# Turn the document-name index into a regular column, then write the table
# to JSON as a list with one object per document.
mds_records = mds_results.reset_index()
mds_records.to_json('mds_data.json', orient='records')