In [2]:
# DOCUENTS AS WHOLE PLANS
import os

# Function to read all text files in a folder


def read_text_files(folder_path):
    texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(
                os.path.join(folder_path, filename), "r", encoding="utf-8"
            ) as file:
                texts.append(file.read())
    return texts


# Example usage
folder_path = "files/txt"
texts = read_text_files(folder_path)

In [23]:
# DOCUENTS AS INDIVIDUAL POLICIES

from postgres import Postgres

pg = Postgres()

results = pg.query(
    """
 SELECT 
	cmetadata->'text' as text
    FROM langchain_pg_embedding 
	WHERE cmetadata @> '{"chunker": "sherpa"}' 
    AND LENGTH(cmetadata->>'sections'::TEXT) > 20
	LIMIT 5000;
                   
	"""
)
texts = [r[0] for r in results]

In [25]:
import spacy

import numpy as np

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")


def remove_place_names_and_stop_words(text):

    custom_phrases = [
        "Abberley",
        "Abbots Langley",
        "Aberford",
        "Ab Kettleby Parish",
        "Acle",
        "Acton",
        "Addingham",
        "Adel",
        "Alcester",
        "Alfold",
        "Cleobury Mortimer",
        "Corby Glen",
        "Cossington",
        "Elmswell",
        "Fulbourn",
        "Nether Whitacre",
        "Sawtry",
        "Strensall & Towthorpe",
        "The Three Parishes",
        "Totnes",
        "West Wittering",
        "Whaley Bridge",
        "Winchfield",
        "Winkfield",
    ]

    doc = nlp(text)
    # Remove place names and stop words
    filtered_tokens = [
        token.text for token in doc if token.ent_type_ != "GPE" and not token.is_stop
    ]

    # Reconstruct the filtered text
    filtered_text = " ".join(filtered_tokens)

    # Remove custom stop phrases (case insensitive)
    filtered_text_lower = filtered_text.lower()
    for phrase in custom_phrases:
        filtered_text_lower = filtered_text_lower.replace(phrase.lower(), "")

    # Split and join to remove extra spaces
    filtered_text = " ".join(filtered_text_lower.split())

    return filtered_text


processed_texts = [remove_place_names_and_stop_words(doc) for doc in texts]

In [26]:
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the text data
vectorizer = CountVectorizer(max_df=0.10, min_df=4, stop_words='english')
dtm = vectorizer.fit_transform(processed_texts)

# Apply LDA
n_topics = 20  # Number of topics
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(dtm)

# Get the topics and words


def display_topics(model, feature_names, no_top_words):
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        top_words = [
            feature_names[i] for i in topic.argsort()[: -no_top_words - 1: -1]
        ]
        topics[f"Topic {topic_idx}"] = top_words
    return topics


no_top_words = 10
topics = display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

# Display the topics
for topic, words in topics.items():
    print(f"{topic}: {', '.join(words)}")

Topic 0: lane, street, houses, church, north, photo, strensall, station, brick, parking
Topic 1: parking, energy, amenity, scale, street, car, charging, impact, residential, electric
Topic 2: town, transport, strategy, core, safe, peak, routes, way, network, improve
Topic 3: employment, services, business, businesses, uses, opportunities, retail, health, town, economy
Topic 4: environmental, water, walsham, flood, le, willows, risk, assessment, flooding, change
Topic 5: school, canal, figure, primary, street, footpath, access, sinc, footpaths, network
Topic 6: biodiversity, wildlife, trees, woodland, species, habitats, planting, nature, native, hedgerows
Topic 7: uk, www, https, gov, np, org, authority, huntingdonshire, applications, pdf
Topic 8: sports, walking, transport, club, leisure, facility, recreation, bus, cycling, hall
Topic 9: century, west, house, ii, brick, church, north, east, roof, windows
Topic 10: boundary, detached, significance, century, defined, building, photo, gen