In [4]:
# import
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter



In [6]:
# load the document and split it into chunks
loader = TextLoader("ccc.txt")
documents = loader.load()

# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)

# query it
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)

# print results
print(docs[0].page_content)

Created a chunk of size 1117, which is longer than the specified 1000
Created a chunk of size 1117, which is longer than the specified 1000


Justice Department Secures Agreement with Climate Nonprofit to Resolve Claims of Employment Discrimination. The Justice Department announced today that it secured a settlement agreement with Second Nature, a non-profit organization based in Massachusetts. The agreement resolves the departmentâ€™s determination that Second Nature violated the Immigration and Nationality Act (INA) by posting discriminatory job advertisements that deterred non-U.S. citizens from applying for open positions.


## NER 1

In [14]:
import datetime
from spacy.matcher import Matcher
from spacy.tokens import Doc

# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import spacy

# # Sample Data (Replace with your actual data)
# data = [
#     {"id": 1, "summary": "Company X fined for environmental violations", "date": datetime(2023, 1, 1)},
#     {"id": 2, "summary": "New regulations proposed for anti-bribery compliance by government Y", "date": datetime(2023, 2, 15)},
#     {"id": 3, "summary": "Allegations of fraud surface against CEO John Doe of company Z", "date": datetime(2023, 3, 10)},
#     # ... more data entries
# ]

# # Convert data to Pandas DataFrame
# df = pd.DataFrame(data)



In [26]:
excel_file_path = 'summary1.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(excel_file_path)


# Define time period (replace with your desired start and end dates)
# start_date = datetime(2023, 1, 1)
# end_date = datetime(2023, 6, 30)

# # Filter data for the time period
# df_filtered = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
df_filtered['summary'] = df['summary']


In [27]:
df_filtered

0           SEC Charges Lordstown Motors with Misleading I...
1           SEC Investor Advisory Committee to Discuss Pro...
2           Federal Court Orders Unregistered Pool Operato...
3           Federal Court Orders California-Based Precious...
4           CFTC Approves Final Rules on Swap Confirmation...
                                  ...                        
231         Readout of Justice Department’s Civil Rights D...
232         Elara Caring Agrees to Pay $4.2 Million to Set...
233         Scheme to Transfer Money to Iran Results in Gu...
summary     0      SEC Charges Lordstown Motors with Misle...
summary1    0      SEC Charges Lordstown Motors with Misle...
Name: summary, Length: 236, dtype: object

In [37]:



def preprocess_text(text):
  """
  Preprocess text with spaCy for NER (replace with your desired cleaning steps)
  """
  nlp = spacy.load("en_core_web_sm")  # Load spaCy model
  doc = nlp(text)

  # Define matcher for named entities (organizations, people)
  matcher = Matcher(nlp.vocab)
  org_pattern = [{'POS': 'PROPN'}, {'LEMMA': 'inc'}, {'LEMMA': 'ltd'}, {'LEMMA': 'llc'}]
  person_pattern = [{'POS': 'PROPN'}]
  matcher.add("ORG", org_pattern)  # Remove unnecessary None argument
  matcher.add("PERSON", person_pattern)  # Remove unnecessary None argument

  doc.ents = [ent for ent in doc.ents if ent.label_ in ("ORG", "PERSON")]  # Extract relevant entities
  cleaned_text = doc.text_.lower()  # Lowercase text

  return cleaned_text, [ent.text for ent in doc.ents]  # Return cleaned text and extracted entities


# Apply preprocessing to summaries and extract entities
# df["summary"], df["entities"] = zip(*df["summary"].apply(preprocess_text))
# Apply preprocessing with a temporary DataFrame
# temp_df = df[["summary"]].apply(preprocess_text, axis=1, result_type='expand')
temp_df = df[["summary"]].apply(lambda x: preprocess_text(x.iloc[0]), axis=1, result_type='expand')


# Assign results back to original DataFrame
df["summary"] = temp_df[0]
df["entities"] = temp_df[1]

# Define number of topics
num_topics = 5  # Adjust number of topics as needed

# Vectorize Text with TF-IDF
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
text_vectorized = vectorizer.fit_transform(df["summary"])

# Train the LDA model
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(text_vectorized)

# Get topics and vocabulary
topic_words = []
for topic_idx, topic in enumerate(lda.components_):
  topic_words.append([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])  # Top 10 words for each topic

# Analyze Topics
print(f"Topics identified:")
for i, topic in enumerate(topic_words):
  print(f"Topic {i+1}: {', '.join(topic)}")

# Print first N documents associated with each topic (example)
for topic_idx in range(num_topics):
  topic_docs = df.loc[df['summary'].apply(lambda x: lda.transform(vectorizer.transform([x]))[:, topic_idx] > 0.2).idxmax(), :]
  print(f"\nDocuments related to Topic {topic_idx+1}:")
  for i, row in topic_docs.iterrows():
    print(f"\t- Document ID: {row['id']}, Summary: {row['summary']}")

# Print extracted entities (example)
print("\nExtracted Entities:")
entity_counts = {}
for entities in df["entities"]:
  for entity in entities:
    entity_counts[entity] = entity_counts.get(entity, 0) + 1
sorted_entities = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)
print(f"\tTop 10 Most Frequent Entities:")
for i, (entity, count) in enumerate(sorted_entities[:10]):
  print(f"\t\t- {entity}: {count} mentions")


ValueError: [E178] Each pattern should be a list of dicts, but got: {'POS': 'PROPN'}. Maybe you accidentally passed a single pattern to Matcher.add instead of a list of patterns? If you only want to add one pattern, make sure to wrap it in a list. For example: `matcher.add('ORG', [pattern])`

## NER Spacy

In [10]:
import pandas as pd 
import spacy 
import requests 
from bs4 import BeautifulSoup
nlp = spacy.load("en_core_web_sm")
pd.set_option("display.max_rows", 200)


In [45]:
content = "Trinamool Congress leader Mahua Moitra has moved the Supreme Court against her expulsion from the Lok Sabha over the cash-for-query allegations against her. Moitra was ousted from the Parliament last week after the Ethics Committee of the Lok Sabha found her guilty of jeopardising national security by sharing her parliamentary portal's login credentials with businessman Darshan Hiranandani."
content = "SEC Charges Lordstown Motors with Misleading Investors about Company’s Flagship Electric Vehicle. The Securities and Exchange Commission today charged Lordstown Motors Corp. with misleading investors about the sales prospects of Lordstown’s flagship electric pickup truck, the Endurance. Lordstown, which filed for bankruptcy in 2023, went public by…"

for ii in range(10):
	# print(ii)
	doc = nlp(df['summary'][ii])

	for ent in doc.ents:
		print(ent.text, ent.start_char, ent.end_char, ent.label_)
	print('----------------------------------------------------------------')


SEC Charges Lordstown Motors 0 28 ORG
Misleading Investors 34 54 ORG
Company’s Flagship Electric Vehicle 61 96 ORG
The Securities and Exchange Commission 99 137 ORG
today 138 143 DATE
Lordstown Motors Corp. 152 174 ORG
Lordstown 230 239 GPE
Lordstown 289 298 GPE
2023 330 334 DATE
----------------------------------------------------------------
SEC Investor Advisory Committee 0 31 ORG
The Securities and Exchange Commission’s 150 190 ORG
Investor Advisory Committee 191 218 ORG
March 7 249 256 DATE
10 a.m. ET 260 270 TIME
the SEC Headquarters 274 294 FAC
Washington 298 308 GPE
D.C. 310 314 GPE
SEC 355 358 ORG
two 393 396 CARDINAL
----------------------------------------------------------------
Federal Court Orders Unregistered Pool Operator 0 47 ORG
Over $11 Million 73 89 MONEY
Forex Fraud 94 105 ORG
----------------------------------------------------------------
Federal Court Orders 0 20 ORG
Salesperson 75 86 PERSON
Over $56 Million 94 110 MONEY
-----------------------------------------

### Topic Modeling (Over Time):

Topic modeling is a technique for identifying latent thematic structures within a collection of documents. Here's how to use it to analyze trends in your article data over time:
Preprocess the Text: Clean and prepare your article summaries as mentioned earlier (date formatting, text cleaning).

Vectorize the Text: Convert the summaries into a numerical representation that a machine learning model can understand. This can be done using techniques like TF-IDF (Term Frequency-Inverse Document Frequency).

Run Topic Modeling: Apply a topic modeling algorithm like Latent Dirichlet Allocation (LDA) to the vectorized summaries. This will identify a set of topics and the probability of each word belonging to those topics.

Analyze Topics Over Time: Track how the prevalence of these topics changes over the specified time period. This can be done by looking at the distribution of topics within different timeframes (e.g., months, quarters). You can calculate the proportion of articles belonging to each topic for each timeframe and visualize the trends.

Tools like LatentDirichletAllocation in scikit-learn (Python) or stm (R) can be used for topic modeling.