In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("Dizex/FoodBaseBERT")
model = AutoModelForTokenClassification.from_pretrained("Dizex/FoodBaseBERT")
pipe = pipeline("ner", model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
example = "Today's meal: Fresh olive poke bowl topped with chia seeds. Very delicious!"

ner_entity_results = pipe(example)
print(ner_entity_results)

Device set to use mps:0


[{'entity': 'B-FOOD', 'score': np.float32(0.6692053), 'index': 6, 'word': 'Fresh', 'start': 14, 'end': 19}, {'entity': 'I-FOOD', 'score': np.float32(0.5334641), 'index': 7, 'word': 'olive', 'start': 20, 'end': 25}, {'entity': 'I-FOOD', 'score': np.float32(0.9861605), 'index': 8, 'word': 'p', 'start': 26, 'end': 27}, {'entity': 'I-FOOD', 'score': np.float32(0.9927375), 'index': 9, 'word': '##oke', 'start': 27, 'end': 30}, {'entity': 'I-FOOD', 'score': np.float32(0.97065824), 'index': 10, 'word': 'bowl', 'start': 31, 'end': 35}, {'entity': 'B-FOOD', 'score': np.float32(0.9957469), 'index': 13, 'word': 'ch', 'start': 48, 'end': 50}, {'entity': 'B-FOOD', 'score': np.float32(0.97199684), 'index': 14, 'word': '##ia', 'start': 50, 'end': 52}, {'entity': 'I-FOOD', 'score': np.float32(0.99487525), 'index': 15, 'word': 'seeds', 'start': 53, 'end': 58}]


In [5]:
foods = []
current_food = ""

for ent in ner_entity_results:
    word = ent["word"]
    # Handle wordpieces (like ##oke -> oke)
    if word.startswith("##"):
        word = word[2:]
    
    if ent["entity"].startswith("B-FOOD"):
        if current_food:
            foods.append(current_food.strip())
        current_food = word + " "
    elif ent["entity"].startswith("I-FOOD"):
        current_food += word + " "

# Add last entity
if current_food:
    foods.append(current_food.strip())

# Deduplicate and clean
foods = [f.replace(" ##", "").replace("  ", " ").strip() for f in foods]
foods = list(dict.fromkeys(foods))  # preserve order

# Store in a dictionary
food_dict = {"foods": foods}
print(food_dict)

{'foods': ['Fresh olive p oke bowl', 'ch', 'ia seeds']}


In [3]:
import spacy
from dateparser import parse

nlp = spacy.load("en_core_web_sm")

def extract_entities(entry):
    doc = nlp(entry)
    locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
    dates = [ent.text for ent in doc.ents if ent.label_ == "DATE"]
    parsed_dates = [parse(d) for d in dates if parse(d)]
    return {"locations": locations, "dates": parsed_dates}

print(extract_entities("Flew to Berlin on June 3rd"))
print(extract_entities("Went to MD, then had tacos"))
# -> {'locations': ['Berlin'], 'dates': [datetime(2024, 6, 3, 0, 0)]}


{'locations': ['Berlin'], 'dates': [datetime.datetime(2025, 6, 3, 0, 0)]}
{'locations': ['MD'], 'dates': []}


In [9]:
from transformers import AutoTokenizer, AutoModel, pipeline
import torch
import chromadb
import pandas as pd
import numpy as np

df = pd.read_excel('data/cleaned/daily_sentences_cleaned.xlsx')

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def get_embeddings(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
        masked_embeddings = embeddings * mask
        summed = torch.sum(masked_embeddings, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts
        return mean_pooled.cpu().numpy()

embeddings = get_embeddings(df['Sentence'].tolist())

client = chromadb.Client()
collection = client.create_collection(name="journal2")

collection.add(
    embeddings=embeddings.tolist(),
    documents=df['Sentence'].tolist(),
    metadatas=[{'date': str(d)} for d in df['Date']],
    ids=[str(i) for i in range(len(df))]
)

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def ask(question, k=5):
    q_emb = get_embeddings([question])
    results = collection.query(query_embeddings=q_emb.tolist(), n_results=k)
    
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    
    # Combine context with dates
    combined = "\n".join([f"{m['date']}: {d}" for d, m in zip(docs, metas)])
    
    # Summarize retrieved context
    summary = summarizer(
        combined,
        max_length=130,
        min_length=30,
        do_sample=False
    )[0]['summary_text']
    
    return summary


Device set to use mps:0


In [13]:
ask("What's my favorite food?")

'Made mongolian beef for lunch, then went into office. Got free taco bell. Then made chicken and dumplings for dinner. Went to gym then got popeyes wings. Then watched bad batch.'

In [7]:
import spacy

# Load the pre-trained English model (includes NER)
nlp = spacy.load("en_core_web_lg")

# Example text
text = '''I ordered a new MacBook Pro and a pair of Bose QuietComfort headphones last week.  
The Samsung Galaxy S24 Ultra has an amazing camera.  
We replaced our old Whirlpool dishwasher with a KitchenAid one.  
He bought LEGO Star Wars sets for his nephew.  
The Apple Watch Ultra 2 syncs perfectly with my iPhone.  
I used a Canon EOS R8 for the shoot and edited it on a Dell XPS 15.  
She picked up a bottle of Coca-Cola and a pack of Oreos from the store.  
I played the game on my Nintendo Switch and later tried it on PlayStation 5.  
My friend swears by the Dyson Airwrap for her hair.  
The Tesla Model Y’s touchscreen feels smoother than before.'''

# Process the text
doc = nlp(text)

# Extract entities labeled as PRODUCT
products = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]

for ent in doc.ents:
    print(ent.text, ent.label_)

print(products)

MacBook Pro ORG
Bose QuietComfort PRODUCT
last week DATE
Samsung ORG
Whirlpool PRODUCT
KitchenAid ORG
LEGO Star Wars PRODUCT
The Apple Watch Ultra 2 ORG
Dell ORG
XPS 15 PRODUCT
Coca-Cola ORG
Oreos PRODUCT
Nintendo Switch ORG
PlayStation 5 PRODUCT
the Dyson Airwrap PERSON
Tesla Model ORG
['Bose QuietComfort', 'Whirlpool', 'LEGO Star Wars', 'XPS 15', 'Oreos', 'PlayStation 5']


In [8]:
text = 'Got free taco bell. Then made chicken and dumplings for dinner.'
# Process the text
doc = nlp(text)

# Extract entities labeled as PRODUCT
products = [ent.text for ent in doc.ents if ent.label_ == "PRODUCT"]

for ent in doc.ents:
    print(ent.text, ent.label_)

print(products)

[]


In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import pandas as pd

df = pd.read_excel('data/cleaned/daily_sentences_cleaned.xlsx')

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# Use aggregation_strategy="simple" to merge tokens into full entities
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to extract entities
def extract_entities(text):
    if not isinstance(text, str) or text.strip() == "":
        return {"locations": [], "dates": []}
    entities = nlp(text)
    locations = [e["word"] for e in entities if e["entity_group"] in ["LOC", "GPE"]]
    dates = [e["word"] for e in entities if e["entity_group"] == "DATE"]
    return {"locations": locations, "dates": dates}

df = df["Sentence"].apply(extract_entities)
df = pd.concat([df, df.apply(pd.Series)], axis=1)

print(df)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


                                     Sentence  locations dates
0              {'locations': [], 'dates': []}         []    []
1              {'locations': [], 'dates': []}         []    []
2              {'locations': [], 'dates': []}         []    []
3              {'locations': [], 'dates': []}         []    []
4              {'locations': [], 'dates': []}         []    []
...                                       ...        ...   ...
1294  {'locations': ['England'], 'dates': []}  [England]    []
1295           {'locations': [], 'dates': []}         []    []
1296           {'locations': [], 'dates': []}         []    []
1297           {'locations': [], 'dates': []}         []    []
1298           {'locations': [], 'dates': []}         []    []

[1299 rows x 3 columns]


In [9]:
all_locations = set(
    loc
    for sublist in df["locations"]
    if isinstance(sublist, list)
    for loc in sublist
)

# Convert back to list (if you want)
unique_locations = sorted(all_locations)

print(unique_locations)
print(f"Total unique locations: {len(unique_locations)}")

['##VA', '##aithersburg', '##ayuga', '##el Tower', '##es', '##gmans', '##gt', '##h', '##il', '##jin', '##lum', '##mofo', '##nan', '##oanoke', '##r America', '##ratton', '##richa', '##riott', '##rlin', '##s', '##sitano', '##to', '##town', '##yang', 'Amazon', 'Amma', 'Ammas', 'Anneheim', 'Argentina', 'Arlington', 'Asukasa', 'Baltimore', 'Belluno', 'Belluno National Park', 'Belmont Bay', 'Ben Gongs', 'Berlin', 'Bethlehem', 'Blacksburg', 'Boston', 'Bradley', 'Brandon ’ s', 'Brazil', 'Bryant Park', 'Busch Gardens', 'C', 'Cali', 'Canada', 'Cancun', 'Carcassone', 'Castle', 'Central Park', 'Chantilly', 'Chantilly Bible Church', 'Charleston', 'Chinatown', 'Chincoteague', 'Church', 'Clarendon', 'Colombia', 'Coronado', 'Cortina', 'Coyote Lake', 'Croatia', 'Culpeper', 'DC', 'Dam', 'Dee', 'Delaware', 'Dontur', 'Dortmund', 'Due South', 'Dulles', 'East', 'Eiff', 'Elise', 'Elises', 'England', 'Europe', 'Fairfax', 'Fburg', 'Fin', 'Florence', 'Flushing', 'Fontana', 'France', 'Fredericksburg', 'Georgetow