In [1]:
%load_ext autoreload
%autoreload 2

from trend_detection import TrendDetector
import pandas as pd
import time
import random

from openai import AzureOpenAI
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

from trend_detection_embeddings import TrendDetectorEmbeddings

from preprocessing import preprocess_text
from profiler import profile_function

[nltk_data] Downloading package punkt to /Users/viktor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/viktor/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/viktor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/viktor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
df_local[['text']].to_json('data/local_events_messages_3.json', orient='records')

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
df_local = pd.read_json('data/local_events_messages_2.json')
processed_messages = []

for m in df_local["text"]:
    processed_messages.append(preprocess_text(m))

df_local["text_processed"] = processed_messages
embeddings = 
df_local['embeddings'] = model.encode(df_local["text_processed"]).tolist()

In [3]:
def split_train_test(df, train_size=0.5):
    train_idx = []
    test_idx = []
    
    for event_name, message_idx in df.groupby('event_name').groups.items():
        train, test = train_test_split(message_idx, train_size=train_size)
        train_idx.extend(train)
        test_idx.extend(test)
    
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    return train_df, test_df

In [18]:
df1, df2 = split_train_test(df_local, 0.5)
df1.shape, df2.shape

((1050, 6), (1050, 6))

In [24]:
def run_trend_detection(df):

    td = TrendDetectorEmbeddings(model=model)
    
    def myfunction():
      return 0.1
    
    messages = list(df["text_processed"])
    random.shuffle(messages, random=myfunction)
    
    for i, message in enumerate(messages):
        td.process_message(message, "LA", time.time())
    
    return td

In [25]:
td1 = run_trend_detection(df1)

since Python 3.9 and will be removed in a subsequent version.
  random.shuffle(messages, random=myfunction)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


New Trend created: tokyo, hanami, sakura, spring, cherry
New Trend created: karlthefog, dont, flight, delays, fogcity
New Trend created: pasta, italianfood, romefoodfest, pastafestival, food
New Trend created: food, street, streetfood, thailand, bangkokfood
New Trend created: festival, brazil, riocarnival, rio, preevent
New Trend created: education, santiago, students, chileprotests, peaceful
New Trend created: street, barcelona, quarter, artisanmarket, gothic
New Trend created: sydney, sydneyblackout, traffic, poweroutage, candlelit
New Trend created: commuterproblems, torontotransit, ttcalert, chaos, subway
New Trend created: transit, berlintransport, berlinstrike, strike, worker
New Trend created: weather, rainbow, traffic, dublinrainbow, ireland
New Trend created: popup, foodieheaven, parisfood, restaurant, paris
New Trend created: kpop, seoulflashmob, traffic, gangnamstation, station
New Trend created: stem, projects, young, innovators, londonscifair
New Trend created: celebration

In [26]:
td2 = run_trend_detection(df2)

since Python 3.9 and will be removed in a subsequent version.
  random.shuffle(messages, random=myfunction)


New Trend created: tokyo, hanami, sakura, temples, parks
New Trend created: fogcity, karlthefog, delays, flight, views
New Trend created: pasta, italianfood, pastafestival, romefoodfest, varieties
New Trend created: food, thailand, bangkokfood, streetfood, market
New Trend created: festival, brazil, riocarnival, rio, beach
New Trend created: education, santiago, chileprotests, reform, students
New Trend created: street, barcelona, music, localcraft, gothic
New Trend created: sydney, sydneyblackout, poweroutage, traffic, candlelit
New Trend created: torontotransit, commuterproblems, ttcalert, subway, chaos
New Trend created: transit, berlintransport, berlinstrike, bike, strike
New Trend created: weather, rainbow, dublinrainbow, ireland, traffic
New Trend created: popup, foodieheaven, parisfood, surprise, famous
New Trend created: kpop, dance, traffic, gangnam, seoulflashmob
New Trend created: stem, young, londonscifair, younginnovators, projects
New Trend created: celebration, champions

In [32]:
def check_cosine_similarity(c1, c2):
    # Reshape to 2D arrays for sklearn
    c1 = np.array(c1).reshape(1, -1)
    c2 = np.array(c2).reshape(1, -1)
    return cosine_similarity(c1, c2)[0][0]

In [34]:
import itertools

In [36]:
for t1, t2 in itertools.product(td1.trends.values(), td2.trends.values()):
    similarity = check_cosine_similarity(t1.centroid, t2.centroid)
    if similarity > 0.8:
        print(t1.keywords, t2.keywords)

['tokyo', 'hanami', 'sakura', 'spring', 'peak'] ['tokyo', 'hanami', 'sakura', 'peak', 'spring']
['karlthefog', 'fogcity', 'delays', 'flight', 'dont'] ['fogcity', 'delays', 'karlthefog', 'flight', 'views']
['pasta', 'italianfood', 'romefoodfest', 'pastafestival', 'food'] ['pasta', 'romefoodfest', 'italianfood', 'pastafestival', 'historic']
['food', 'street', 'bangkokfood', 'market', 'streetfood'] ['food', 'thailand', 'bangkokfood', 'streetfood', 'market']
['festival', 'brazil', 'riocarnival', 'parades', 'costumes'] ['festival', 'brazil', 'riocarnival', 'parades', 'packed']
['education', 'santiago', 'chileprotests', 'students', 'reform'] ['education', 'santiago', 'chileprotests', 'reform', 'center']
['street', 'barcelona', 'local', 'market', 'localcraft'] ['street', 'barcelona', 'music', 'local', 'crafts']
['sydney', 'traffic', 'sydneyblackout', 'poweroutage', 'cbd'] ['sydney', 'sydneyblackout', 'poweroutage', 'candlelit', 'traffic']
['subway', 'commuterproblems', 'ttcalert', 'torontotra