In [91]:
import os
import string
from getpass import getpass

import pandas as pd
import stanza, spacy, nltk
from neo4j import GraphDatabase
from nltk.chunk import ne_chunk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [92]:
# Neo4j connection settings. Every value can be overridden through an
# environment variable so that credentials are not stored in the notebook.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USER", "neo4j")
# The password is never hard-coded: it is read from NEO4J_PASSWORD, and the
# user is prompted (without echo) when that variable is unset or empty.
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
driver = GraphDatabase.driver(uri, auth=(username, password))

In [93]:
# Download the Stanza models and NLTK resources used by the cells below.
stanza.download('it')
stanza.download('en')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Supported language codes -> (NLTK stop-word corpus name, cleaned-tweets CSV).
LANGUAGE_SETTINGS = {
    'en': ('english', "../dataset/cleaned_tweets/cleaned_english_tweets.csv"),
    'it': ('italian', "../dataset/cleaned_tweets/cleaned_italian_tweets.csv"),
}

# Normalise the answer so that " EN " or "It" are accepted as well.
language = input('Language: ').strip().lower()
if language not in LANGUAGE_SETTINGS:
    # Fail here with a clear message instead of a NameError further down,
    # where stop_words / dataset_path_input would otherwise be undefined.
    raise ValueError(
        f"Unsupported language {language!r}; expected one of {sorted(LANGUAGE_SETTINGS)}"
    )

stopword_corpus, dataset_path_input = LANGUAGE_SETTINGS[language]
stop_words = set(stopwords.words(stopword_corpus))

tweets = pd.read_csv(dataset_path_input, sep=';')
# Empty fields are read by pandas as NaN (a float), which would break the
# string handling in the extractors, so drop them and keep plain strings.
cleaned_tweet = tweets['cleaned_tweets'].dropna().astype(str)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 21.1MB/s]                    
2024-11-13 11:02:42 INFO: Downloaded file to /home/lucianoimbimbo/stanza_resources/resources.json
2024-11-13 11:02:42 INFO: Downloading default packages for language: it (Italian) ...
2024-11-13 11:02:44 INFO: File exists: /home/lucianoimbimbo/stanza_resources/it/default.zip
2024-11-13 11:02:48 INFO: Finished downloading models and saved to /home/lucianoimbimbo/stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 32.3MB/s]                    
2024-11-13 11:02:48 INFO: Downloaded file to /home/lucianoimbimbo/stanza_resources/resources.json
2024-11-13 11:02:48 INFO: Downloading default packages for language: en (English) ...
2024-11-13 11:02:50 INFO: File exists: /home/lucianoimbimbo/stanza_resources/en/default.zip
2024-11-13 11:02:55 INFO: Finished downloading mode

In [94]:
def drop_all_nodes(tx):
    """Empty the graph: delete every node together with its relationships.

    Args:
        tx: Object exposing ``run(query)``; the single Cypher statement is
            executed through it.
    """
    wipe_query = """MATCH (n) DETACH DELETE n"""
    tx.run(wipe_query)

In [95]:
# stanza ner
# Building a Pipeline is expensive, so one instance is kept per language
# instead of creating a new one for every tweet.
_STANZA_PIPELINES = {}


def _get_stanza_pipeline(language):
    """Return the cached tokenize+NER pipeline for ``language``, building it on first use."""
    if language not in _STANZA_PIPELINES:
        _STANZA_PIPELINES[language] = stanza.Pipeline(language, processors='tokenize,ner')
    return _STANZA_PIPELINES[language]


def extract_entities_with_context_stanza(tweet,language, k=2):
    """Extract named entities from a tweet with Stanza, plus their neighbouring tokens.

    Stop words are filtered using the module-level ``stop_words`` set defined
    in the language-selection cell.

    Args:
        tweet: Text to analyse. The characters ``.,;(){}[]`` are removed first.
        language: Stanza language code passed to ``stanza.Pipeline``.
        k: Size of the token window inspected on each side of an entity.

    Returns:
        A list of ``(entity_text, entity_type, context_before, context_after)``
        tuples; the two context lists hold the non-stop-word tokens found in
        the ``k`` tokens before / after the entity.
    """
    nlp = _get_stanza_pipeline(language)

    punctuation = ".,;(){}[]"
    tweet = tweet.translate(str.maketrans('', '', punctuation))

    doc = nlp(tweet)

    # Flattened token list across all sentences
    flat_tokens = [token for sent in doc.sentences for token in sent.tokens]
    all_tokens = [token.text for token in flat_tokens]
    # Character offsets identify a token uniquely. Looking the entity up by
    # its text (list.index) would return the FIRST occurrence of that word
    # and yield the wrong context whenever the word is repeated.
    index_by_start_char = {token.start_char: i for i, token in enumerate(flat_tokens)}

    entities_with_context = []
    for sent in doc.sentences:
        for ent in sent.ents:
            start_index = index_by_start_char[ent.tokens[0].start_char]
            end_index = start_index + len(ent.tokens)

            context_before = [
                all_tokens[i]
                for i in range(max(0, start_index - k), start_index)
                if all_tokens[i].lower() not in stop_words
            ]
            context_after = [
                all_tokens[i]
                for i in range(end_index, min(len(all_tokens), end_index + k))
                if all_tokens[i].lower() not in stop_words
            ]

            entities_with_context.append((ent.text, ent.type, context_before, context_after))

    return entities_with_context

In [96]:
# spacy ner
# Supported language codes and the spaCy model loaded for each of them.
_SPACY_MODELS = {"en": "en_core_web_sm", "it": "it_core_news_sm"}
# Loaded models are cached so that spacy.load runs once per language rather
# than once per tweet.
_SPACY_PIPELINES = {}


def extract_entities_with_context_spacy(tweet: str, language: str = "en", k: int = 2):
    """Extract named entities from a tweet with spaCy, plus their neighbouring tokens.

    Args:
        tweet: Text to analyse. Every character in ``string.punctuation`` is
            removed before the text is parsed.
        language: ``"en"`` or ``"it"``.
        k: Size of the token window inspected on each side of an entity.

    Returns:
        A list of ``(entity_text, entity_label, context_before, context_after)``
        tuples; the context lists hold the tokens within ``k`` positions of
        the entity that are not in the model's default stop-word set.

    Raises:
        ValueError: If ``language`` is not supported.
    """
    if language not in _SPACY_MODELS:
        raise ValueError(f"Language {language} not supported")
    if language not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[language] = spacy.load(_SPACY_MODELS[language])
    nlp = _SPACY_PIPELINES[language]

    translator = str.maketrans('', '', string.punctuation)
    clean_tweet = tweet.translate(translator)

    doc = nlp(clean_tweet)
    stop_words = nlp.Defaults.stop_words

    all_tokens = [token.text for token in doc]
    entities_with_context = []

    for ent in doc.ents:
        # Span.start / Span.end are the entity's token offsets in the doc
        # (end exclusive). Searching for the first token's text instead
        # would pick the first occurrence of a repeated word.
        start_index, end_index = ent.start, ent.end

        context_before = [all_tokens[i] for i in range(max(0, start_index-k), start_index)
                         if all_tokens[i].lower() not in stop_words]
        context_after = [all_tokens[i] for i in range(end_index, min(len(all_tokens), end_index+k))
                        if all_tokens[i].lower() not in stop_words]

        entities_with_context.append((ent.text, ent.label_, context_before, context_after))

    return entities_with_context

In [97]:
# nltk ner
# NLTK names its stop-word corpora with full English names, while the rest of
# the notebook uses two-letter codes; accept both.
_NLTK_LANGUAGE_NAMES = {"it": "italian", "en": "english"}


def extract_entities_with_context_nltk(tweet: str, language: str = "italian", k: int = 2):
    """Extract candidate entities from a tweet using a capitalisation heuristic.

    Every capitalised token that is not a stop word is reported as an entity
    of type ``'NE'``; no statistical NER model is involved.

    Args:
        tweet: Text to analyse. Every character in ``string.punctuation`` is
            removed before tokenisation.
        language: NLTK stop-word corpus name (e.g. ``"italian"``) or one of
            the codes ``"it"`` / ``"en"``.
        k: Size of the token window inspected on each side of an entity.

    Returns:
        A list of ``(token, 'NE', context_before, context_after)`` tuples; the
        context lists hold the non-stop-word tokens within ``k`` positions.
    """
    corpus_name = _NLTK_LANGUAGE_NAMES.get(language, language)
    try:
        stop_words = set(nltk.corpus.stopwords.words(corpus_name))
    except (LookupError, OSError):
        # Best-effort fallback to English when the requested stop-word list
        # is not available (unknown name or missing resource).
        stop_words = set(nltk.corpus.stopwords.words('english'))

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    clean_tweet = tweet.translate(translator)

    # Tokenize
    tokens = word_tokenize(clean_tweet)

    entities_with_context = []

    for i, token in enumerate(tokens):
        # Consider capitalized words as potential entities (simplified approach)
        if token and token[0].isupper() and token.lower() not in stop_words:
            context_before = [tokens[j] for j in range(max(0, i-k), i)
                            if tokens[j].lower() not in stop_words]
            context_after = [tokens[j] for j in range(i+1, min(len(tokens), i+1+k))
                           if tokens[j].lower() not in stop_words]

            entities_with_context.append((token, 'NE', context_before, context_after))

    return entities_with_context

In [98]:
# Function to create nodes and edges in the Neo4j graph
# Upserts both endpoint nodes and the weighted edge between them in a single
# round-trip: the weight starts at 1 and grows by 1 on every repeat.
_UPSERT_EDGE_QUERY = """
    MERGE (s:Term {name: $source})
    MERGE (t:Term {name: $target})
    MERGE (s)-[r:POINTS_TO]->(t)
    ON CREATE SET r.weight = 1
    ON MATCH SET r.weight = r.weight + 1
    """


def create_neo4j_graph(tx, entities_with_context):
    """Write entities and their context terms to the graph as ``Term`` nodes.

    Args:
        tx: Object exposing ``run(query, **params)`` used to execute Cypher.
        entities_with_context: Iterable of
            ``(entity, label, context_before, context_after)`` tuples.

    For every entity, each term in ``context_before`` gets a ``POINTS_TO``
    edge to the entity, and the entity gets a ``POINTS_TO`` edge to each term
    in ``context_after``.
    """
    for entity, label, context_before, context_after in entities_with_context:
        # Treat NE as a term (same node type). The node is identified by its
        # name only: merging on {name, label} would not match a node created
        # earlier as a plain context term (which has no label) and would
        # create a duplicate with the same name.
        tx.run("MERGE (n:Term {name: $entity}) SET n.label = $label", entity=entity, label=label)

        # Directed edges from the preceding context terms to the NE
        for term in context_before:
            tx.run(_UPSERT_EDGE_QUERY, source=term, target=entity)

        # Directed edges from the NE to the following context terms
        for term in context_after:
            tx.run(_UPSERT_EDGE_QUERY, source=entity, target=term)

In [99]:
# Rebuild the graph from scratch: wipe it, then add the entities of each tweet.
try:
    with driver.session() as session:
        session.execute_write(drop_all_nodes)
        for tweet in cleaned_tweet.to_list():
            # Rows that are not text (e.g. NaN from an empty CSV field)
            # cannot be tokenised; skip them.
            if not isinstance(tweet, str):
                continue
            entities_with_context = extract_entities_with_context_nltk(tweet,language)
            # Nothing to write for tweets without entities: skip the transaction.
            if entities_with_context:
                session.execute_write(create_neo4j_graph, entities_with_context)
finally:
    # Close the driver even when processing a tweet raises.
    driver.close()