In [1]:
import pandas as pd
import numpy as np
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    pipeline
)
import torch
from tqdm import tqdm
import multiprocessing as mp
from collections import defaultdict
import math

In [2]:
df = pd.read_csv("data_with_sentiments.csv")

In [4]:
print(df['text'].apply(type).unique())

[<class 'str'> <class 'float'>]


In [5]:
# Identify rows where the type is float
float_rows = df[df['text'].apply(type) == float]

# Print the rows with float data type
print("Rows with float values:")
print(float_rows.head())

Rows with float values:
       Unnamed: 0                                                url  \
29616       31770  https://www.mondaq.com:443/unitedstates/licens...   
37172       39836  https://www.mondaq.com:443/canada/copyright/10...   
59093       63285              https://file770.com/copyright-and-ai/   
61721       66115  https://www.windowscentral.com/software-apps/a...   
88360       94643  https://www.law360.com/articles/1697413/copyri...   

             date language                                              title  \
29616  2024-02-01       en  Copyright Office Seeks Comments On Compulsory ...   
37172  2021-08-05       en  Copyright Act Consultation To Address Artifici...   
59093  2023-04-29       en                      Copyright and AI | File 770\t   
61721  2024-05-01       en  Copyright infringement continues to be a pain ...   
88360  2023-07-10       en  Copyright Precautions For AI Content After War...   

      text  sentiment_score sentiment_label  
29616  NaN

In [67]:
device = 0 if torch.cuda.is_available() else -1

In [68]:
device

0

In [69]:
import torch
from transformers import pipeline
from collections import defaultdict
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

def setup_pipelines(model="dslim/bert-base-NER", batch_size=32):
    """Setup NER pipeline with GPU support"""
    device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
    ner_pipeline = pipeline(
        "ner",
        model=model,
        tokenizer=model,
        device=device,  # Enable GPU
        batch_size=batch_size,
        aggregation_strategy="none"
    )
    return ner_pipeline

def process_batch(texts, ner_pipe, sentiment_scores, dates):
    """Process a batch of texts for entity extraction"""
    entity_sentiments = defaultdict(list)
    try:
        # Get NER results for the batch
        ner_results = ner_pipe(texts)

        # Process each text's entities
        for text_idx, entities in enumerate(ner_results):
            process_entities(
                entities,
                entity_sentiments,
                sentiment_scores[text_idx],
                dates[text_idx]
            )
    except Exception as e:
        print(f"Error processing batch: {str(e)}")

    return entity_sentiments

def process_entities(entities, entity_sentiments, sentiment_score, date):
    """Process entities from a single text and associate them with a date."""
    current_entity = ""
    current_type = ""

    for entity in entities:
        entity_type = entity['entity']

        if entity_type.startswith('B-'):
            # Finalize the previous entity if it exists
            if current_entity:
                add_entity_with_date(
                    current_entity.strip(),
                    current_type,
                    entity_sentiments,
                    sentiment_score,
                    date
                )
            # Start a new entity
            current_entity = entity['word'].replace('##', '').strip()
            current_type = entity_type[2:]  # Extract the type (e.g., 'ORG', 'PER')

        elif entity_type.startswith('I-') and current_type == entity_type[2:]:
            # Continue the current entity
            current_entity += " " + entity['word'].replace('##', '').strip()

        else:
            # If it's 'O' or mismatched 'I-*', finalize the current entity
            if current_entity:
                add_entity_with_date(
                    current_entity.strip(),
                    current_type,
                    entity_sentiments,
                    sentiment_score,
                    date
                )
                current_entity = ""
                current_type = ""

    # Add the last entity
    if current_entity:
        add_entity_with_date(
            current_entity.strip(),
            current_type,
            entity_sentiments,
            sentiment_score,
            date
        )

def add_entity_with_date(entity, entity_type, entity_sentiments, sentiment_score, date):
    """Add an entity with its sentiment score and date to the entity_sentiments dictionary."""
    if entity_type in ['PER', 'ORG', 'MISC']:
        mapped_type = {
            'PER': 'People',
            'ORG': 'Organizations',
            'MISC': 'Technologies'
        }[entity_type]
        entity_sentiments[(mapped_type, entity)].append({'sentiment': sentiment_score, 'date': date})

def analyze_entities(df, batch_size=32):
    """Process entity extraction and sentiment analysis on GPU"""
    # Setup NER pipeline
    ner_pipe = setup_pipelines(batch_size=batch_size)

    # Initialize combined results
    combined_sentiments = defaultdict(list)

    # Process the data in batches
    num_batches = math.ceil(len(df) / batch_size)
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch_texts = df['text'][start_idx:end_idx].tolist()
        batch_sentiments = df['sentiment_score'][start_idx:end_idx].tolist()
        batch_dates = df['date'][start_idx:end_idx].tolist()

        # Process batch
        batch_results = process_batch(batch_texts, ner_pipe, batch_sentiments, batch_dates)

        # Merge batch results into combined results
        for (entity_type, entity), sentiments in batch_results.items():
            combined_sentiments[(entity_type, entity)].extend(sentiments)

    # Create summary dataframe
    summary = []
    for (entity_type, entity), sentiments in combined_sentiments.items():
        avg_sentiment = np.mean([s['sentiment'] for s in sentiments])
        mentions = len(sentiments)
        sentiment_std = np.std([s['sentiment'] for s in sentiments]) if mentions > 1 else 0

        summary.append({
            'entity_type': entity_type,
            'entity': entity,
            'avg_sentiment': avg_sentiment,
            'sentiment_std': sentiment_std,
            'mentions': mentions,
            'sentiment_label': 'Positive' if avg_sentiment > 0 else 'Negative',
            'first_mention_date': min([s['date'] for s in sentiments]),
            'last_mention_date': max([s['date'] for s in sentiments])
        })

    return pd.DataFrame(summary)

def plot_timeline(summary_df, entity_type=None):
    """Plot a timeline to illustrate sentiment changes or entity mentions over time."""
    if entity_type:
        data = summary_df[summary_df['entity_type'] == entity_type]
    else:
        data = summary_df

    for _, row in data.iterrows():
        entity = row['entity']
        dates = [s['date'] for s in combined_sentiments[(row['entity_type'], entity)]]
        sentiments = [s['sentiment'] for s in combined_sentiments[(row['entity_type'], entity)]]

        plt.plot(dates, sentiments, marker='o', label=entity)

    plt.title(f"Timeline for {entity_type if entity_type else 'All Entities'}")
    plt.xlabel("Date")
    plt.ylabel("Sentiment Score")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [70]:
pd.options.display.max_rows = 100

In [None]:
# Usage example
batch_size = 32  # Adjust based on GPU memory

# Process the data
entity_summary = analyze_entities(df, batch_size=batch_size)




Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Error processing batch: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Error processing batch: text input must be of type `str` (single example), `List[str]` (batch o

In [72]:
entity_summary.to_csv("entity_summary")

In [7]:
entity_summary = pd.read_csv("entity_summary")

In [11]:
entity_summary

Unnamed: 0.1,Unnamed: 0,entity_type,entity,avg_sentiment,sentiment_std,mentions,sentiment_label,first_mention_date,last_mention_date
0,0,Organizations,NASA Watch,0.994900,0.000000,1,Positive,2021-07-05,2021-07-05
1,1,Organizations,Daily News lette,0.931764,0.334446,135,Positive,2020-05-25,2024-07-12
2,2,Organizations,International Space Station,0.978600,0.052882,14,Positive,2020-05-25,2024-02-05
3,3,Organizations,NASA,0.862361,0.444931,1097,Positive,2020-01-10,2024-11-05
4,4,Technologies,AI,0.855550,0.457312,208630,Positive,2020-01-01,2024-11-07
...,...,...,...,...,...,...,...,...,...
583803,583803,Organizations,K hara,0.999300,0.000000,1,Positive,2023-11-07,2023-11-07
583804,583804,Organizations,Ke et mans ho op,0.999300,0.000000,1,Positive,2023-11-07,2023-11-07
583805,583805,Organizations,N dak olo,0.999300,0.000000,1,Positive,2023-11-07,2023-11-07
583806,583806,Organizations,Men E P Junior,0.999300,0.000000,1,Positive,2023-11-07,2023-11-07


In [17]:
entity_summary[entity_summary['entity'] == 'AI']

Unnamed: 0.1,Unnamed: 0,entity_type,entity,avg_sentiment,sentiment_std,mentions,sentiment_label,first_mention_date,last_mention_date


In [13]:
entity_summary = entity_summary[entity_summary['sentiment_label'] == "negative"]

In [14]:
# Sort and display results
for entity_type in ['People', 'Organizations', 'Technologies']:
    print(f"\n=== {entity_type} Analysis ===")
    type_entities = entity_summary[entity_summary['entity_type'] == entity_type]
    print("\nTop 10 most mentioned entities:")
    print(type_entities.nlargest(50, 'mentions')[
        ['entity', 'mentions', 'avg_sentiment', 'sentiment_label']
    ])


=== People Analysis ===

Top 10 most mentioned entities:
Empty DataFrame
Columns: [entity, mentions, avg_sentiment, sentiment_label]
Index: []

=== Organizations Analysis ===

Top 10 most mentioned entities:
Empty DataFrame
Columns: [entity, mentions, avg_sentiment, sentiment_label]
Index: []

=== Technologies Analysis ===

Top 10 most mentioned entities:
Empty DataFrame
Columns: [entity, mentions, avg_sentiment, sentiment_label]
Index: []
