In [1]:
import polars as pl

# Load the raw speech corpus.
df_parla = pl.read_parquet("./ParlaMind.parquet")

# Keep only rows that actually have speech text; the sentiment model below is fed
# this column directly.
df_parla = df_parla.filter(pl.col("speechContent").is_not_null())

In [2]:
from germansentiment import SentimentModel
from tqdm import tqdm
import torch
import spacy

model = SentimentModel()

batch_size = 512
data_num = df_parla.height

pos_results, neg_results, neut_results, major_class = [], [], [], []
# Stays empty in this cell; a later cell displays it, and sentence-level sentiment
# is computed further down once the speeches have been split into sentences.
sentiment_per_sentence = []

for start_batch in tqdm(range(0, data_num, batch_size), desc="Processing Batches"):
    # slice() clips at the end of the frame, so the last batch may be shorter.
    speeches_batch = df_parla.slice(start_batch, batch_size)["speechContent"].to_list()

    # With output_probabilities=True the model returns (predicted labels, probabilities).
    classes, probabilities = model.predict_sentiment(speeches_batch, output_probabilities=True)

    torch.cuda.empty_cache()

    for speech_probabilities in probabilities:
        # Each entry is a sequence of [label, probability] pairs. Looking the values up
        # by label instead of by position fails loudly (KeyError) if the label order or
        # label set ever differs, rather than silently mixing up the columns.
        by_label = dict(speech_probabilities)
        pos_results.append(by_label["positive"])
        neg_results.append(by_label["negative"])
        neut_results.append(by_label["neutral"])
    major_class.extend(classes)

# The result lists are attached to df_parla as columns later, so they must line up
# row for row with the frame.
assert len(major_class) == data_num, "sentiment results do not cover every speech"


KeyboardInterrupt: 

In [None]:
sentiment_per_sentence

In [None]:
# Attach the speech-level sentiment outputs as four new columns in a single call.
df_parla = df_parla.with_columns(
    pl.Series(name="sent_pos", values=pos_results),
    pl.Series(name="sent_neg", values=neg_results),
    pl.Series(name="sent_neu", values=neut_results),
    pl.Series(name="sent_pred", values=major_class),
)

In [None]:
df_parla

In [None]:
#df_parla.write_parquet("ParlaMind_sentiment.parquet")

In [None]:
import polars as pl
# Reload the sentiment-annotated corpus from disk. The cell that would write this
# file is commented out above, so the file has to exist already.
df_parla = pl.read_parquet("ParlaMind_sentiment.parquet")

In [None]:
# Drop very short speeches: keep only rows whose text has at least 40 characters.
filtered_df = df_parla.filter(
    pl.col("speechContent").str.len_chars() >= 40
)

In [None]:
filtered_df

In [None]:
# Number of speeches for every (abbreviation, predicted sentiment) pair.
sentiment_counts = filtered_df.group_by(["abbreviation", "sent_pred"]).agg(
    pl.len().alias("count")
)

# Number of speeches per abbreviation; this is the denominator for the shares below.
total_counts = filtered_df.group_by("abbreviation").agg(
    pl.len().alias("total")
)

# Share of each sentiment class within an abbreviation, in percent, reshaped to one
# row per abbreviation and one column per sentiment label.
result = sentiment_counts.join(total_counts, on="abbreviation")
result = result.with_columns(
    (pl.col("count") / pl.col("total") * 100).alias("percentage")
)
result = result.pivot(values="percentage", index="abbreviation", columns="sent_pred")
# Fill missing values with 0 if a party has no posts for a sentiment
result = result.fill_null(0)

In [None]:
result

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# table instead of plain printed text.
result.sort("positive")

In [None]:
df_parla

In [None]:
# Parse the "%Y-%m-%d" date strings once, then derive both the Date column and its year.
parsed_date = pl.col("date").str.strptime(pl.Date, "%Y-%m-%d")
df = filtered_df.with_columns(
    parsed_date.alias("date_parsed"),
    parsed_date.dt.year().alias("year"),
)

# Number of speeches for every (year, predicted sentiment) pair.
df_sentiment_year = df.group_by(["year", "sent_pred"]).agg(
    sentiment_count=pl.len()
)


In [None]:
# group_by does not guarantee row order, so sort by year before plotting.
df_sentiment_year = df_sentiment_year.sort("year")

In [None]:
import plotly.express as px


# One line per predicted sentiment label. Title and axis labels are set so the
# figure can be read without looking at the code.
fig = px.line(
    df_sentiment_year,
    x="year",
    y="sentiment_count",
    color="sent_pred",
    title="Reden pro Jahr nach Sentiment",
    labels={
        "year": "Jahr",
        "sentiment_count": "Anzahl Reden",
        "sent_pred": "Sentiment",
    },
)
fig.show()

In [None]:
df_parla

In [None]:
import spacy
import polars as pl
from tqdm import tqdm
import torch

# Read data
#df_parla = pl.read_parquet("br_sentiment.parquet")


# German spaCy pipeline with the listed components switched off; only what is left
# enabled runs when the speeches are split into sentences.
nlp = spacy.load(
    "de_core_news_md",
    disable=["ner", "lemmatizer", "attribute_ruler", "tagger", "morphologizer"],
)


def process_batch(speeches_batch):
    """Split every text in ``speeches_batch`` into sentences.

    Returns a list with one entry per document yielded by ``nlp.pipe``; each
    entry is the list of sentence strings of that document.
    """
    sentence_lists = []
    # n_process=-1 lets spaCy spread the work over all available CPU cores.
    for doc in nlp.pipe(speeches_batch, n_process=-1):
        sentence_lists.append([sent.text for sent in doc.sents])
    return sentence_lists


batch_size = 4096
data_num = df_parla.height

# Filled batch by batch, so the entries stay in the row order of df_parla.
sentences_results = []

for start_batch in tqdm(range(0, data_num, batch_size), desc="Processing Batches"):
    rows_in_batch = min(batch_size, data_num - start_batch)
    speeches_batch = df_parla.slice(start_batch, rows_in_batch)["speechContent"].to_list()
    sentences_results.extend(process_batch(speeches_batch))

df_parla = df_parla.with_columns(pl.Series("sentences", sentences_results))


In [None]:
df_parla

In [None]:
#df_parla.write_parquet("ParlaMind_sentiment_sentence_split.parquet")

In [None]:
import polars as pl
#df_parla = pl.read_parquet("br_sentiment_sentence_split.parquet")

In [None]:
df_parla

In [None]:
from germansentiment import SentimentModel
from tqdm import tqdm
import torch

model = SentimentModel()

batch_size = 512
data_num = df_parla.height

# Speeches with fewer sentences than this get an empty prediction list.
# NOTE(review): with the value 2, single-sentence speeches are never scored at
# sentence level. Confirm this is intended; use 1 to score them as well.
min_sentences = 2

# One list of predicted labels per speech, in the row order of df_parla.
sentences_predictions = []

for start_batch in tqdm(range(0, data_num, batch_size), desc="Processing Batches"):
    df_batch = df_parla.slice(start_batch, batch_size)

    for sentences in df_batch["sentences"]:
        if len(sentences) >= min_sentences:
            sentences_predictions.append(
                model.predict_sentiment(sentences, output_probabilities=False)
            )
        else:
            sentences_predictions.append([])

    torch.cuda.empty_cache()

# The explicit dtype keeps the column a list of strings even when the first
# speeches only contribute empty lists.
df_parla = df_parla.with_columns(
    pl.Series("sentences_sentiment", sentences_predictions, dtype=pl.List(pl.Utf8))
)


In [None]:
df_parla

In [None]:
df_parla.write_parquet("ParlaMind_all.parquet") # this is needed for sentiment_plots

In [None]:
df_parla

In [None]:
import spacy
import polars as pl
from tqdm import tqdm
import torch

df_parla = pl.read_parquet("br_sentiment.parquet")

# `nlp` and `process_batch` come from the sentence-splitting cell earlier in this
# notebook. That cell loads the same "de_core_news_md" pipeline with the same
# disabled components and defines the same helper, so they are reused here instead
# of being defined a second time.

batch_size = 4096
data_num = df_parla.height

# Filled batch by batch, so the entries stay in the row order of df_parla.
sentences_results = []

for start_batch in tqdm(range(0, data_num, batch_size), desc="Processing Batches"):
    rows_in_batch = min(batch_size, data_num - start_batch)
    speeches_batch = df_parla.slice(start_batch, rows_in_batch)["speechContent"].to_list()
    sentences_results.extend(process_batch(speeches_batch))

df_parla = df_parla.with_columns(pl.Series("sentences", sentences_results))


In [None]:
# Load "top.parquet"; the following cells read its "sentences_sentiment",
# "speechContent" and "date" columns.
df_parla_top = pl.read_parquet("top.parquet")

In [None]:
# For every speech, count how many of its sentence-level predictions carry each label.
pos_list, neu_list, neg_list = [], [], []

for sentence_sentiment in df_parla_top.select("sentences_sentiment").to_series():
    labels = list(sentence_sentiment)
    pos_list.append(labels.count("positive"))
    neu_list.append(labels.count("neutral"))
    neg_list.append(labels.count("negative"))

In [None]:
# Attach the per-speech label counts as three new columns in a single call.
df_parla_top = df_parla_top.with_columns(
    pl.Series(name="pos_per_sentence", values=pos_list),
    pl.Series(name="neg_per_sentence", values=neg_list),
    pl.Series(name="neu_per_sentence", values=neu_list),
)

In [None]:
# Drop very short speeches: keep only rows whose text has at least 40 characters.
df_parla_top = df_parla_top.filter(
    pl.col("speechContent").str.len_chars() >= 40
)

In [None]:
# Parse the "%Y-%m-%d" date strings once, then derive both the Date column and its year.
parsed_date = pl.col("date").str.strptime(pl.Date, "%Y-%m-%d")
df = df_parla_top.with_columns(
    parsed_date.alias("date_parsed"),
    parsed_date.dt.year().alias("year"),
)

# One row per distinct (year, pos, neg, neu) combination; "sentiment_count" is the
# number of speeches that share that combination.
df_parla_top_year = df.group_by(
    ["year", "pos_per_sentence", "neg_per_sentence", "neu_per_sentence"]
).agg(
    sentiment_count=pl.len()
)


In [None]:
# Order the grouped rows chronologically by year.
df_parla_top_year = df_parla_top_year.sort("year")

In [None]:
df_parla_top_year

In [None]:
# `go` is used below but is not imported by any of the cells visible in this notebook.
import plotly.graph_objects as go

# Each row of df_parla_top_year stands for `sentiment_count` speeches that share the
# same (year, pos, neg, neu) combination. The per-sentence counts are therefore
# weighted by that multiplicity before summing; a plain sum would count every
# combination only once and understate the yearly totals.
df_yearly_sum = df_parla_top_year.group_by("year").agg([
    (pl.col(column) * pl.col("sentiment_count")).sum().alias(column)
    for column in ("pos_per_sentence", "neg_per_sentence", "neu_per_sentence")
]).sort("year")

# Plot: one trace per sentiment label.
fig = go.Figure()

for column, label in (
    ("pos_per_sentence", "Positive"),
    ("neg_per_sentence", "Negative"),
    ("neu_per_sentence", "Neutral"),
):
    fig.add_trace(go.Scatter(x=df_yearly_sum["year"], y=df_yearly_sum[column], name=label))

fig.update_layout(
    title='Sentiment Analyse pro Jahr',
    xaxis_title='Jahr',
    yaxis_title='Summe der Sentiments',
    legend_title='Sentiment Typ'
)

fig.show()


In [None]:
import polars as pl
# Reload "top.parquet" from disk; this replaces the df_parla_top that earlier cells
# filtered and extended with extra columns.
df_parla_top = pl.read_parquet("top.parquet")

In [None]:
# Plain Python list of the speech texts, in row order.
speeches = df_parla_top["speechContent"].to_list()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from tqdm import tqdm


# Download the NLTK resources named below; stopwords.words() and word_tokenize()
# are used by remove_stopwords() in this cell.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# German spaCy pipeline loaded without disabling any component; lemmatize_text()
# below reads token.lemma_ from it. This rebinds the notebook-wide name `nlp`.
nlp = spacy.load('de_core_news_md')

def clean_text(text):
    """
    Normalise a text: lower-case it, remove punctuation and digits, and collapse
    every run of whitespace into a single space. Returns the stripped result.
    """
    cleaned = text.lower()

    # Order matters: punctuation and digits are removed first, then the gaps they
    # leave behind are collapsed.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Lazily filled cache of NLTK's German stop-word list, so the list is loaded once
# instead of once per speech.
_GERMAN_STOPWORDS = None


def remove_stopwords(text, custom_stopwords=None):
    """
    Remove stop words from a text.

    Args:
        text: The text to filter.
        custom_stopwords: Optional iterable of extra words to remove in addition
            to NLTK's German stop-word list.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _GERMAN_STOPWORDS
    if _GERMAN_STOPWORDS is None:
        _GERMAN_STOPWORDS = frozenset(stopwords.words('german'))

    stop_words = _GERMAN_STOPWORDS
    if custom_stopwords:
        # Build a new set so the cached base list is never modified.
        stop_words = stop_words | set(custom_stopwords)

    # The texts are German, so use the German tokenizer model rather than the
    # English default.
    word_tokens = word_tokenize(text, language='german')
    filtered_text = [word for word in word_tokens if word not in stop_words]

    return ' '.join(filtered_text)

def lemmatize_text(text):
    """
    Lemmatise a text: run it through the spaCy pipeline `nlp` and return the
    lemma of every token, joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

def preprocess_political_speeches(speeches, custom_stopwords=None):
    """
    Run the full preprocessing chain on every speech.

    Each speech is cleaned with clean_text(), stripped of stop words with
    remove_stopwords() (passing `custom_stopwords` through) and finally
    lemmatised with lemmatize_text(). Returns the processed texts as a list in
    input order.
    """
    def _prepare(speech):
        without_stopwords = remove_stopwords(clean_text(speech), custom_stopwords)
        return lemmatize_text(without_stopwords)

    return [
        _prepare(speech)
        for speech in tqdm(speeches, desc="Processing speeches", unit="speech")
    ]


In [None]:
processed_speeches = preprocess_political_speeches(speeches)


In [None]:
from bertopic import BERTopic

# The speeches are German (they are preprocessed with German stop words and a
# German spaCy model), so request BERTopic's multilingual embedding model instead
# of the English default.
topic_model = BERTopic(language="multilingual")
topics, probs = topic_model.fit_transform(processed_speeches)

In [None]:
topic_model.get_topic_info()

In [None]:
topic_model.get_topic(3)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()

In [None]:
# Ask the model for up to 5 topics matching the search term "bundeswehr", then show
# the word list of the first topic it returned.
similar_topics, similarity = topic_model.find_topics("bundeswehr", top_n=5)
topic_model.get_topic(similar_topics[0])


In [None]:
#topic_model.save("topic_model", serialization="pickle")

In [None]:
import polars as pl
# Load the cleaned-sentence data ("cleanded" is the spelling used by both the file
# name and the "cleanded_sentence" column read in later cells).
df = pl.read_parquet("top_cleanded_Sentences.parquet")

In [None]:
df

In [None]:
from bertopic import BERTopic


# Load the BERTopic model stored under "topic_model"; the save call in an earlier
# cell is commented out, so the file has to exist already.
# NOTE(review): that save call uses serialization="pickle". Unpickling can execute
# arbitrary code, so only load a model file from a trusted source.
loaded_model = BERTopic.load("topic_model")

In [None]:
import polars as pl
# Same file as loaded a few cells earlier; reading it again resets `df` to the
# on-disk state.
df = pl.read_parquet("top_cleanded_Sentences.parquet")

In [None]:
# Show the sentences longer than 20 characters. `.str.len_chars()` measures each
# string; `.len()` would return the row count of the column, which makes the
# condition the same for every row (all rows kept or all rows dropped).
df.filter(pl.col("cleanded_sentence").str.len_chars() > 20)

In [None]:
loaded_model.get_topic_info()

In [None]:
# Parse the "%Y-%m-%d" strings in "date" into a Date column named "date_parsed".
df = df.with_columns(
    pl.col("date").str.strptime(pl.Date, "%Y-%m-%d").alias("date_parsed")
)

In [None]:
import numpy as np
# Pull the parsed dates out as a Python list, then convert them to a NumPy
# datetime64 array for the topics-over-time call in the next cell.
timestamps = df.select("date_parsed").to_series().to_list()
timestamps_np = np.array(timestamps, dtype='datetime64')

In [None]:
# Topic frequencies over time: the sentences and their timestamps are passed
# together, with the time axis divided into 72 bins.
topics_over_time = loaded_model.topics_over_time(
    df["cleanded_sentence"].to_list(),
    timestamps_np,
    nr_bins=72,
)

In [None]:
# Load the preprocessed data; the aggregation in a later cell groups it by "year"
# and sums its per-sentence sentiment count columns.
df_timon = pl.read_parquet("processed_bundestags_daten.parquet")

In [None]:
df_timon

In [None]:
# `go` is used below but is not imported by any of the cells visible in this notebook.
import plotly.graph_objects as go

# Yearly totals of the per-sentence sentiment counts. The plot reads
# "pos_per_sentence", so that column is aggregated here; summing "pos" produced a
# column named "pos" and made the first trace fail.
# NOTE(review): assumes df_timon has a "pos_per_sentence" column like its
# "neg_per_sentence" / "neu_per_sentence" siblings; confirm against the file.
sentiment_columns = (
    ("pos_per_sentence", "Positive"),
    ("neg_per_sentence", "Negative"),
    ("neu_per_sentence", "Neutral"),
)

df_yearly_sum = df_timon.group_by("year").agg([
    pl.col(column).sum() for column, _ in sentiment_columns
]).sort("year")

# One trace per sentiment label.
fig = go.Figure()

for column, label in sentiment_columns:
    fig.add_trace(go.Scatter(x=df_yearly_sum["year"], y=df_yearly_sum[column], name=label))

fig.update_layout(
    title='Sentiment Analyse pro Jahr',
    xaxis_title='Jahr',
    yaxis_title='Summe der Sentiments',
    legend_title='Sentiment Typ'
)

fig.show()
