In [None]:
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import spacy

# Load spaCy's small German pipeline once; the filter_*_batch helpers in this
# notebook read this module-level `nlp` object for part-of-speech tagging.
nlp = spacy.load('de_core_news_sm')

In [None]:
# Read the semicolon-separated input table into `df`.
# NOTE(review): absolute path on the author's machine; adjust before re-running elsewhere.
output_file_path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/output.csv'
df = pd.read_csv(filepath_or_buffer=output_file_path, sep=';')

In [None]:
# Display the freshly loaded frame as the cell output.
df

In [None]:
# Drop every row that contains at least one NaN value; `df` itself is left untouched.
df_cleaned = df.dropna(axis='index', how='any')

In [None]:
# Display the NaN-free frame as the cell output.
df_cleaned

In [None]:
import plotly.figure_factory as ff
import plotly.io as pio

# Build one list of 'fear_z' values per subcorpus, in order of first appearance.
group_labels = df_cleaned['subcorpus'].unique().tolist()
grouped_data = [
    df_cleaned.loc[df_cleaned['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.update_layout(
    title={
        'text': "Verteilungskurven der 'fear_z'-Werte nach Subkorpus",
        'x': 0.5,  # centre the title horizontally
    },
    xaxis_title="fear_z-Werte",  # x-axis label
    yaxis_title="Häufigkeit",  # y-axis label
)

fig.show()

# Export the figure as a static PNG next to the other thesis graphics.
pio.write_image(fig, "C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/Grafiken/distplot.png")

# Substantive filtern

In [None]:
def filter_noun_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is tagged as a noun.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the matching rows and a fresh 0..n-1 index.
    """
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError, so such rows simply count as "not a noun".
    mask = np.array(
        [len(doc) > 0 and doc[0].pos_ == 'NOUN' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)
    
# Keep only the rows of df_cleaned whose 'wordLC' entry is tagged as a noun.
df_nouns = filter_noun_batch(df_cleaned, 'wordLC')

In [None]:
# Display the noun-only frame as the cell output.
df_nouns

In [None]:
# Per-subcorpus 'fear_z' values of the noun-only rows, in order of first appearance.
group_labels = df_nouns['subcorpus'].unique().tolist()
grouped_data = [
    df_nouns.loc[df_nouns['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Verben filtern

In [None]:
def filter_verbs_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is tagged as a verb.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the matching rows and a fresh 0..n-1 index.
    """
    # Batch processing with spaCy.
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError, so such rows simply count as "not a verb".
    mask = np.array(
        [len(doc) > 0 and doc[0].pos_ == 'VERB' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)
    
# Keep only the rows of df_cleaned whose 'wordLC' entry is tagged as a verb.
df_verbs = filter_verbs_batch(df_cleaned, 'wordLC')

In [None]:
# Display the verb-only frame as the cell output.
df_verbs

In [None]:
# Persist the verb-only rows as a semicolon-separated CSV without the index column.
path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/only_verbs.csv'
df_verbs.to_csv(path_or_buf=path, sep=';', index=False)

In [None]:
# Per-subcorpus 'fear_z' values of the verb-only rows, in order of first appearance.
group_labels = df_verbs['subcorpus'].unique().tolist()
grouped_data = [
    df_verbs.loc[df_verbs['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Adjektive filtern

In [None]:
def filter_adj_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is tagged as an adjective.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the matching rows and a fresh 0..n-1 index.
    """
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError, so such rows simply count as "not an adjective".
    mask = np.array(
        [len(doc) > 0 and doc[0].pos_ == 'ADJ' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)
    
# Keep only the rows of df_cleaned whose 'wordLC' entry is tagged as an adjective.
df_adj = filter_adj_batch(df_cleaned, 'wordLC')

In [None]:
# Display the adjective-only frame as the cell output.
df_adj

In [None]:
# Persist the adjective-only rows as a semicolon-separated CSV without the index column.
path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/only_adj.csv'
df_adj.to_csv(path_or_buf=path, sep=';', index=False)

In [None]:
# Per-subcorpus 'fear_z' values of the adjective-only rows, in order of first appearance.
group_labels = df_adj['subcorpus'].unique().tolist()
grouped_data = [
    df_adj.loc[df_adj['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Verben entfernen

In [None]:
def filter_non_verbs_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is not tagged as a verb.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the remaining rows and a fresh 0..n-1 index.
    """
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError. Such a row cannot be a verb, so it is kept.
    mask = np.array(
        [len(doc) == 0 or doc[0].pos_ != 'VERB' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)

# Drop the rows of df_cleaned whose 'wordLC' entry is tagged as a verb.
df_verbs_removed = filter_non_verbs_batch(df_cleaned, 'wordLC')

In [None]:
# Display the frame without verbs as the cell output.
df_verbs_removed

In [None]:
# Persist the rows without verbs as a semicolon-separated CSV without the index column.
path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/without_verbs.csv'
df_verbs_removed.to_csv(path_or_buf=path, sep=';', index=False)

In [None]:
# Per-subcorpus 'fear_z' values of the rows without verbs, in order of first appearance.
group_labels = df_verbs_removed['subcorpus'].unique().tolist()
grouped_data = [
    df_verbs_removed.loc[df_verbs_removed['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Substantive entfernen

In [None]:
def filter_non_nouns_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is not tagged as a noun.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the remaining rows and a fresh 0..n-1 index.
    """
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError. Such a row cannot be a noun, so it is kept.
    mask = np.array(
        [len(doc) == 0 or doc[0].pos_ != 'NOUN' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)

# Drop the rows of df_cleaned whose 'wordLC' entry is tagged as a noun.
df_nouns_removed = filter_non_nouns_batch(df_cleaned, 'wordLC')

In [None]:
# Display the frame without nouns as the cell output.
df_nouns_removed

In [None]:
# Persist the rows without nouns as a semicolon-separated CSV without the index column.
path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/without_nouns.csv'
df_nouns_removed.to_csv(path_or_buf=path, sep=';', index=False)

In [None]:
# Per-subcorpus 'fear_z' values of the rows without nouns, in order of first appearance.
group_labels = df_nouns_removed['subcorpus'].unique().tolist()
grouped_data = [
    df_nouns_removed.loc[df_nouns_removed['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Adjektive entfernen

In [None]:
def filter_non_adj_batch(df, text_column, batch_size=1000):
    """Return the rows of ``df`` whose ``text_column`` entry is not tagged as an adjective.

    Every entry of ``text_column`` is run through the module-level spaCy
    pipeline ``nlp``. Only the first token of each resulting document is
    inspected, so multi-token entries are classified by their first token.

    Args:
        df: DataFrame to filter.
        text_column: Name of the column holding the texts to tag.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        A new DataFrame with only the remaining rows and a fresh 0..n-1 index.
    """
    docs = nlp.pipe(df[text_column].tolist(), batch_size=batch_size)

    # A text without any token yields an empty document; indexing it would
    # raise IndexError. Such a row cannot be an adjective, so it is kept.
    mask = np.array(
        [len(doc) == 0 or doc[0].pos_ != 'ADJ' for doc in docs],
        dtype=bool,
    )

    # A boolean ndarray (instead of a plain list) keeps this a row filter even
    # for an empty frame, where ``df[[]]`` would select zero columns.
    return df[mask].reset_index(drop=True)

# Drop the rows of df_cleaned whose 'wordLC' entry is tagged as an adjective.
df_adj_removed = filter_non_adj_batch(df_cleaned, 'wordLC')

In [None]:
# Display the frame without adjectives as the cell output.
df_adj_removed

In [None]:
# Persist the rows without adjectives as a semicolon-separated CSV without the index column.
path = 'C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/without_adj.csv'
df_adj_removed.to_csv(path_or_buf=path, sep=';', index=False)

In [None]:
# Per-subcorpus 'fear_z' values of the rows without adjectives, in order of first appearance.
group_labels = df_adj_removed['subcorpus'].unique().tolist()
grouped_data = [
    df_adj_removed.loc[df_adj_removed['subcorpus'] == label, 'fear_z'].tolist()
    for label in group_labels
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    hist_data=grouped_data,
    group_labels=group_labels,
    show_hist=False,
    show_rug=False,
)

fig.show()

# Nur Adjektive als Boxplot

In [None]:
# Reload the adjective-only table written earlier in this notebook.
# NOTE(review): this rebinds `df`, which previously held the raw input table;
# re-running the dropna cell after this point would start from the adjective subset.
input_file_path = "C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/only_adj.csv"
df = pd.read_csv(filepath_or_buffer=input_file_path, sep=';')

In [None]:
# Display the reloaded adjective-only frame as the cell output.
df

In [None]:
import plotly.express as px

In [None]:
# Display `df` again (same frame as shown two cells earlier).
df

In [None]:
# Box plot of the 'fear_z' values in `df`, one coloured box per subcorpus.
fig = px.box(data_frame=df, y="fear_z", color="subcorpus")
fig.show()

In [None]:
import plotly.graph_objects as go
import plotly.io as pio

# Draw one box of 'fear_z' values per subcorpus of the cleaned frame.
# NOTE(review): this reads df_cleaned (all words), not the adjective-only `df`
# loaded above — confirm that is intended for this section.
subcorpora = df_cleaned['subcorpus'].unique()

fig = go.Figure()

for subcorpus in subcorpora:
    # Values go on the x axis, so each trace is drawn along that axis.
    filtered_data = df_cleaned.loc[df_cleaned['subcorpus'] == subcorpus, 'fear_z']
    fig.add_trace(go.Box(x=filtered_data, name=subcorpus))

# Centred title anchored at the top, plus axis labels.
fig.update_layout(
    title_text="Boxplots der 'fear_z'-Werte für beide Korpora",
    title_x=0.5,
    title_xanchor='center',
    title_yanchor='top',
    xaxis_title="'fear_z'-Werte",
    yaxis_title="Subcorpus",
)
fig.show()

# Export the figure as a static PNG next to the other thesis graphics.
fig.write_image("C:/Users/jwieb/OneDrive - uni-bielefeld.de/Bachelorarbeit/Grafiken/boxplot.png")