Arthur

In [2]:
import spacy
import lftk
import pandas as pd
import plotly.express as px
from scipy.stats import pearsonr
nlp = spacy.load("en_core_web_sm")
import os

In [2]:
def count_pos_tags(text):
    """Extract a fixed set of LFTK handcrafted linguistic features from one text.

    Despite its name, this function does more than count part-of-speech tags:
    it returns every feature listed in ``feature_keys`` below.

    Args:
        text: The raw text; it is parsed with the module-level spaCy pipeline ``nlp``.

    Returns:
        Whatever ``lftk.Extractor.extract`` returns for the requested feature keys
        (the caller in this notebook treats it as a dict keyed by feature key).
    """
    # Feature keys requested from LFTK, in the same order as before.
    feature_keys = [
        "a_kup_pw", "a_kup_ps", "t_kup", "t_char", "t_uword", "a_word_ps",
        "n_unoun", "n_uverb", "n_uadj", "t_n_ent", "simp_ttr", "t_punct",
        "t_stopword", "fkre", "fkgl", "fogi", "smog", "cole", "auto",
        "rt_fast", "rt_average", "rt_slow",
    ]

    # Parse the text and hand the resulting doc to the extractor
    # (the extractor also accepts a list of docs).
    parsed = nlp(text)
    extractor = lftk.Extractor(docs = parsed)

    # Round every extracted value to three decimal places.
    extractor.customize(round_decimal=3)

    return extractor.extract(features = feature_keys)

# Maps LFTK feature keys to the readable column names used in the analysis.
# Feature keys that are not listed here keep their original name.
rename_dict = dict(
    t_char="total_char",
    t_uword="unique_words",
    a_word_ps="average_words_sentence",
    n_unoun="unique_nouns",
    n_uverb="unique_verbs",
    n_uadj="unique_adj",
    t_n_ent="total_entities",
    simp_ttr="type_token_ratio",
    t_punct="total_punctuation",
    t_stopword="total_stopwords",
)

In [3]:
def renamer(data: list, column_mapping: dict) -> list:
    """Return copies of the given records with their keys renamed.

    Args:
        data: A list of dictionaries (one per record).
        column_mapping: Maps an existing key to its replacement. Keys that do
            not appear in the mapping are kept unchanged.

    Returns:
        A new list of new dictionaries; the input records are not modified.
    """
    return [
        {column_mapping.get(old_key, old_key): val for old_key, val in record.items()}
        for record in data
    ]

In [4]:
# Load the scored texts.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
obj = pd.read_pickle("./export/gold_160.pkl")
# Transpose the loaded object and give columns 0 and 1 descriptive names
# (no in-place mutation, so re-running the cell is safe).
df = pd.DataFrame(obj).T.rename(columns={0: "Score", 1: "Text"})

# Apply count_pos_tags to every text; the resulting list has one dict per row, in row order.
data = [count_pos_tags(i) | {"Text": i} for i in df["Text"]]
data = renamer(data, rename_dict)  # give the feature columns readable names

data_df = pd.DataFrame(data)
# Attach the features by row position rather than merging on "Text".
# A merge on the text column is a many-to-many join: a text that occurs k times
# would produce k*k rows and distort every correlation computed afterwards.
# The index is reset first so both frames share the same 0..n-1 index
# (this is also the index the previous merge produced).
df = pd.concat(
    [df.reset_index(drop=True), data_df.drop(columns="Text", errors="ignore")],
    axis=1,
)
df["Nr"] = df.index.astype(str)  # string labels so the x-axis is treated as categorical

In [5]:
def topResult(metric, num:int):
    """Show a bar chart of the ``num`` highest-scoring rows.

    Args:
        metric: DataFrame with at least the columns 'Score' and 'Nr'.
        num: Maximum number of rows to display, taken from the top after
            sorting by 'Score' in descending order.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    top_rows = metric.sort_values(by='Score', ascending=False).head(num)

    # The title reports the number of rows actually plotted; it was previously
    # hard-coded to "Top 3" regardless of `num`.
    fig = px.bar(top_rows, x='Nr', y='Score',
                 title=f'Top {len(top_rows)} Topics by Score', text='Score',
                 width=800, height=400)
    fig.update_layout(
        xaxis_title='Topic',
        # treat 'Nr' as category labels rather than numbers
        xaxis_type='category',
        yaxis_title='Score',
    )

    fig.show()

In [6]:
# Bar chart of the ten highest-scoring rows of the full data set.
topResult(df,10)

In [7]:
def nomCorrelation(metric):
    """Plot 'unique_nouns' against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'unique_nouns', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    feature = 'unique_nouns'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of nouns and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and Nomen is: {r_value:.3f}, p: {p:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [8]:
def verbCorrelation(metric):
    """Plot 'unique_verbs' against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'unique_verbs', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    feature = 'unique_verbs'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of verbs and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and Verben is: {r_value:.3f}, p: {p:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [9]:
def fkreCorrelation(metric):
    """Plot the 'fkre' feature against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'fkre', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value (the p-value with
        five decimals), then shows the figure.
    """
    feature = 'fkre'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of fkre and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and FKRE is: {r_value:.3f}, p: {p:.5f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [10]:
def fkglCorrelation(metric):
    """Plot the 'fkgl' feature against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'fkgl', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    feature = 'fkgl'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of fkgl and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and FKGL is: {r_value:.3f}, p: {p:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [11]:
def fogiCorrelation(metric):
    """Plot the 'fogi' feature against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'fogi', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    feature = 'fogi'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of fogi and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and FOGI is: {r_value:.3f}, p: {p:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [13]:
def smogCorrelation(metric):
    """Plot the 'smog' feature against 'Score' and print their Pearson correlation.

    Args:
        metric: DataFrame with the columns 'smog', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    feature = 'smog'

    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'.
    fig = px.scatter(
        metric,
        x=feature,
        y='Score',
        title='Correlation between amount of smog and Score',
        trendline="ols",
        trendline_scope='overall',
        color="Nr",
        width=800,
        height=400,
    )

    # Pearson correlation between the feature and the score
    r_value, p = pearsonr(metric[feature], metric['Score'])
    print(f'The Pearson correlation coefficient between Score and SMOG is: {r_value:.3f}, p: {p:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [24]:
def aoaCorrelation2(metric):
    """Plot the 't_kup' feature against 'Score' and print their Pearson correlation.

    The plot title presents 't_kup' as an age-of-acquisition measure
    (presumably LFTK's total Kuperman age of acquisition — TODO confirm
    against the LFTK feature list).

    Args:
        metric: DataFrame with the columns 't_kup', 'Score' and 'Nr'.

    Returns:
        None. Prints the correlation coefficient and p-value, then shows the figure.
    """
    # Scatter plot with a single OLS trendline fitted over all points;
    # points are coloured per row via 'Nr'. (Title typo "Acquisation" fixed.)
    fig = px.scatter(metric, x='t_kup', y='Score', title='Correlation between Age of Acquisition and Score',
                     trendline="ols", trendline_scope='overall', color="Nr",
                     width=800, height=400)

    # Pearson correlation between the feature and the score
    correlation, p_value = pearsonr(metric['t_kup'], metric['Score'])
    # The message used to say "Nomen" (copied from nomCorrelation), which
    # mislabelled the result; it now names the feature that is correlated.
    print(f'The Pearson correlation coefficient between Score and Age of Acquisition is: {correlation:.3f}, p: {p_value:.3f}.')

    # Hide the legend: it would only list every single 'Nr' value.
    fig.update_layout(yaxis_title='Score', showlegend=False)
    fig.show()

In [15]:
# Scatter plot and Pearson correlation of the 'fkre' column against Score.
fkreCorrelation(df)

The Pearson correlation coefficient between Score and FKRE is: 0.483, p: 0.00000.


In [16]:
# Scatter plot and Pearson correlation of the 'fkgl' column against Score.
fkglCorrelation(df)

The Pearson correlation coefficient between Score and FKGL is: -0.614, p: 0.000.


In [17]:
# Scatter plot and Pearson correlation of the 'fogi' column against Score.
fogiCorrelation(df)

The Pearson correlation coefficient between Score and FOGI is: -0.571, p: 0.000.


In [18]:
# Scatter plot and Pearson correlation of the 'smog' column against Score.
smogCorrelation(df)

The Pearson correlation coefficient between Score and SMOG is: -0.351, p: 0.000.


In [19]:
# Scatter plot and Pearson correlation of the 'unique_nouns' column against Score.
nomCorrelation(df)

The Pearson correlation coefficient between Score and Nomen is: -0.313, p: 0.000.


In [20]:
# Scatter plot and Pearson correlation of the 'unique_verbs' column against Score.
verbCorrelation(df)

The Pearson correlation coefficient between Score and Verben is: -0.389, p: 0.000.


In [22]:
# NOTE(review): `aoaCorrelation` is not defined in any cell visible here (only
# `aoaCorrelation2` is). This call presumably relies on a definition from a
# deleted or unseen cell and would raise NameError on a fresh kernel — TODO confirm.
aoaCorrelation(df)

The Pearson correlation coefficient between Score and Nomen is: -0.632, p: 0.000.


In [25]:
# Scatter plot and Pearson correlation of the 't_kup' column against Score.
aoaCorrelation2(df)

The Pearson correlation coefficient between Score and Nomen is: -0.640, p: 0.000.
