To perform topic modeling on a small dataset in Python, you can use the Latent Dirichlet Allocation (LDA) algorithm from the gensim library. LDA is a probabilistic model that assumes that each document in a dataset is a mixture of a small number of topics, and that each word in the document is generated from one of those topics.

In [None]:
import csv, pandas as pd
from gensim import corpora
import re

## Reinigung

In [None]:
# Read the scenarios from the Excel workbook into a DataFrame
scenario = pd.read_excel('human_finetuned_gpt3_scenarios.xlsx')
#scenario = scenario.drop('prompt', axis = 1)

In [None]:
scenario.dropna(subset=['scenario'], inplace=True)

In [None]:
scenario['scenario'] = \
scenario['scenario'].map(lambda x: x.lower())
scenario['scenario'].head(1)

In [None]:
def split_sentence(sentence):
    """Split a sentence on whitespace and return its distinct words.

    Each word is kept once, in the order of its first occurrence, so the
    result is deterministic across runs (a plain ``set`` would return the
    words in an arbitrary, hash-dependent order).

    Parameters
    ----------
    sentence : str
        Text to split; an empty or whitespace-only string yields ``[]``.

    Returns
    -------
    list of str
        The distinct whitespace-separated tokens of ``sentence``.
    """
    # dict keys are unique and remember insertion order
    return list(dict.fromkeys(sentence.split()))

In [None]:
scenario['scenario_singlewords'] = scenario['scenario'].apply(split_sentence)

In [None]:
scenario['scenario_singlewords'].head(1)

In [None]:
# Convert the lists in the scenario_singlewords column to tuples
scenario['scenario_singlewords'] = scenario['scenario_singlewords'].apply(tuple)

In [None]:
# Strip punctuation from every token and drop tokens that end up empty
# (e.g. a stand-alone dash), so '' is never counted as a word later on.
scenario['scenario_singlewords'] = scenario['scenario_singlewords'].apply(
    lambda tokens: [
        cleaned
        for cleaned in (re.sub(r'[^\w\s]+', '', token) for token in tokens)
        if cleaned
    ]
)

In [None]:
scenario

## Stopwords und Synonyme filtern

In [None]:
import gensim
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np

In [None]:
# Domain-specific words that should be ignored in addition to the NLTK list
custom_stop_words = [
    'towards', 'eu', 'european', 'europe', 'still', 'scenario', 'increase',
    'increased', 'due', 'promoting', 'fostering', 'becomes', 'advanced',
    'reducing', 'ha', 'use', 'rise', 'led', 'lead', 'form', 'new', 'enable',
    'continue', 'widespread',
]

# Final stop-word set: NLTK's English list plus the custom additions.
# NOTE(review): the nltk.download('stopwords') call appears only further down
# in this notebook - confirm the corpus is already available when this runs.
stop_words = set(stopwords.words('english')) | set(custom_stop_words)

In [None]:
# Replace every token by its WordNet lemma, one scenario (row) at a time
lemmatizer = WordNetLemmatizer()
scenario['scenario_singlewords'] = scenario['scenario_singlewords'].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

In [None]:
# Flatten the per-scenario token lists into one corpus-wide list,
# leaving out stop words on the way
all_words = [
    token
    for tokens in scenario['scenario_singlewords']
    for token in tokens
    if token not in stop_words
]

In [None]:
# Apply the same stop-word filter to each scenario's own token list
scenario['scenario_singlewords'] = scenario['scenario_singlewords'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
scenario['scenario_singlewords']

In [None]:
unique_words = set(all_words)
count_unique_words = len(unique_words)
print(count_unique_words)

## Visualisieren

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Horizontal bar chart of the 20 most frequent words in `all_words`
# (the stop-word-filtered token list built above).
word_counts = Counter(all_words).most_common(20)
# Separate the (word, count) pairs into two parallel lists
words, frequencies = zip(*word_counts)
words = list(words)
frequencies = list(frequencies)

# Set seaborn style and plot the bar chart
sns.set(style="white", font='monospace')
plt.figure(figsize=(12,8), dpi=300)
sns.barplot(x=frequencies, y=words, color='black') #45464c #192633 #7a96b3

# Write each bar's count just to the right of the bar
for i, v in enumerate(frequencies):
    plt.text(v + 0.2, i, str(v), color='black')

# Leave room on the right so the count labels are not clipped
plt.xlim(0, max(frequencies) + 10)

# Set the title and axis labels
plt.title('20 Most Used Terms in GPT-3-Made Scenarios', fontsize = 16)
plt.xlabel('Frequency')
# NOTE(review): N is hard-coded here; presumably a corpus size taken from an
# earlier run - confirm it still matches the current data.
plt.ylabel('Words  (N = 2813)')

# Remove spines
sns.despine(trim=True)

plt.show()

# Sentiment Analyse

In [None]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from huggingface_hub import notebook_login
from transformers import AutoTokenizer
from transformers import pipeline
from datasets import load_metric
from tqdm import tqdm

In [None]:
unique_words=list(unique_words)
#unique_words

In [None]:
classifier = pipeline("sentiment-analysis",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)
# Score every scenario text: one prediction per row of `scenario`, in row
# order. The scores are later joined back onto the scenarios and their years,
# so they must come from the scenarios themselves and not from the unordered
# set of unique words.
# truncation=True is forwarded to the tokenizer so that scenarios longer than
# the model's maximum input length are cut instead of raising an error.
prediction = classifier(scenario['scenario'].tolist(), truncation=True)
print(prediction)

In [None]:
scenario['year']=scenario['year'].astype(int)

In [None]:
df = pd.DataFrame(prediction)
df.head()

In [None]:
def transform_df(df):
    """Flatten classifier output into one score column per emotion label.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per classified text. Every cell is a mapping with a
        ``'label'`` and a ``'score'`` entry (the layout produced by
        ``pd.DataFrame(prediction)`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per input row (index 0..n-1) with the columns
        ``sadness, joy, love, anger, fear, surprise`` first, followed by any
        other label that occurred in the input. Each cell holds the score of
        that label for that row.
    """
    labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = [
        {cell['label']: cell['score'] for cell in record}
        for record in df.itertuples(index=False, name=None)
    ]
    new_df = pd.DataFrame(rows)

    # Keep the six expected columns (in a fixed order) even for empty input,
    # and keep any unexpected labels after them instead of dropping them.
    extra_labels = [column for column in new_df.columns if column not in labels]
    return new_df.reindex(columns=labels + extra_labels)

new_df = transform_df(df)

In [None]:
new_df.head()

In [None]:
# Put year, scenario text and the six emotion scores side by side.
# `scenario` keeps its original row labels after the earlier dropna(), while
# `new_df` is numbered 0..n-1. concat(axis=1) matches rows by label, so both
# sides are renumbered first to pair them by position instead.
# Assumes new_df holds one row of scores per scenario, in scenario order.
ft_emotions = pd.concat(
    [
        scenario[['year', 'scenario']].reset_index(drop=True),
        new_df[['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']].reset_index(drop=True),
    ],
    axis=1,
)
ft_emotions.head()

In [None]:
# Drop every row that has a missing value in any column.
# (The original referenced `ss_emotions`, which is not defined in this notebook.)
ft_emotions = ft_emotions.dropna()
ft_emotions

In [None]:
#ft_emotions.to_csv('gpt3_scenarios_sentiment_analysis_full.csv', index = True)

## Visualize

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let the notebook container use the full browser width
display(HTML("<style>.container { width:100% !important; }</style>"))
import matplotlib as mpl

In [None]:
def visualize_grouped_sentiments(ft_emotions):
    """Draw a stacked bar chart of the mean score of each emotion per year.

    Parameters
    ----------
    ft_emotions : pandas.DataFrame
        Must contain the columns ``scenario``, ``year`` and the six emotion
        score columns ``sadness, joy, love, anger, fear, surprise``.

    Side effects: changes the seaborn palette and style and matplotlib's
    global font family, then shows the figure.
    """
    emotion_columns = ["sadness", "joy", "love", "anger", "fear", "surprise"]
    stacking_order = ["anger", "fear", "sadness", "surprise", "joy", "love"]

    # Long format: one row per (scenario, year, emotion) with its score
    long_form = ft_emotions.melt(id_vars=["scenario", "year"], value_vars=emotion_columns)

    # Mean score per year (rows) and emotion (columns)
    yearly_means = pd.pivot_table(
        long_form, index='year', columns='variable', values='value', aggfunc='mean'
    )

    # Fix the order in which the emotions are stacked
    yearly_means = yearly_means.loc[:, stacking_order]

    # Greyscale palette, plain white background, monospace font
    sns.set_palette("Greys_r", n_colors=6)
    sns.set_style("white")
    mpl.rcParams['font.family'] = 'monospace'

    # One stacked bar per year
    figure, axes = plt.subplots(figsize=(12, 9), dpi=300)
    yearly_means.plot(kind='bar', stacked=True, ax=axes)

    # Title, axis labels, legend and tick size
    axes.set_title("Sentiment Scores per Year: GPT-3-Made Scenarios", fontsize=16)
    axes.set_xlabel("Scenario of the Year", fontsize=12)
    axes.set_ylabel("Mean Sentiment Score per Emotion", fontsize=12)
    axes.legend(title='Sentiment Label', fontsize=6)
    axes.tick_params(axis='both', which='major', labelsize=12)

    # Trim the axis spines
    sns.despine(trim=True)

    plt.show()
    
visualize_grouped_sentiments(ft_emotions)

## Comparing both, Human-Made and Machine-Made Scenarios

In [None]:
hs_emotions = pd.read_csv('../Szenario/humanmade_scenarios_sentiment_analysis_full.csv')

In [None]:
hs_emotions = hs_emotions.drop(hs_emotions[(hs_emotions.year == 2035) | (hs_emotions.year == 2045)].index)

In [None]:
hs_emotions = hs_emotions.dropna()
hs_emotions = hs_emotions.drop('Unnamed: 0', axis = 1)
hs_emotions.head()

In [None]:
s35_emotions = pd.read_csv('../Szenario/gpt35_scenarios_sentiment_analysis_full.csv')

In [None]:
s35_emotions = s35_emotions.dropna()
s35_emotions = s35_emotions.drop('Unnamed: 0', axis = 1)
s35_emotions.head()

In [None]:
s4_emotions = pd.read_csv('../Szenario/gpt4_scenarios_sentiment_analysis_full.csv')

In [None]:
s4_emotions = s4_emotions.dropna()
s4_emotions = s4_emotions.drop('Unnamed: 0', axis = 1)
s4_emotions.head()

In [None]:
def visualize_grouped_sentiments(hs_emotions, ft_emotions, s35_emotions, s4_emotions):
    """Plot stacked mean emotion scores per year for four scenario sources.

    This redefines the single-frame function of the same name that appears
    earlier in the notebook.

    Parameters
    ----------
    hs_emotions, ft_emotions, s35_emotions, s4_emotions : pandas.DataFrame
        Each must contain the columns ``scenario``, ``year`` and the six
        emotion score columns ``sadness, joy, love, anger, fear, surprise``.
        They are labelled "Human", "GPT-3", "GPT-3.5" and "GPT-4" in the plot.

    Side effects: changes the seaborn palette and shows the figure.
    """
    emotion_columns = ["sadness", "joy", "love", "anger", "fear", "surprise"]
    stacking_order = ["anger", "fear", "sadness", "surprise", "joy", "love"]
    sources = {
        "Human": hs_emotions,
        "GPT-3": ft_emotions,
        "GPT-3.5": s35_emotions,
        "GPT-4": s4_emotions,
    }

    # For every source: melt to long format, average per year and emotion,
    # and fix the stacking order of the emotion columns.
    yearly_means = []
    for frame in sources.values():
        long_form = pd.melt(frame, id_vars=["scenario", "year"], value_vars=emotion_columns)
        per_year = long_form.pivot_table(index='year', columns='variable', values='value', aggfunc='mean')
        yearly_means.append(per_year[stacking_order])

    # Stack the four tables; the outer index level names the data source
    pivot_df = pd.concat(yearly_means, axis=0, keys=list(sources))

    # Greyscale palette for the six emotions
    sns.set_palette("Greys_r", n_colors=6)

    # One stacked bar per (source, year) pair
    fig, ax = plt.subplots(figsize=(10, 8), dpi=300)
    pivot_df.plot(kind='bar', stacked=True, ax=ax)

    # Add labels and titles
    ax.set_title("Sentiment Scores per Year: Human- vs. Machine-Made Scenarios", fontsize=16)
    ax.set_xlabel("Year", fontsize=12)
    ax.set_ylabel("Mean Sentiment Score per Emotion", fontsize=10)
    ax.legend(title='Sentiment Label', fontsize=10)
    ax.tick_params(axis='both', which='major', labelsize=12)

    # Remove spines
    sns.despine(trim=True)

    # Build "<source>, <year>" tick labels straight from the index. Only the
    # year part has a trailing ".0" stripped; splitting the whole label on '.'
    # would cut "GPT-3.5, <year>" down to "GPT-3".
    tick_labels = [
        f"{source}, {str(year).split('.')[0]}" for source, year in pivot_df.index
    ]
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    plt.show()

In [None]:
visualize_grouped_sentiments(hs_emotions, ft_emotions, s35_emotions, s4_emotions)

## T-Test per Sentiment

In [None]:
from scipy.stats import ttest_ind

In [None]:
# Welch's t-test (equal_var=False) on the 'sadness' scores:
# hs_emotions vs. ft_emotions
hs_sadness, ft_sadness = hs_emotions['sadness'], ft_emotions['sadness']
t_stat, p_val = ttest_ind(hs_sadness, ft_sadness, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Two-sample t-test with equal_var=False on the 'sadness' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_sadness = hs_emotions['sadness']
ss_sadness = ss_emotions['sadness']

t_stat, p_val = ttest_ind(hs_sadness, ss_sadness, equal_var=False)

print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'sadness' scores:
# hs_emotions vs. s35_emotions
hs_sadness, s35_sadness = hs_emotions['sadness'], s35_emotions['sadness']
t_stat, p_val = ttest_ind(hs_sadness, s35_sadness, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Two-sample t-test with equal_var=False on the 'fear' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_fear = hs_emotions['fear']
ss_fear = ss_emotions['fear']

t_stat, p_val = ttest_ind(hs_fear, ss_fear, equal_var=False)

print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'fear' scores:
# hs_emotions vs. ft_emotions
hs_fear, ft_fear = hs_emotions['fear'], ft_emotions['fear']
t_stat, p_val = ttest_ind(hs_fear, ft_fear, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'fear' scores:
# hs_emotions vs. s35_emotions
hs_fear, s35_fear = hs_emotions['fear'], s35_emotions['fear']
t_stat, p_val = ttest_ind(hs_fear, s35_fear, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Two-sample t-test with equal_var=False on the 'anger' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_anger = hs_emotions['anger']
ss_anger = ss_emotions['anger']

t_stat, p_val = ttest_ind(hs_anger, ss_anger, equal_var=False)

print("P-value: ", p_val)

P-value:  0.00000005174194068414625

In [None]:
# Welch's t-test (equal_var=False) on the 'anger' scores:
# hs_emotions vs. ft_emotions
hs_anger, ft_anger = hs_emotions['anger'], ft_emotions['anger']
t_stat, p_val = ttest_ind(hs_anger, ft_anger, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'anger' scores:
# hs_emotions vs. s35_emotions.
# This cell used to repeat the hs-vs-ss_emotions anger test verbatim (and
# `ss_emotions` is not defined in this notebook); anger was the only emotion
# without a comparison against s35_emotions, so that comparison is run here.
hs_anger = hs_emotions['anger']
s35_anger = s35_emotions['anger']

t_stat, p_val = ttest_ind(hs_anger, s35_anger, equal_var=False)

print("P-value: ", p_val)

In [None]:
# Two-sample t-test with equal_var=False on the 'surprise' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_surprise = hs_emotions['surprise']
ss_surprise = ss_emotions['surprise']

t_stat, p_val = ttest_ind(hs_surprise, ss_surprise, equal_var=False)

print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'surprise' scores:
# hs_emotions vs. ft_emotions
hs_surprise, ft_surprise = hs_emotions['surprise'], ft_emotions['surprise']
t_stat, p_val = ttest_ind(hs_surprise, ft_surprise, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'surprise' scores:
# hs_emotions vs. s35_emotions
hs_surprise, s35_surprise = hs_emotions['surprise'], s35_emotions['surprise']
t_stat, p_val = ttest_ind(hs_surprise, s35_surprise, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'joy' scores:
# hs_emotions vs. ft_emotions
hs_joy, ft_joy = hs_emotions['joy'], ft_emotions['joy']
t_stat, p_val = ttest_ind(hs_joy, ft_joy, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Two-sample t-test with equal_var=False on the 'joy' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_joy = hs_emotions['joy']
ss_joy = ss_emotions['joy']

t_stat, p_val = ttest_ind(hs_joy, ss_joy, equal_var=False)

print("P-value: ", p_val)

P-value:  0.0000000000006897228642655168

In [None]:
# Welch's t-test (equal_var=False) on the 'joy' scores:
# hs_emotions vs. s35_emotions
hs_joy, s35_joy = hs_emotions['joy'], s35_emotions['joy']
t_stat, p_val = ttest_ind(hs_joy, s35_joy, equal_var=False)
print("P-value: ", p_val)

P-value:  0.000010639088602384985

In [None]:
# Two-sample t-test with equal_var=False on the 'love' scores:
# hs_emotions vs. ss_emotions.
# NOTE(review): `ss_emotions` is never assigned in the visible part of this
# notebook (only hs_/ft_/s35_/s4_emotions are), so this cell raises NameError
# unless it is defined elsewhere - confirm which dataset was intended.
hs_love = hs_emotions['love']
ss_love = ss_emotions['love']

t_stat, p_val = ttest_ind(hs_love, ss_love, equal_var=False)

print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'love' scores:
# hs_emotions vs. s35_emotions
hs_love, s35_love = hs_emotions['love'], s35_emotions['love']
t_stat, p_val = ttest_ind(hs_love, s35_love, equal_var=False)
print("P-value: ", p_val)

In [None]:
# Welch's t-test (equal_var=False) on the 'love' scores:
# hs_emotions vs. ft_emotions
hs_love, ft_love = hs_emotions['love'], ft_emotions['love']
t_stat, p_val = ttest_ind(hs_love, ft_love, equal_var=False)
print("P-value: ", p_val)

# LDA

Once the LDA model is trained, the topics can be visualized for interpretability.
To do so, I'll use pyLDAvis, a popular visualization package for exploring topic models interactively.

You can manually select each topic to view its most frequent and/or most “relevant” terms, using
different values of the λ parameter. This helps when you're trying to assign a
human-interpretable name or “meaning” to each topic.

In [None]:
import gensim.corpora as corpora
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
titles = list(scenario['scenario_singlewords'])

In [None]:
def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences`` with gensim.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item.
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )
        
def remove_stopwords(texts):
    """Tokenise every document and drop the tokens found in ``stop_words``.

    Each document is converted with ``str()`` before it is handed to
    ``simple_preprocess``, so a document that is itself a list of words is
    tokenised from its string representation. Returns one list of kept tokens
    per document, in input order. Relies on the notebook-level ``stop_words``.
    """
    cleaned_docs = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        cleaned_docs.append([token for token in tokens if token not in stop_words])
    return cleaned_docs

data_words = remove_stopwords(titles)

print(data_words[:1][0][-1])

In [None]:
# The tokenised documents that feed both the dictionary and the corpus
texts = data_words

# Token <-> integer id mapping over all documents
id2word = corpora.Dictionary(data_words)

# Bag-of-words representation: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Show the last (token id, count) pair of the first document
print(corpus[0][-1])

In [None]:
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model.
# random_state fixes the random initialisation so that re-running the cell
# does not produce a completely different set of topics each time.
# NOTE(review): with several worker processes, small run-to-run differences
# may remain - confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)

# Print the top keywords of each topic
pprint(lda_model.print_topics())
# Per-document topic distribution for the training corpus
doc_lda = lda_model[corpus]

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [None]:
vis = gensimvis.prepare(lda_model, corpus, id2word, mds="mmds", R=20)
vis

## Keyword Extraktion

In [None]:
from rake_nltk import Rake
import itertools

In [None]:
flat_list = list(itertools.chain.from_iterable(titles))

In [None]:
text = ', '.join(flat_list)

In [None]:
# Run RAKE over the joined token string and tally the phrases it returns
r = Rake()
r.extract_keywords_from_text(text)
# Phrases as returned by get_ranked_phrases()
keywords = r.get_ranked_phrases()
# Count how often each phrase occurs in that list; this rebinds `word_counts`,
# which earlier in the notebook held the top-20 word frequencies
word_counts = Counter(keywords)
print(word_counts)