In [1]:
#Import der Bibliotheken & des spacy-Packets
import pandas as pd
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA, TruncatedSVD
from collections import Counter

nlp = spacy.load('en_core_web_sm')

In [2]:
#Kontrolle und Anpassen der Stoppwörter:
print("Original Stoppwörter:")
print(nlp.Defaults.stop_words)

Original Stoppwörter:
{'two', 'empty', 'eleven', 'done', 'often', 'seems', 'did', 'up', 'upon', 'serious', 'forty', 'thereby', 'always', 'behind', 'whereafter', 'on', 'than', 'themselves', 'via', 'less', 'were', 'least', 'whom', 'whereby', 'over', 'noone', "'d", 'various', 'part', 'even', 'take', 'per', 'at', 'anything', 'becomes', 'where', 'own', 'last', 'either', 'since', 'whereas', 'your', 'both', 'there', 'indeed', 'does', 'of', '‘re', 'front', 'same', 'four', 'twenty', 'would', 'after', 'been', 'go', 'should', 'nevertheless', 'no', 'full', 'hereupon', 'those', 'whatever', 'between', 'none', 'anyway', 'thereupon', 'beforehand', 'whether', 'others', 'us', 'an', 'to', 'nine', 'beyond', 'it', 'regarding', "n't", 'whoever', 'every', 'mine', 'within', 'onto', 'whereupon', 'well', 'nor', 'its', 'everyone', 'what', 'meanwhile', '’ll', 'very', 'as', '’re', 'amount', 'ours', 'may', 'why', 'top', 'the', 'however', 'will', "'re", 'him', 'used', 'ourselves', 'further', 'also', 'made', 'across'

In [3]:
additional_stop_words = {'card', 'sandisk', 'phone'}
nlp.Defaults.stop_words |= additional_stop_words

remove_stop_words = {'no', 'not'}
nlp.Defaults.stop_words -= remove_stop_words

print("Stoppwörter:")
print(nlp.Defaults.stop_words)

Stoppwörter:
{'two', 'empty', 'eleven', 'done', 'often', 'seems', 'did', 'up', 'upon', 'serious', 'forty', 'thereby', 'always', 'behind', 'whereafter', 'on', 'than', 'themselves', 'via', 'less', 'were', 'least', 'whom', 'whereby', 'over', 'noone', "'d", 'various', 'part', 'even', 'take', 'per', 'at', 'anything', 'becomes', 'where', 'own', 'last', 'either', 'since', 'whereas', 'your', 'both', 'there', 'indeed', 'does', 'of', '‘re', 'front', 'same', 'four', 'twenty', 'would', 'after', 'been', 'go', 'should', 'nevertheless', 'full', 'hereupon', 'those', 'whatever', 'between', 'none', 'anyway', 'thereupon', 'beforehand', 'whether', 'others', 'us', 'an', 'to', 'nine', 'beyond', 'it', 'regarding', "n't", 'whoever', 'every', 'mine', 'within', 'onto', 'whereupon', 'well', 'nor', 'its', 'everyone', 'what', 'meanwhile', '’ll', 'very', 'as', '’re', 'amount', 'ours', 'may', 'why', 'top', 'the', 'however', 'will', "'re", 'him', 'used', 'ourselves', 'further', 'also', 'made', 'sandisk', 'across', 'h

In [4]:
#Vorverarbeitung des Textes:
def preprocess_text(text):
    #Textformat muss ein String sein:
    if not isinstance(text, str):
        text = ""
    #Sonderzeichen entfernen:
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #Umwandlung in Kleinbuchstaben: 
    doc = nlp(text.lower())
    #Stoppwörter entfernen und Lemmatisierung:
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

In [5]:
#CSV-Datei einlesen & ausführen der definierten Vorverarbeitung:
csv_datei = 'amazon_reviews.csv'

try:
    df = pd.read_csv(csv_datei)
    print("Einlesen erfolgreich.")
except FileNotFoundError:
    print(f"Fehler: Datei unter '{csv_datei}' nicht gefunden.")
except pd.errors.ParserError:
    print("Fehler beim Parsen der CSV-Datei. Bitte überprüfen Sie das Dateiformat.")
except Exception as e:
    print(f"Ein unerwarteter Fehler ist aufgetreten: {e}")
    
if 'df' in locals():
    #Auswahl der Spalte 'reviewText'
    if 'reviewText' in df.columns:
        #Überprüfen und Bereinigen der Daten:
        df['reviewText'] = df['reviewText'].astype(str)
        df['cleaned_reviewText'] = df['reviewText'].apply(preprocess_text)
    else:
        print("Fehler")

Einlesen erfolgreich.


In [6]:
#Implementierung der BoW-Methode mittels scikit-Learn:
if 'cleaned_reviewText' in df.columns:
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(df['cleaned_reviewText'])
else:
    print("Fehler")

In [7]:
#Implementierung der BoW-Methode mittels numpy:
def bow_with_numpy(texts):
    #Erstellen des Vokabulars
    vocab = list(set(" ".join(texts).split()))
    vocab.sort()
    vocab_dict = {word: idx for idx, word in enumerate(vocab)}
    
    #Erstellen der BoW-Matrix
    bow_matrix = np.zeros((len(texts), len(vocab)))
    for i, text in enumerate(texts):
        word_count = Counter(text.split())
        for word, count in word_count.items():
            if word in vocab_dict:
                bow_matrix[i, vocab_dict[word]] = count
    return bow_matrix, vocab

if 'cleaned_reviewText' in df.columns:
    bow_matrix_np, vocab_np = bow_with_numpy(df['cleaned_reviewText'])
else:
    print("Fehler")

In [8]:
#Anzeigen der beiden erstellten Vektoren:
#scikit-Learn:
if 'X_bow' in locals():
    print("BoW-Vektoren (scikit-learn):")
    print(X_bow.toarray())
else:
    print("Fehler")

#numpy:
if 'bow_matrix_np' in locals():
    print("BoW-Vektoren (numpy):")
    print(bow_matrix_np)
else:
    print("Fehler")

BoW-Vektoren (scikit-learn):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
BoW-Vektoren (numpy):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
#Implementierung der Tf-idf-Methode:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['reviewText'])

In [10]:
#Funktion zum Anzeigen der Themen:
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        topic_words = [words[i] for i in topic.argsort()[:-top_n - 1:-1] if i < len(words)]
        print(" ".join(topic_words)) 

#Häufigste Themen mittels LSA:
n_topics = 5

if 'tfidf_matrix' in locals():
    lsa_model = TruncatedSVD(n_components=n_topics, random_state=42)
    lsa_topic_matrix = lsa_model.fit_transform(tfidf_matrix)

    print("LSA Topics:")
    print_topics(lsa_model, vectorizer)
    
#Häufigste Themen mittels LDA:
    lda_model = LDA(n_components=n_topics, random_state=42)
    lda_topic_matrix = lda_model.fit_transform(tfidf_matrix)

    print("LDA Topics:")
    print_topics(lda_model, vectorizer)
else:
    print("Fehler")

LSA Topics:
Topic 0:
videoi mobiledelivery card wav photographic defect vivotab immense mistake maxx
Topic 1:
job photographic mobiledelivery plier intentionally card immense spend rez
Topic 2:
mobiledelivery st wav formatea dd fore policythe unneeded
Topic 3:
mistake itndeliver retch job splurge st rightyou
Topic 4:
plier kit mo rez hide laptoptransfer thetranscend mobiledelivery cardonce
LDA Topics:
Topic 0:
limitenjoy readily minicamcorder computation sandiskfrom asus succede outgrow computerdont
Topic 1:
mobiledelivery videoi card wav photographic vivotab defect immense maxx prepard
Topic 2:
kitten consumer company foray predictably mbsthe issuesnot heyfer benchmarksetting insert
Topic 3:
videoi mobiledelivery card mistake immense splurge defect vivotab photographic
Topic 4:
gti bend expound gate ui experiment daughter cect waswhile


In [11]:
#Funktion zum Zuweisen der Themen zu den Dokumenten:
def assign_topics(topic_matrix, n_top_documents=5):
    assignments = {}
    for topic_idx, topic in enumerate(topic_matrix.T):
        assignments[f'Topic {topic_idx}'] = []
        top_document_indices = topic.argsort()[:-n_top_documents - 1:-1]
        for doc_index in top_document_indices:
            assignments[f'Topic {topic_idx}'].append(df['cleaned_reviewText'].iloc[doc_index])
    return assignments

#Funktion zum Erstellen eines DataFrames:
def create_topic_dataframe(model, vectorizer, n_top_words=10):
    words = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1] if i < len(words)]
        topics.append(f"Topic {topic_idx + 1}: " + ", ".join(topic_words))
    return pd.DataFrame(topics, columns=["Top Words"])

if 'lsa_topic_matrix' in locals() and 'lda_topic_matrix' in locals():
    #Dokumentzuweisungen für LSA:
    print("Dokumentenzuweisungen für LSA:")
    lsa_assignments = assign_topics(lsa_topic_matrix)
    lsa_assignments_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in lsa_assignments.items()]))
    display(lsa_assignments_df)

    #Dokumentzuweisungen für LDA:
    print("Dokumentenzuweisungen für LDA:")
    lda_assignments = assign_topics(lda_topic_matrix)
    lda_assignments_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in lda_assignments.items()]))
    display(lda_assignments_df)

    #DataFrames für die Themen:
    lsa_topics_df = create_topic_dataframe(lsa_model, vectorizer)
    lda_topics_df = create_topic_dataframe(lda_model, vectorizer)

    print("LSA Topics:")
    display(lsa_topics_df)

    print("LDA Topics:")
    display(lda_topics_df)
else:
    print("Fehler")

Dokumentenzuweisungen für LSA:


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4
0,note read update scroll m leave review s...,work great,work perfect no complaint fast s buy say go,memory work price good size,no issue
1,space droid razr gb microsd slot figure d pick...,work great galaxy s,work like work like buy thinking work,capacity price not wrong m sure class recomm...,no issue
2,buy originally galaxy note ii intention primar...,galaxy s work great ve no problem highly rec...,buy samsung galaxy siii work fine suppose,work no problem good price brand sd buy,no problem
3,monday november galaxy s format restart show...,use samsung galaxy cell work great good va...,disk work happy,great price work great want storage reason...,buy multiple problem recognize far no problem
4,update lovely wife buy samsung galaxy tab...,use gopro hero camera work great no issue pe...,hard believe memory store small chip,work great,month no problem awesome thank


Dokumentenzuweisungen für LDA:


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4
0,este fue una buena compras ha sido todo lo que...,note read update scroll m leave review s...,comore esta memorka para completar mi nueva sa...,video recording device surveillance carbase dv...,es un producto que recomiendo ampliamente solo...
1,es una tarjeta micro guarda todas la caracteri...,get wife samsung galaxy tab christmas take ...,es una reconocida marca y la calidad de sus pr...,not reliable handle hd video gopro hero glit...,la uso sul mio samsung galaxy tab e va beni...
2,muy bueno realmente es rapido como dice mi gal...,buy originally galaxy note ii intention primar...,fast giv work great,ve try card go compact flash card rule single ...,oh gosh great invention slice bread total unde...
3,product expectedcrystaldiskmark x c hiyohi...,primary reason buy extra capacitysandisk ultra...,bouthg memory contry store sele good product b...,mobius action camera p hd mini sport camdash c...,
4,yes,announcement gb micro sd take internet storm p...,buy memory replace gb memory zte valey phonean...,format sdformatter pc test sony vaio cd sz v...,excelent


LSA Topics:


Unnamed: 0,Top Words
0,"Topic 1: videoi, mobiledelivery, card, wav, ph..."
1,"Topic 2: job, photographic, mobiledelivery, pl..."
2,"Topic 3: mobiledelivery, st, wav, formatea, dd..."
3,"Topic 4: mistake, itndeliver, retch, job, splu..."
4,"Topic 5: plier, kit, mo, rez, hide, laptoptran..."


LDA Topics:


Unnamed: 0,Top Words
0,"Topic 1: limitenjoy, readily, minicamcorder, c..."
1,"Topic 2: mobiledelivery, videoi, card, wav, ph..."
2,"Topic 3: kitten, consumer, company, foray, pre..."
3,"Topic 4: videoi, mobiledelivery, card, mistake..."
4,"Topic 5: gti, bend, expound, gate, ui, experim..."
