In [1]:
#Installation der notwendigen Bibliotheken:
!pip install pandas spacy scikit-learn gensim numpy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
      --------------------------------------- 0.2/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 2.6 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 3.9 MB/s eta 0:00:04
     ---- ----------------------------------- 1.5/12.8 MB 5.1 MB/s eta 0:00:03
     ------- -------------------------------- 2.3/12.8 MB 7.0 MB/s eta 0:00:02
     -------- ------------------------------- 2.8/12.8 MB 7.8 MB/s eta 0:00:02
     ------------ --------------------------- 4.0/12.8 MB 9.8 MB/s eta 0:00:01
     --------------- ------------------------ 4

In [2]:
#Import der Bibliotheken & des spacy-Packets lt. Output
import pandas as pd
import re
import spacy
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation as LDA
from collections import Counter

nlp = spacy.load('en_core_web_sm')

In [3]:
#Kontrolle und Anpassen der Stoppwörter:
print("Original Stoppwörter:")
print(nlp.Defaults.stop_words)

Original Stoppwörter:
{'at', 'herein', 'itself', 'himself', 'twenty', 'below', 'ever', 'from', 'all', 'have', 'get', '’ve', 'he', 'behind', 'very', 'about', 'when', 'hers', 'same', 'toward', 'been', 'until', 'once', 'themselves', 'each', 'a', 'then', 'between', 'but', 'yourself', 'three', 'serious', 'part', 'eleven', 'yours', 'down', 'enough', 'against', 'since', 'beforehand', 'seeming', 'always', 'except', 'seemed', 'herself', 'no', 'anyhow', 'so', 'using', 'regarding', 'noone', 'is', 'hereupon', 'already', 'me', 'sixty', 'wherever', 'last', 'n’t', 'by', 'see', 'with', 'do', "n't", 'the', 'your', 'none', 'out', '‘d', 'latter', 'might', 'may', 'mine', 'am', 'ten', 'less', '’m', 'who', 'they', '’s', 'and', 'make', 'are', 'namely', 'top', 'was', 'whoever', 'name', 'again', 'amount', 'ca', 'him', 'five', 'perhaps', 'towards', 'across', 'still', 'us', 'such', 'became', 'has', 'four', 'any', 'third', 'elsewhere', 'indeed', 'only', 'beyond', 'front', 'or', 'could', 'say', 'seems', 'alone', '

In [4]:
remove_stop_words = {'no', 'not'}
nlp.Defaults.stop_words -= remove_stop_words

print("Stoppwörter:")
print(nlp.Defaults.stop_words)

Stoppwörter:
{'at', 'herein', 'itself', 'himself', 'twenty', 'below', 'ever', 'from', 'all', 'have', 'get', '’ve', 'he', 'behind', 'very', 'about', 'when', 'hers', 'same', 'toward', 'been', 'until', 'once', 'themselves', 'each', 'a', 'then', 'between', 'but', 'yourself', 'three', 'serious', 'part', 'eleven', 'yours', 'down', 'enough', 'against', 'since', 'beforehand', 'seeming', 'always', 'except', 'seemed', 'herself', 'anyhow', 'so', 'using', 'regarding', 'noone', 'is', 'hereupon', 'already', 'me', 'sixty', 'wherever', 'last', 'n’t', 'by', 'see', 'with', 'do', "n't", 'the', 'your', 'none', 'out', '‘d', 'latter', 'might', 'may', 'mine', 'am', 'ten', 'less', '’m', 'who', 'they', '’s', 'and', 'make', 'are', 'namely', 'top', 'was', 'whoever', 'name', 'again', 'amount', 'ca', 'him', 'five', 'perhaps', 'towards', 'across', 'still', 'us', 'such', 'became', 'has', 'four', 'any', 'third', 'elsewhere', 'indeed', 'only', 'beyond', 'front', 'or', 'could', 'say', 'seems', 'alone', 'whereas', 'real

In [5]:
#Vorverarbeitung des Textes:
def preprocess_text(text):
    #Textformat muss ein String sein:
    if not isinstance(text, str):
        text = ""
    #Sonderzeichen entfernen:
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    #Umwandlung in Kleinbuchstaben: 
    doc = nlp(text.lower())
    #Stoppwörter entfernen und Lemmatisierung:
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

In [9]:
#CSV-Datei einlesen & ausführen der definierten Vorverarbeitung:
csv_datei = 'amazon_reviews.csv'

try:
    df = pd.read_csv(csv_datei)
    print("Einlesen erfolgreich.")
except FileNotFoundError:
    print(f"Fehler: Datei unter '{csv_datei}' nicht gefunden.")
except pd.errors.ParserError:
    print("Fehler beim Parsen der CSV-Datei. Bitte überprüfen Sie das Dateiformat.")
except Exception as e:
    print(f"Ein unerwarteter Fehler ist aufgetreten: {e}")
    
if 'df' in locals():
    #Auswahl der Spalte 'reviewText'
    if 'reviewText' in df.columns:
        #Überprüfen und Bereinigen der Daten:
        df['reviewText'] = df['reviewText'].astype(str)
        df['cleaned_reviewText'] = df['reviewText'].apply(preprocess_text)
    else:
        print("Fehler")

Einlesen erfolgreich.


In [10]:
#Implementierung der BoW-Methode mittels scikit-Learn:
if 'cleaned_reviewText' in df.columns:
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(df['cleaned_reviewText'])
else:
    print("Fehler")

In [11]:
#Implementierung der BoW-Methode mittels numpy:
def bow_with_numpy(texts):
    #Erstellen des Vokabulars
    vocab = list(set(" ".join(texts).split()))
    vocab.sort()
    vocab_dict = {word: idx for idx, word in enumerate(vocab)}
    
    #Erstellen der BoW-Matrix
    bow_matrix = np.zeros((len(texts), len(vocab)))
    for i, text in enumerate(texts):
        word_count = Counter(text.split())
        for word, count in word_count.items():
            if word in vocab_dict:
                bow_matrix[i, vocab_dict[word]] = count
    return bow_matrix, vocab

if 'cleaned_reviewText' in df.columns:
    bow_matrix_np, vocab_np = bow_with_numpy(df['cleaned_reviewText'])
else:
    print("Fehler")

In [12]:
#Anzeigen der beiden erstellten Vektoren:
#scikit-Learn:
if 'X_bow' in locals():
    print("BoW-Vektoren (scikit-learn):")
    print(X_bow.toarray())
else:
    print("Fehler")

#numpy:
if 'bow_matrix_np' in locals():
    print("BoW-Vektoren (numpy):")
    print(bow_matrix_np)
else:
    print("Fehler")

BoW-Vektoren (scikit-learn):
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
BoW-Vektoren (numpy):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
#Funktion zum Anzeigen der Themen:
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        print(" ".join([words[i] for i in topic.argsort()[:-top_n - 1:-1]]))

#Häufigste Themen mittels LSA:
n_topics = 5

if 'X_bow' in locals():
    lsa_model = TruncatedSVD(n_components=n_topics, random_state=42)
    lsa_topic_matrix = lsa_model.fit_transform(X_bow)

    print("LSA Topics:")
    print_topics(lsa_model, vectorizer)
    
#Häufigste Themen mittels LDA:
    lda_model = LDA(n_components=n_topics, random_state=42)
    lda_topic_matrix = lda_model.fit_transform(X_bow)

    print("LDA Topics:")
    print_topics(lda_model, vectorizer)
else:
    print("Fehler")

LSA Topics:
Topic 0:
card gb phone sandisk work not sd memory no buy
Topic 1:
card class write mbs exfat test file mb result read
Topic 2:
phone card music restart problem play photo issue format try
Topic 3:
sandisk problem work card product issue gs star review replacement
Topic 4:
sandisk phone gb not product ultra class new speed problem
LDA Topics:
Topic 0:
card gb format surface fat device exfat pro storage file
Topic 1:
work great no tablet galaxy buy card use price memory
Topic 2:
card speed video fast write gb camera gopro class no
Topic 3:
card phone gb work not memory galaxy samsung sandisk sd
Topic 4:
card work not sandisk good buy price sd product great


In [22]:
#Funktion zum Zuweisen der Themen zu den Dokumenten:
def assign_topics(topic_matrix, n_top_documents=5):
    assignments = {}
    for topic_idx, topic in enumerate(topic_matrix.T):
        assignments[f'Topic {topic_idx}'] = []
        top_document_indices = topic.argsort()[:-n_top_documents - 1:-1]
        for doc_index in top_document_indices:
            assignments[f'Topic {topic_idx}'].append(df['cleaned_reviewText'].iloc[doc_index])
    return assignments

#Funktion zum Erstellen eines DataFrames:
def create_topic_dataframe(model, vectorizer, n_top_words=10):
    words = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(f"Topic {topic_idx + 1}: " + ", ".join(topic_words))
    return pd.DataFrame(topics, columns=["Top Words"])

if 'lsa_topic_matrix' in locals() and 'lda_topic_matrix' in locals():
    #Dokumentzuweisungen für LSA:
    print("Document assignments for LSA:")
    lsa_assignments = assign_topics(lsa_topic_matrix)
    lsa_assignments_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in lsa_assignments.items()]))
    display(lsa_assignments_df)

    #Dokumentzuweisungen für LDA:
    print("Document assignments for LDA:")
    lda_assignments = assign_topics(lda_topic_matrix)
    lda_assignments_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in lda_assignments.items()]))
    display(lda_assignments_df)

    #DataFrames für die Themen:
    lsa_topics_df = create_topic_dataframe(lsa_model, vectorizer)
    lda_topics_df = create_topic_dataframe(lda_model, vectorizer)

    print("LSA Topics:")
    display(lsa_topics_df)

    print("LDA Topics:")
    display(lda_topics_df)
else:
    print("Fehler")

Document assignments for LSA:


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4
0,note read update scroll m leave review s...,hii order card arrive day fast prime unfortuna...,monday november galaxy s format restart phon...,note read update scroll m leave review s...,note read update scroll m leave review s...
1,monday november galaxy s format restart phon...,test dozen sdhc microsdhc card disturb trend n...,buy use samsung galaxy s pop card mount format...,buy card originally galaxy note ii intention p...,update lovely wife buy samsung galaxy tab...
2,hii order card arrive day fast prime unfortuna...,buy card originally galaxy note ii intention p...,samsung galaxy note come gig internal memo...,buy wife android phone fail month figure...,look class microsd card order directly trans...
3,update lovely wife buy samsung galaxy tab...,purchase card sandisk packaging pretty sure ge...,update suppose last forever year card go bad...,update lovely wife buy samsung galaxy tab...,buy wife android phone fail month figure...
4,buy card originally galaxy note ii intention p...,buy gb version card use gopro hero black edi...,phone recognize card follow suggestion format ...,problem sd card prior good track record sandis...,test dozen sdhc microsdhc card disturb trend n...


Document assignments for LDA:


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4
0,read place work android tablet find card requi...,es una reconocida marca y la calidad de sus pr...,purchase ultra microsdhc class card medium p...,monday november galaxy s format restart phon...,buy card fail usage kicker favoris month r...
1,problem get format fat use myexcellent sansa c...,use small asus notepad little extra storage ...,description amazon page call class card uhs ...,buy card originally galaxy note ii intention p...,ve buy sandisk warranty reasoning lifetime war...
2,find write bandwidth card mbsec transfer stead...,micro disk fit samsung phone perfectly able us...,sandisk make lot claim card questionable highc...,update suppose last forever year card go bad...,know issue sandisk gopro hero camera pay atten...
3,old generation sandisk gb card bough lot fas...,let thing bang pocket key coin accidentally co...,highly recommend sandisk ultra gb microsdhc ...,nexus list support maximum gb sdcardbase rep...,buy separately frustration free package appare...
4,little work card excellent way inexpensivly up...,use show time fast came wrap small paper packe...,description confusing say time class mean ...,order gb gb version memory card write review a...,past ve buy lot cf sd card usb key year ago bu...


LSA Topics:


Unnamed: 0,Top Words
0,"Topic 1: card, gb, phone, sandisk, work, not, ..."
1,"Topic 2: card, class, write, mbs, exfat, test,..."
2,"Topic 3: phone, card, music, restart, problem,..."
3,"Topic 4: sandisk, problem, work, card, product..."
4,"Topic 5: sandisk, phone, gb, not, product, ult..."


LDA Topics:


Unnamed: 0,Top Words
0,"Topic 1: card, gb, format, surface, fat, devic..."
1,"Topic 2: work, great, no, tablet, galaxy, buy,..."
2,"Topic 3: card, speed, video, fast, write, gb, ..."
3,"Topic 4: card, phone, gb, work, not, memory, g..."
4,"Topic 5: card, work, not, sandisk, good, buy, ..."
