In [1]:
import re
import string
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

file_path = "reviews_data.csv"
df = pd.read_csv(file_path)

# Hier wird der Text vorbereitet
def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    return " ".join(stemmed_tokens)

df = df.dropna(subset=['Rating'])
cleaned_reviews = df['Review'].apply(preprocess_text)
ratings = df['Rating']

new_df = pd.DataFrame({'Cleaned_Reviews': cleaned_reviews, 'Ratings': ratings})

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


# Aufteilung in Trainings- und Testdaten
X = new_df['Cleaned_Reviews']
y = new_df['Ratings']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vektorisierung der Textdaten 
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Naive Bayes Classifier wird trainiert
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Predictions werden gemacht
y_pred = clf.predict(X_test_vec)

# Accuracy Score wird berechnet
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.6524822695035462


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report


# Tokenisierung und Padding der Textdaten
max_words = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# Erstellung und Training des LSTM-Modells
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=5, batch_size=32)

# Predictions werden erstellt
y_pred = model.predict(X_test_padded)
y_pred = np.round(y_pred).flatten().astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")


print(classification_report(y_test, y_pred))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy Score: 0.5957446808510638
              precision    recall  f1-score   support

         1.0       0.60      1.00      0.75        84
         2.0       0.00      0.00      0.00        19
         3.0       0.00      0.00      0.00         8
         4.0       0.00      0.00      0.00         8
         5.0       0.00      0.00      0.00        22

    accuracy                           0.60       141
   macro avg       0.12      0.20      0.15       141
weighted avg       0.35      0.60      0.44       141



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
