In [1]:
import numpy as np
import random
# Seed both global random number generators (Python's `random` and NumPy's
# legacy global RNG) with the same value so repeated runs are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
import pandas as pd
# Remote CSV file that is loaded into the `reviews` DataFrame.
REVIEWS_CSV_URL = "https://raw.githubusercontent.com/kbrennig/MODS_WS24_25/refs/heads/main/data/imdb_sample.csv"
reviews = pd.read_csv(REVIEWS_CSV_URL)

In [3]:
# Binary target: 1 where the human label is exactly 'positive', 0 otherwise.
is_positive = reviews['sentiment_human'] == 'positive'
reviews['sentiment_positive'] = np.where(is_positive, 1, 0)

In [4]:
from sklearn.model_selection import train_test_split

# Target vector and feature frame; the id column and both label columns are
# removed from the features.
y = reviews['sentiment_positive']
non_feature_columns = ['id', 'sentiment_human', 'sentiment_positive']
X = reviews.drop(columns=non_feature_columns)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook, in the same order as before.
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

def preprocess(text):
    """Tokenize a document and return its cleaned tokens.

    Steps, in order: NLTK word tokenization, WordNet lemmatization, removal of
    English stop words (compared case-insensitively), and stripping of every
    character that is neither a word character nor whitespace.

    Tokens that are empty after stripping (pure punctuation such as "," or
    "...") are dropped, so the result never contains empty strings. The
    original casing of the remaining tokens is kept.

    Args:
        text: Raw document text.

    Returns:
        list[str]: The cleaned, non-empty tokens in document order.
    """
    tokens = nltk.word_tokenize(text)

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # A set gives constant-time membership tests, and the distinct local name
    # avoids shadowing the imported `stopwords` corpus module.
    english_stopwords = set(nltk.corpus.stopwords.words("english"))
    filtered_tokens = [token for token in lemmatized_tokens if token.lower() not in english_stopwords]

    # Strip punctuation first and only then discard empties; checking for
    # emptiness before stripping would let punctuation-only tokens through
    # as '' entries.
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in filtered_tokens)
    return [token for token in stripped_tokens if token]


In [6]:
# Tokenize and clean the raw text of both splits with `preprocess`.
# `assign` builds a new DataFrame rather than writing a column into the frames
# returned by train_test_split; item assignment on those frames can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.assign(tokens=X_train['text'].apply(preprocess))
X_test = X_test.assign(tokens=X_test['text'].apply(preprocess))

In [7]:
# Domain-specific words to drop from the token lists (lower-case).
DOMAIN_STOPWORDS = frozenset({'movie', 'film'})


def remove_domain_stopwords(tokens, unwanted=DOMAIN_STOPWORDS):
    """Return `tokens` without the words listed in `unwanted`.

    The comparison is case-insensitive, so capitalized variants such as
    "Movie" or "Film" are removed as well; kept tokens retain their casing.

    Args:
        tokens: Iterable of token strings.
        unwanted: Lower-case words to remove.

    Returns:
        list[str]: The remaining tokens in their original order.
    """
    return [token for token in tokens if token.lower() not in unwanted]


# One shared helper for both splits; `assign` returns new frames instead of
# mutating the existing ones in place.
X_train = X_train.assign(tokens=X_train['tokens'].apply(remove_domain_stopwords))
X_test = X_test.assign(tokens=X_test['tokens'].apply(remove_domain_stopwords))

In [8]:
from gensim import corpora

# Build the vocabulary from the training tokens only, then prune it with
# filter_extremes(no_below=5).
train_token_lists = X_train['tokens']
dictionary = corpora.Dictionary(train_token_lists)
dictionary.filter_extremes(no_below=5)

# Convert every token list of both splits to bag-of-words form using the
# training vocabulary.
corpus_train = list(map(dictionary.doc2bow, train_token_lists))
corpus_test = list(map(dictionary.doc2bow, X_test['tokens']))

In [9]:
from gensim.models.ldamodel import LdaModel

# Number of topics for the LDA model.
k = 12

# Fit LDA on the training corpus; the fixed random_state keeps runs repeatable.
model_12 = LdaModel(
    corpus=corpus_train,
    id2word=dictionary,
    num_topics=k,
    iterations=100,
    random_state=42,
)


In [10]:
def get_document_topic_distribution(model, corpus):
    """Return the per-document topic probabilities as a DataFrame.

    Each probability is placed in its column by topic id, so a topic that the
    model leaves out of a document's result shows up as 0.0 instead of
    shifting the remaining values into the wrong columns.

    Args:
        model: Topic model exposing `num_topics` and
            `get_document_topics(doc, minimum_probability=...)`, which yields
            `(topic_id, probability)` pairs.
        corpus: Iterable of documents in the form the model expects.

    Returns:
        pandas.DataFrame: One row per document (default integer index, in
        corpus order) and one column per topic, named 'Topic1' to 'Topic<k>'.
    """
    topic_columns = [f'Topic{i+1}' for i in range(model.num_topics)]

    rows = []
    for doc in corpus:
        # Start from all zeros so that omitted topics stay at 0.0.
        row = [0.0] * model.num_topics
        for topic_id, prob in model.get_document_topics(doc, minimum_probability=0):
            row[topic_id] = prob
        rows.append(row)

    return pd.DataFrame(rows, columns=topic_columns)

# Topic distributions of the training and test documents under model_12.
train_topic_distributions, test_topic_distributions = (
    get_document_topic_distribution(model_12, split_corpus)
    for split_corpus in (corpus_train, corpus_test)
)

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the topic distributions of the training documents.
forest = RandomForestClassifier(random_state=42)
rf_topicmodel = forest.fit(train_topic_distributions, y_train)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

# Probability of the second class column (index 1) for every test document.
test_class_probabilities = rf_topicmodel.predict_proba(test_topic_distributions)
predictions_testset_rf_topicmodel = test_class_probabilities[:, 1]

# Hard labels: 1 where the probability is strictly above 0.5, else 0.
predictions_testset_rf_topicmodel_binary = (predictions_testset_rf_topicmodel > 0.5).astype(int)

accuracy_rf = accuracy_score(y_test, predictions_testset_rf_topicmodel_binary)
print("Accuracy (Random Forests):", accuracy_rf)

# Confusion matrix of the hard labels against the true test labels.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    predictions_testset_rf_topicmodel_binary,
)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay

# Area under the ROC curve, computed from the predicted probabilities rather
# than the thresholded labels.
auc_score = roc_auc_score(
    y_test,
    predictions_testset_rf_topicmodel,
)
print("AUC Score:", auc_score)

# ROC curve with the chance-level line drawn for reference.
RocCurveDisplay.from_predictions(
    y_test,
    predictions_testset_rf_topicmodel,
    plot_chance_level=True,
)