In [53]:
import pandas as pd
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from scipy.stats import randint, uniform
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv('D:/Git/phase_4/Hades_reviews.csv')

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457440 entries, 0 to 457439
Data columns (total 27 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unnamed: 0                      457440 non-null  int64  
 1   query_summary                   0 non-null       float64
 2   cursors                         0 non-null       float64
 3   recommendationid                228720 non-null  float64
 4   language                        228720 non-null  object 
 5   review                          228017 non-null  object 
 6   timestamp_created               228720 non-null  float64
 7   timestamp_updated               228720 non-null  float64
 8   voted_up                        228720 non-null  object 
 9   votes_up                        228720 non-null  float64
 10  votes_funny                     228720 non-null  float64
 11  weighted_vote_score             228720 non-null  float64
 12  comment_count   

In [47]:
#Drop Nulls
df = df.dropna(subset=['review'])

#Keep only English reviews
df = df[df['language'] == 'english']

# Drop Unnecessary Columns
df = df.drop(df.columns[[0, 1, 2, 3, 4, 6, 7, 16, 17, 18]], axis=1)

# Create a mask where each review has more than 5 words and at least one alphabetic character
mask = df['review'].apply(lambda x: len(re.findall(r'\b\w+\b', str(x))) > 5 and bool(re.search('[a-zA-Z]', str(x))))

# Apply the mask to the DataFrame to filter out review
df = df[mask]

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76744 entries, 228720 to 457437
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   review                          76744 non-null  object 
 1   voted_up                        76744 non-null  object 
 2   votes_up                        76744 non-null  float64
 3   votes_funny                     76744 non-null  float64
 4   weighted_vote_score             76744 non-null  float64
 5   comment_count                   76744 non-null  float64
 6   steam_purchase                  76744 non-null  object 
 7   received_for_free               76744 non-null  object 
 8   written_during_early_access     76744 non-null  object 
 9   author.num_games_owned          76744 non-null  float64
 10  author.num_reviews              76744 non-null  float64
 11  author.playtime_forever         76744 non-null  float64
 12  author.playtime_last_two_w

In [49]:
df.head()

Unnamed: 0,review,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,author.playtime_at_review,author.last_played,timestamp_dev_responded,developer_response
228720,"Beautiful art and music, fun gameplay and grea...",True,0.0,0.0,0.0,0.0,True,False,False,0.0,1.0,18400.0,0.0,18400.0,1624387000.0,,
228721,"Hades has a lot going for it the soundtrack, v...",True,0.0,0.0,0.0,0.0,True,False,False,189.0,44.0,1011.0,1011.0,1011.0,1686744000.0,,
228723,"perfect loop, beautiful art, fun weapons",True,0.0,0.0,0.0,0.0,True,False,False,0.0,14.0,5790.0,5790.0,5790.0,1686743000.0,,
228724,Combat : 10/10\nReplayabilty : 10/10\nStory + ...,True,0.0,0.0,0.0,0.0,True,False,False,0.0,4.0,5399.0,0.0,5399.0,1670424000.0,,
228726,fun but u die alot LOL,True,0.0,0.0,0.0,0.0,False,False,False,0.0,2.0,330.0,330.0,270.0,1686744000.0,,


In [50]:
df['voted_up'].value_counts()

True     75508
False     1236
Name: voted_up, dtype: int64

In [51]:
df['author.playtime_forever'].describe()

count     76744.000000
mean       5169.432190
std        6119.080535
min           5.000000
25%        1859.000000
50%        3914.000000
75%        6598.000000
max      272341.000000
Name: author.playtime_forever, dtype: float64

In [52]:
# Calculate the length of each review (in words)
df['review_length'] = df['review'].apply(lambda x: len(x.split()))

# Calculate the average length of reviews
average_length = df['review_length'].mean()

df['review_length'].describe()

count    76744.000000
mean        48.357474
std         85.016701
min          1.000000
25%         11.000000
50%         22.000000
75%         50.000000
max       1600.000000
Name: review_length, dtype: float64

In [9]:
# Get list of stopwords
stop_words = set(stopwords.words('english'))

#Setup lemmatizer
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    filtered_words = [lemmatizer.lemmatize(w) for w in words if w.lower() not in stop_words]
    return ' '.join(filtered_words)

# Lemmatize the reviews
df['review'] = df['review'].apply(lemmatize_text)

In [10]:
df['review_length'].describe()

count    76744.000000
mean        48.357474
std         85.016701
min          1.000000
25%         11.000000
50%         22.000000
75%         50.000000
max       1600.000000
Name: review_length, dtype: float64

In [11]:
# Encode review length into categories based on specific ranges or thresholds
df['review_length_category'] = pd.cut(df['review_length'], bins=[0, 8, 18, 44, np.inf], labels=[0, 1, 2, 3])

In [12]:
df['review_length_category'].value_counts()

3    21715
1    21613
2    21249
0    12167
Name: review_length_category, dtype: int64

In [13]:
df = df.dropna(subset=['review_length_category'])

In [14]:
# Calculate average playtime
average_playtime = df['author.playtime_forever'].mean()

# Create new binary column
df['above_average_playtime'] = np.where(df['author.playtime_forever'] > average_playtime, 1, 0)

In [15]:
df['above_average_playtime'].value_counts()

0    48192
1    28552
Name: above_average_playtime, dtype: int64

In [16]:
# Convert any non-string data in the review column to a string
df['review'] = df['review'].astype(str)

In [17]:
# Create a vectorizer object to generate term frequency-inverse document frequency (tf-idf) vectors
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['review'])

# Convert the TF-IDF matrix into a DataFrame
df_tfidf = pd.DataFrame(df_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

# Reset index before concatenation
df.reset_index(drop=True, inplace=True)
df_tfidf.reset_index(drop=True, inplace=True)

# Concatenate the TF-IDF features with your original DataFrame
df = pd.concat([df, df_tfidf], axis=1)

In [None]:
# Get the labels (positive or negative)
y = df['voted_up'].map({True: 1, False: 0})

In [18]:
df.head()

Unnamed: 0,review,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,author.num_games_owned,...,yo,you,young,youre,youtube,zag,zagreus,zero,zeus,zone
0,"Beautiful art music , fun gameplay great voice...",True,0.0,0.0,0.0,0.0,True,False,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Hades lot going soundtrack , voice acting , ar...",True,0.0,0.0,0.0,0.0,True,False,False,189.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"perfect loop , beautiful art , fun weapon",True,0.0,0.0,0.0,0.0,True,False,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Combat : 10/10 Replayabilty : 10/10 Story + Wr...,True,0.0,0.0,0.0,0.0,True,False,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,fun u die alot LOL,True,0.0,0.0,0.0,0.0,False,False,False,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df['above_average_playtime'].isna().sum()

0

In [20]:
#Split our Training and Test sets using only a portion of the data in order to save time
X_train, X_test, y_train, y_test = train_test_split(
    df_tfidf,
    df['review_length_category'],
    test_size=0.1,
    train_size=0.4,
    stratify=df['review_length_category'],
    random_state=42
)

In [21]:
# Initialize the RFC model
rf = RandomForestClassifier(random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Predict the training set results
y_pred_rf_train = rf.predict(X_train)

# Generate classification reports for the training data
rf_report_train = classification_report(y_train, y_pred_rf_train)

In [22]:
# Initialize the model
xgb = XGBClassifier(random_state=42, verbosity=0)

# Fit the Model
xgb.fit(X_train, y_train)

# Predict the training set results
y_pred_xgb_train = xgb.predict(X_train)

# Generate classification reports for the training data
xgb_report_train = classification_report(y_train, y_pred_xgb_train)

In [23]:
# Print the classification reports for the training data
print("Training Data - Random Forest Classification Report:")
print(rf_report_train)
print()
print("Training Data - XGBoost Classification Report:")
print(xgb_report_train)

Training Data - Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4867
           1       1.00      0.99      1.00      8645
           2       1.00      1.00      1.00      8499
           3       1.00      1.00      1.00      8686

    accuracy                           1.00     30697
   macro avg       1.00      1.00      1.00     30697
weighted avg       1.00      1.00      1.00     30697


Training Data - XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      4867
           1       0.82      0.84      0.83      8645
           2       0.93      0.88      0.90      8499
           3       0.99      0.98      0.98      8686

    accuracy                           0.89     30697
   macro avg       0.88      0.89      0.88     30697
weighted avg       0.89      0.89      0.89     30697



# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 6),
    'learning_rate': uniform(0.001, 0.1)
}

# Initialize the XGBoost classifier
xgb = XGBClassifier(random_state=42, verbosity=0)

# Initialize RandomizedSearchCV with the XGBoost classifier and parameter distribution
rand_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=10, cv=3, random_state=42)

# Fit the RandomizedSearchCV model
rand_search.fit(X_train, y_train)

# Get the best parameters found by RandomizedSearchCV
best_params = rand_search.best_params_
print("Best parameters:", best_params)

# Use the best parameters to initialize a new XGBoost classifier
best_xgb = XGBClassifier(random_state=42, verbosity=0, **best_params)

# Fit the new XGBoost model with the best parameters
best_xgb.fit(X_train, y_train)

# Predict the training set results using the best model
y_pred_xgb_train = best_xgb.predict(X_train)

# Generate classification report for the training data
xgb_report_train = classification_report(y_train, y_pred_xgb_train)

# Print the classification report
print("Classification Report (Training Data):\n", xgb_report_train)

This code creates a new DataFrame sentiments where each row corresponds to a review and each column corresponds to a theme. Each cell contains the average sentiment of the sentences in the corresponding review that mention the corresponding theme.

Please note that this is a very simple example. In a more sophisticated sentiment analysis, you might use a more advanced model for sentiment analysis (such as a pre-trained model from HuggingFace's model hub), or use a more sophisticated method to identify sentences relevant to each theme. Also, error checking code is omitted for brevity, but you might want to add checks for things like division by zero.

In [32]:
# Initialize the model
xgb = XGBClassifier(random_state=42, verbosity=0)

# Fit the model
xgb.fit(X_train, y_train)

# Predict the test set results
y_pred_xgb_test = xgb.predict(X_test)

# Generate classification report for the test data
xgb_report_test = classification_report(y_test, y_pred_xgb_test)

# Print the classification report for the test data
print("XGBoost Classification Report (Test Data):")
print(xgb_report_test)

XGBoost Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.67      0.72      0.69      1217
           1       0.69      0.73      0.71      2161
           2       0.82      0.76      0.79      2125
           3       0.94      0.92      0.93      2172

    accuracy                           0.79      7675
   macro avg       0.78      0.78      0.78      7675
weighted avg       0.79      0.79      0.79      7675



In [31]:
# Specify themes and synonyms
themes = {
    'music': ['sound', 'music', 'audio', 'instrument', 'soundtrack', 'voice acting', 'song', 'effect', 'atmosphere', 'orchestra'],
    'story': ['story', 'plot', 'narrative', 'character', 'mission', 'quest', 'writing', 'dialogue', 'relationships', 'family', 'gods'],
    'gameplay': ['gameplay', 'mechanics', 'controls', 'action', 'fight', 'attack', 'battle', 'weapon', 'moves', 'power', 'combat', 'upgrade'],
    'visuals': ['visuals', 'graphics', 'art', 'images', 'color', 'artwork', 'animation', '2D', '3D', 'lighting']
}

# Create a new DataFrame for sentences
sentences_df = pd.DataFrame(columns=['sentences'])

# Tokenize the reviews into sentences and add them to the new DataFrame
for review in df['review']:
    sentences = nltk.sent_tokenize(str(review))
    sentences_df = sentences_df.append({'sentences': sentences}, ignore_index=True)

# Concatenate the sentences DataFrame with the original DataFrame
df = pd.concat([df, sentences_df], axis=1)

for index, row in df.iterrows():
    sentiment = {}
    if isinstance(row['sentences'], list):
        sentences = [tuple(sentence) for sublist in row['sentences'] for sentence in sublist]  # Flatten and convert lists to tuples
        sentences = ' '.join([' '.join(sentence) for sentence in sentences])  # Convert tuples to strings and join them
        blob = TextBlob(sentences, analyzer=NaiveBayesAnalyzer())
        for sentence in row['sentences']:
            sentiment[tuple(sentence)] = blob.sentiment.classification  # Use tuples as keys
    df.at[index, 'sentiment'] = str(sentiment)

# Combine the sentiments DataFrame with the original reviews DataFrame
df = pd.concat([df, df['sentiment']], axis=1)

Use the Gensim Library to perform LDA

In [35]:
# Define the preprocessing function
def preprocess(text):
    result = []
    for token in simple_preprocess(str(text)):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatizer.lemmatize(token, pos='v'))
    return result

# Apply preprocessing to each review
df['processed_reviews'] = df['review'].apply(preprocess)

TypeError: decoding to str: need a bytes-like object, Series found

After preprocessing the reviews, you need to create a dictionary and a corpus that Gensim's LDA implementation can work with:

In [43]:
# Tokenize each review string into a list of tokens
tokenized_reviews = [review.split() for review in df['review']]

# Create a dictionary representation of the documents
dictionary = Dictionary(tokenized_reviews)

# Filter out words that occur less than 20 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Create Bag-of-words representation of the documents
corpus = [dictionary.doc2bow(review) for review in tokenized_reviews]

In [42]:
# Set training parameters
num_topics = 4
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

KeyError: 0

In [None]:
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

for idx, (topic, coherence) in enumerate(top_topics):
    print('Topic %d:' % idx)
    print(' '.join([dictionary[int(word)] for word, _ in topic[:10]]), '\n')
