In [2]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline

# Import the data
df = pd.read_csv(r'SuperGiant_reviews.csv', header= 0)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305998 entries, 0 to 305997
Data columns (total 7 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   language                305998 non-null  object 
 1   review                  305202 non-null  object 
 2   votes_up                305998 non-null  int64  
 3   votes_funny             305998 non-null  int64  
 4   weighted_vote_score     305998 non-null  float64
 5   author.num_games_owned  305998 non-null  int64  
 6   author.num_reviews      305998 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 16.3+ MB


In [4]:
#Drop Nulls
df = df.dropna(subset=['review'])

#Keep only English reviews
df = df[df['language'] == 'english']

# Create a mask where each review has more than one word and at least one alphabetic character
mask = df['review'].apply(lambda x: len(re.findall(r'\b\w+\b', str(x))) > 1 and bool(re.search('[a-zA-Z]', str(x))))

# Apply the mask to the DataFrame to filter out reviews
df = df[mask]

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151867 entries, 1 to 305996
Data columns (total 7 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   language                151867 non-null  object 
 1   review                  151867 non-null  object 
 2   votes_up                151867 non-null  int64  
 3   votes_funny             151867 non-null  int64  
 4   weighted_vote_score     151867 non-null  float64
 5   author.num_games_owned  151867 non-null  int64  
 6   author.num_reviews      151867 non-null  int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 9.3+ MB


In [5]:
# Convert all reviews to string type
df['review'] = df['review'].astype(str)

# Instantiate a PorterStemmer
ps = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = re.sub('[^a-z]', ' ', text)
    
    # Split the text into words and remove stopwords, then apply stemming
    text = ' '.join(ps.stem(word) for word in text.split() if word not in set(stopwords.words('english')))
    
    return text

# Apply the preprocessing to each review
df['processed_review'] = df['review'].apply(preprocess_text)

# Filter out any reviews that are now just 'nan'
df = df[df['processed_review'] != 'nan']

In [6]:
df['processed_review']

1                good game real option play og tetri though
2                                              great addict
3                                  block block blocvk heehe
6                                        warn highli addict
7         onlin matchmak wild setup grief go match compl...
                                ...                        
305985    littl play game alreadi hook beauti soundtrack...
305987                                      pretti good far
305990                                        feast eye ear
305995    review select quot review culmin compel narr g...
305996                  creat game offici guid supergi game
Name: processed_review, Length: 152809, dtype: object

In [7]:
# Save the preprocessed data to a CSV file
df.to_csv('preprocessed_data.csv', index=False)

In [27]:
vectorizer = TfidfVectorizer(max_features=2500) # Use the top 2500 words as features
X = vectorizer.fit_transform(df['review']).toarray()

In [30]:
# Initialize CountVectorizer
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# Fit and transform the processed titles
dtm = vectorizer.fit_transform(df['processed_review'])

# Initialize LDA Model with 10 topics
LDA = LatentDirichletAllocation(n_components=10,random_state=42)

# Fit the model to Bag of Words
LDA.fit(dtm)

# To see top 10 words per topics
for i,topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

KeyError: 'processed_review'

In [28]:
le = LabelEncoder()
y = le.fit_transform(df['label']) # Assuming your labels column is named 'label'

KeyError: 'label'