# Preprocess data

In [231]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split

## 1. Steam review

In [232]:
df_steam = pd.read_csv('raw_data/train.csv')

In [233]:
df_steam

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
...,...,...,...,...,...
17489,25535,EverQuest II,2012.0,Arguably the single greatest mmorp that exists...,1
17490,25536,EverQuest II,2017.0,"An older game, to be sure, but has its own cha...",1
17491,25537,EverQuest II,2011.0,When I frist started playing Everquest 2 it wa...,1
17492,25538,EverQuest II,,cool game. THe only thing that REALLY PISSES M...,1


Drop unnecessary columns

In [234]:
df_steam = df_steam.drop(columns=['review_id', 'title', 'year'])

Check missing values

In [235]:
df_steam.isna().sum()

user_review        0
user_suggestion    0
dtype: int64

Check duplicates

In [236]:
df_steam.duplicated().sum()

np.int64(3)

In [237]:
df_steam = df_steam.drop_duplicates()

In [238]:
df_steam['user_suggestion'].value_counts()

user_suggestion
1    9967
0    7524
Name: count, dtype: int64

Normalize text

In [239]:
def normalize_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+", "", text)  # remove URLs
    text = re.sub(r"@\w+", "", text)     # remove mentions
    text = re.sub(r"#\w+", "", text)     # remove hashtags
    text = re.sub(r"[^a-z\s]", "", text) # remove punctuation and numbers
    text = re.sub(r"\s+", " ", text)     # remove extra whitespace
    return text.strip()

df_steam['user_review'] = df_steam['user_review'].apply(normalize_text)

Remove stopwords

In [240]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

df_steam['user_review'] = df_steam['user_review'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ping\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Eliminate row with whitespace

In [241]:
empty_review_mask = df_steam['user_review'].astype(str).str.strip() == ''
print(f"Number of empty or whitespace-only user_review rows: {empty_review_mask.sum()}")

Number of empty or whitespace-only user_review rows: 29


In [242]:
df_steam = df_steam[~empty_review_mask].reset_index(drop=True)

Remove short text

In [243]:
short_content_count = df_steam[df_steam['user_review'].str.len() <= 20].shape[0]
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 34


In [244]:
df_steam = df_steam[df_steam['user_review'].str.len() > 20] 

In [245]:
df_steam

Unnamed: 0,user_review,user_suggestion
0,im scared hearing creepy voices ill pause mome...,1
1,best game better sam peppers youtube account y...,1
2,littly iffy controls know play easy master ive...,1
3,great game fun colorful thata side note though...,1
4,many games cute tag right next horror tag stea...,1
...,...,...
17457,arguably single greatest mmorp exists today fr...,1
17458,older game sure charm holds special place hear...,1
17459,frist started playing everquest amazing still ...,1
17460,cool game thing really pisses ridable transpor...,1


## 2. Reddit review

In [246]:
df_reddit = pd.read_csv('raw_data/cleaned_comments.csv')

In [247]:
df_reddit

Unnamed: 0.1,Unnamed: 0,comment,sentiment
0,0,dont think like game havent even played minute...,negative
1,1,leave farm smaller creature either wait help k...,negative
2,2,nothing beat feeling get see fall love like ye...,positive
3,3,also theyre made paper edit tried make gif fai...,negative
4,4,haha exactly brother tried get wow run raid ge...,positive
...,...,...,...
21816,23184,dont angry,positive
21817,23185,human free developed relationship esteemed fellow,positive
21818,23186,dont call shot around mister,neutral
21819,23187,sorry going edit beauty mark face,positive


Drop index column

In [248]:
df_reddit = df_reddit.drop(df_reddit.columns[0], axis=1)

Eliminate neutral sentiment records

In [249]:
df_reddit = df_reddit[df_reddit['sentiment'] != 'neutral']

Check missing values

In [250]:
df_reddit.isna().sum()

comment      16
sentiment     0
dtype: int64

In [251]:
df_reddit = df_reddit.dropna(subset='comment')

Check duplicates

In [252]:
df_reddit.duplicated().sum()

np.int64(396)

In [253]:
df_reddit = df_reddit.drop_duplicates()

Remove stopwords

In [254]:
df_reddit['comment'] = df_reddit['comment'].apply(remove_stopwords)

Map sentiment column

In [255]:
df_reddit['sentiment'] = df_reddit['sentiment'].map({'positive': 1, 'negative': 0})

Check profanity words and remove them

In [256]:
profanity_list = {
    "fuck", "shit", "bitch", "asshole", "bastard", "damn", "dick", "crap",
    "fucking", "piss", "slut", "douche", "cock", "nigger", "nigga", "cunt",
    "whore", "motherfucker", "bullshit", "fag", "retard"
}

def contains_profanity(text):
    if isinstance(text, str):
        words = set(re.findall(r'\w+', text.lower()))
        return bool(words & profanity_list)
    return False

num_profanity = df_reddit['comment'].apply(contains_profanity).sum()
print(f"Reviews with profanity: {num_profanity}")

Reviews with profanity: 1387


In [257]:
df_reddit = df_reddit[~df_reddit['comment'].apply(contains_profanity)].reset_index(drop=True)

Remove short text

In [258]:
short_content_count = df_reddit[df_reddit['comment'].str.len() <= 20].shape[0]
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 1997


In [259]:
df_reddit = df_reddit[df_reddit['comment'].str.len() > 20] 

Rename columns

In [260]:
df_reddit = df_reddit.rename(columns={
    'comment': 'user_review',
    'sentiment': 'user_suggestion'
})

In [261]:
df_reddit

Unnamed: 0,user_review,user_suggestion
0,dont think like game havent even played minute...,0
1,leave farm smaller creature either wait help k...,0
2,nothing beat feeling get see fall love like ye...,1
3,also theyre made paper edit tried make gif fai...,0
4,haha exactly brother tried get wow run raid ge...,1
...,...,...
12002,unless figure make face without dot edit give ...,0
12003,pa love child cousin equally jezzabell,1
12006,human free developed relationship esteemed fellow,1
12007,sorry going edit beauty mark face,1


## 3. Main dataset

In [262]:
df = pd.concat([df_steam, df_reddit], axis=0).reset_index(drop=True)

In [263]:
df

Unnamed: 0,user_review,user_suggestion
0,im scared hearing creepy voices ill pause mome...,1
1,best game better sam peppers youtube account y...,1
2,littly iffy controls know play easy master ive...,1
3,great game fun colorful thata side note though...,1
4,many games cute tag right next horror tag stea...,1
...,...,...
27435,unless figure make face without dot edit give ...,0
27436,pa love child cousin equally jezzabell,1
27437,human free developed relationship esteemed fellow,1
27438,sorry going edit beauty mark face,1


In [264]:
df.to_csv('review_preprocessed.csv')

In [265]:
df['user_suggestion'].value_counts()

user_suggestion
1    17344
0    10096
Name: count, dtype: int64

Split train + valid + test (8:1:1)

In [266]:
# First split: 80% train, 20% temp (val + test)
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['user_suggestion']  
)

# Second split: 10% val, 10% test (from the 20% temp)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['user_suggestion']
)

In [267]:
train_df.to_csv('train.csv')
test_df.to_csv('test.csv')
val_df.to_csv('val.csv')