# Preprocess data

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import emoji
import unicodedata

## 1. Steam review

In [2]:
df_steam = pd.read_csv('raw_data/train.csv')

In [3]:
df_steam

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
...,...,...,...,...,...
17489,25535,EverQuest II,2012.0,Arguably the single greatest mmorp that exists...,1
17490,25536,EverQuest II,2017.0,"An older game, to be sure, but has its own cha...",1
17491,25537,EverQuest II,2011.0,When I frist started playing Everquest 2 it wa...,1
17492,25538,EverQuest II,,cool game. THe only thing that REALLY PISSES M...,1


Drop unnecessary columns

In [4]:
df_steam = df_steam.drop(columns=['review_id', 'title', 'year'])

Check missing values

In [5]:
df_steam.isna().sum()

user_review        0
user_suggestion    0
dtype: int64

Check duplicates

In [6]:
df_steam.duplicated().sum()

3

In [7]:
df_steam = df_steam.drop_duplicates()

In [8]:
df_steam['user_suggestion'].value_counts()

user_suggestion
1    9967
0    7524
Name: count, dtype: int64

Normalize text

In [9]:
def normalize_text(text):
    text = text.lower()  # lowercase

    # remove 'early access review' at the beginning
    text = re.sub(r"^(early access review[\s:\-–—)]*)", "", text, flags=re.IGNORECASE)

    # remove URLs, mentions, hashtags
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # remove numbers
    #text = re.sub(r"\d+", "", text)

    # remove emojis and icons
    text = emoji.replace_emoji(text, replace='')

    # Remove spam patterns: no more than 3 consecutive identical characters
    # e.g., "goooood" → "good", "aaaaawesome" → "awesome"
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)  # Keep up to 3 repeats
    
    # remove non-printable or control characters + double quotes
    text = ''.join(
        c for c in text 
        if unicodedata.category(c)[0] != 'C' and c.isprintable() and c != '"'
    )

    # normalize whitespace
    text = re.sub(r"\s+", " ", text)

    # remove duplicate consecutive words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    # keep only ASCII characters (English text only)
    text = text.encode("ascii", "ignore").decode()

    return text.strip()

# Apply to the DataFrame
df_steam['user_review'] = df_steam['user_review'].apply(normalize_text)

Eliminate row with whitespace

In [10]:
empty_review_mask = df_steam['user_review'].astype(str).str.strip() == ''
print(f"Number of empty or whitespace-only user_review rows: {empty_review_mask.sum()}")

Number of empty or whitespace-only user_review rows: 12


In [11]:
df_steam = df_steam[~empty_review_mask].reset_index(drop=True)

Remove short text

In [12]:
short_content_count = df_steam[df_steam['user_review'].str.len() <= 20].shape[0]
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 33


In [13]:
df_steam = df_steam[df_steam['user_review'].str.len() > 20] 

In [14]:
df_steam

Unnamed: 0,user_review,user_suggestion
0,i'm scared and hearing creepy voices. so i'll ...,1
1,"best game, more better than sam pepper's youtu...",1
2,"a littly iffy on the controls, but once you kn...",1
3,"great game, fun and colorful and all that.a si...",1
4,not many games have the cute tag right next to...,1
...,...,...
17474,arguably the single greatest mmorp that exists...,1
17475,"an older game, to be sure, but has its own cha...",1
17476,when i frist started playing everquest 2 it wa...,1
17477,cool game. the only thing that really pisses m...,1


## 2. Reddit review

In [15]:
df_reddit = pd.read_csv('raw_data/23k_r_gaming_comments_sentiments.csv')

In [16]:
df_reddit

Unnamed: 0.1,Unnamed: 0,Comment,sentiment
0,0,Them: I don't think I like this game.\n\nMe: B...,negative
1,1,Then you leave them to farm the smaller creatu...,negative
2,2,Nothing beats the feeling you get when you see...,positive
3,3,"[Also, they're made of paper](https://i.imgur....",negative
4,4,Haha... That was exactly it when my brother tr...,positive
...,...,...,...
23184,23184,Don't be angry.,positive
23185,23185,"Myself, a human being of my own free will, has...",positive
23186,23186,YOU DONT CALL THE SHOTS AROUND HERE MISTER,neutral
23187,23187,"Sorry, but are you going to [edit](https://www...",positive


Drop index column

In [17]:
df_reddit = df_reddit.drop(df_reddit.columns[0], axis=1)

Eliminate neutral sentiment records

In [18]:
df_reddit = df_reddit[df_reddit['sentiment'] != 'neutral']

Check missing values

In [19]:
df_reddit.isna().sum()

Comment      0
sentiment    0
dtype: int64

In [20]:
df_reddit = df_reddit.dropna(subset='Comment')

Check duplicates

In [21]:
df_reddit.duplicated().sum()

298

In [22]:
df_reddit = df_reddit.drop_duplicates()

Normalize text

In [23]:
df_reddit['Comment'] = df_reddit['Comment'].apply(normalize_text)

Map sentiment column

In [24]:
df_reddit['sentiment'] = df_reddit['sentiment'].map({'positive': 1, 'negative': 0})

Remove short text

In [25]:
short_content_count = df_reddit[df_reddit['Comment'].str.len() <= 20].shape[0]
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 1242


In [26]:
df_reddit = df_reddit[df_reddit['Comment'].str.len() > 20] 

Rename columns

In [27]:
df_reddit = df_reddit.rename(columns={
    'Comment': 'user_review',
    'sentiment': 'user_suggestion'
})

In [28]:
df_reddit

Unnamed: 0,user_review,user_suggestion
0,them: i don't think i like this game.me: but y...,0
1,then you leave them to farm the smaller creatu...,0
2,nothing beats the feeling you get when you see...,1
3,"[also, they're made of paper](edit: i tried to...",0
4,haha... that was exactly it when my brother tr...,1
...,...,...
23175,\>:(not unless you figure out how do make this...,0
23179,"pa loves all his children and cousins equally,...",1
23185,"myself, a human being of my own free will, has...",1
23187,"sorry, but are you going to [edit]( this, or i...",1


## 3. Main dataset

In [29]:
df = pd.concat([df_steam, df_reddit], axis=0).reset_index(drop=True)

In [30]:
df

Unnamed: 0,user_review,user_suggestion
0,i'm scared and hearing creepy voices. so i'll ...,1
1,"best game, more better than sam pepper's youtu...",1
2,"a littly iffy on the controls, but once you kn...",1
3,"great game, fun and colorful and all that.a si...",1
4,not many games have the cute tag right next to...,1
...,...,...
30007,\>:(not unless you figure out how do make this...,0
30008,"pa loves all his children and cousins equally,...",1
30009,"myself, a human being of my own free will, has...",1
30010,"sorry, but are you going to [edit]( this, or i...",1


In [31]:
df.to_csv('review_preprocessed.csv')

In [32]:
df['user_suggestion'].value_counts()

user_suggestion
1    18788
0    11224
Name: count, dtype: int64

Split train + valid + test (8:1:1)

In [33]:
# First split: 80% train, 20% temp (val + test)
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['user_suggestion']  
)

# Second split: 10% val, 10% test (from the 20% temp)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['user_suggestion']
)

In [34]:
train_df.to_csv('train.csv')
test_df.to_csv('test.csv')
val_df.to_csv('val.csv')