# Preprocess data

In [172]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import emoji
import unicodedata

## 1. Steam review

In [173]:
df_steam = pd.read_csv('raw_data/train.csv')

In [174]:
df_steam

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1
...,...,...,...,...,...
17489,25535,EverQuest II,2012.0,Arguably the single greatest mmorp that exists...,1
17490,25536,EverQuest II,2017.0,"An older game, to be sure, but has its own cha...",1
17491,25537,EverQuest II,2011.0,When I frist started playing Everquest 2 it wa...,1
17492,25538,EverQuest II,,cool game. THe only thing that REALLY PISSES M...,1


Drop unnecessary columns

In [175]:
df_steam = df_steam.drop(columns=['review_id', 'title', 'year'])

Check missing values

In [176]:
df_steam.isna().sum()

user_review        0
user_suggestion    0
dtype: int64

Check duplicates

In [177]:
df_steam.duplicated().sum()

np.int64(3)

In [178]:
df_steam = df_steam.drop_duplicates()

In [179]:
df_steam['user_suggestion'].value_counts()

user_suggestion
1    9967
0    7524
Name: count, dtype: int64

Normalize text

In [180]:
def normalize_text(text):
    """Normalize one raw review string.

    Steps, in order: lowercase; strip a leading "early access review" banner;
    remove URLs, @mentions and #hashtags; remove digits; remove emojis; cap
    runs of one repeated character at three; turn every whitespace character
    into a plain space; drop control / non-printable characters and double
    quotes; collapse whitespace runs and trim.

    Args:
        text: Raw review text. Non-string values (e.g. NaN or None coming
            from a missing cell) are treated as an empty review.

    Returns:
        The cleaned, lowercased string ("" when nothing is left).
    """
    # A missing cell is a float NaN / None, which has no .lower(); return an
    # empty string instead of raising.
    if not isinstance(text, str):
        return ""

    text = text.lower()  # lowercase

    # remove 'early access review' at the beginning (leading whitespace is
    # tolerated; the text is already lowercased, so no IGNORECASE is needed)
    text = re.sub(r"^\s*early access review[\s:\-–—)]*", "", text)

    # remove URLs, mentions, hashtags
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)

    # remove numbers
    text = re.sub(r"\d+", "", text)

    # remove emojis and icons
    text = emoji.replace_emoji(text, replace='')

    # Remove spam patterns: no more than 3 consecutive identical characters
    # e.g., "goooood" → "goood", "aaaaawesome" → "aaawesome"
    text = re.sub(r'(.)\1{3,}', r'\1\1\1', text)  # Keep up to 3 repeats

    # Turn every whitespace character (newline, tab, NBSP, ...) into a plain
    # space BEFORE the non-printable filter below. Those characters are
    # themselves non-printable, so filtering first would delete them and glue
    # the neighbouring words together ("game\nloved" -> "gameloved").
    text = re.sub(r"\s", " ", text)

    # remove non-printable or control characters + double quotes
    text = ''.join(
        c for c in text
        if unicodedata.category(c)[0] != 'C' and c.isprintable() and c != '"'
    )

    # normalize whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()

# Run the normalizer over every Steam review and store the cleaned text back
# in the same column.
df_steam = df_steam.assign(user_review=df_steam['user_review'].map(normalize_text))

Eliminate rows that are empty or contain only whitespace

In [181]:
# Boolean mask: True where the review is an empty string once surrounding
# whitespace is stripped.
stripped_reviews = df_steam['user_review'].astype(str).str.strip()
empty_review_mask = stripped_reviews.eq('')
print(f"Number of empty or whitespace-only user_review rows: {empty_review_mask.sum()}")

Number of empty or whitespace-only user_review rows: 2


In [182]:
df_steam = df_steam[~empty_review_mask].reset_index(drop=True)

Remove short texts (20 characters or fewer)

In [183]:
# Count the Steam reviews whose text is at most 20 characters long.
is_short_review = df_steam['user_review'].str.len() <= 20
short_content_count = int(is_short_review.sum())
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 13


In [184]:
df_steam = df_steam[df_steam['user_review'].str.len() > 20] 

In [185]:
df_steam

Unnamed: 0,user_review,user_suggestion
0,i'm scared and hearing creepy voices. so i'll ...,1
1,"best game, more better than sam pepper's youtu...",1
2,"a littly iffy on the controls, but once you kn...",1
3,"great game, fun and colorful and all that.a si...",1
4,not many games have the cute tag right next to...,1
...,...,...
17484,arguably the single greatest mmorp that exists...,1
17485,"an older game, to be sure, but has its own cha...",1
17486,when i frist started playing everquest it was ...,1
17487,cool game. the only thing that really pisses m...,1


## 2. Reddit review

In [186]:
df_reddit = pd.read_csv('raw_data/cleaned_comments.csv')

In [187]:
df_reddit

Unnamed: 0.1,Unnamed: 0,comment,sentiment
0,0,dont think like game havent even played minute...,negative
1,1,leave farm smaller creature either wait help k...,negative
2,2,nothing beat feeling get see fall love like ye...,positive
3,3,also theyre made paper edit tried make gif fai...,negative
4,4,haha exactly brother tried get wow run raid ge...,positive
...,...,...,...
21816,23184,dont angry,positive
21817,23185,human free developed relationship esteemed fellow,positive
21818,23186,dont call shot around mister,neutral
21819,23187,sorry going edit beauty mark face,positive


Drop index column

In [188]:
# Drop the leftover index column by name instead of by position. pandas labels
# a header-less CSV column 'Unnamed: <n>', so selecting on that prefix removes
# it on the first run and is a no-op on a re-run, whereas dropping columns[0]
# a second time would remove 'comment'.
unnamed_columns = [col for col in df_reddit.columns if str(col).startswith('Unnamed:')]
df_reddit = df_reddit.drop(columns=unnamed_columns)

Eliminate neutral sentiment records

In [189]:
# Keep every row whose sentiment label is anything other than 'neutral'.
is_not_neutral = df_reddit['sentiment'].ne('neutral')
df_reddit = df_reddit.loc[is_not_neutral]

Check missing values

In [190]:
df_reddit.isna().sum()

comment      16
sentiment     0
dtype: int64

In [191]:
df_reddit = df_reddit.dropna(subset='comment')

Check duplicates

In [192]:
df_reddit.duplicated().sum()

np.int64(396)

In [193]:
df_reddit = df_reddit.drop_duplicates()

Map sentiment column

In [194]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
# Already-encoded 0/1 values pass through unchanged, so re-running this cell
# does not turn every label into NaN. Any label outside the mapping raises
# instead of silently becoming NaN.
sentiment_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = df_reddit['sentiment'].map(sentiment_codes)
if encoded_sentiment.isna().any():
    unexpected_labels = df_reddit.loc[encoded_sentiment.isna(), 'sentiment'].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels}")
df_reddit = df_reddit.assign(sentiment=encoded_sentiment.astype('int64'))

Remove short texts (20 characters or fewer)

In [195]:
# Count the Reddit comments whose text is at most 20 characters long.
is_short_comment = df_reddit['comment'].str.len() <= 20
short_content_count = int(is_short_comment.sum())
print(f"Number of short content rows: {short_content_count}")

Number of short content rows: 2148


In [196]:
df_reddit = df_reddit[df_reddit['comment'].str.len() > 20] 

Rename columns

In [197]:
# Give the Reddit frame the column names 'user_review' / 'user_suggestion'.
reddit_column_names = dict(comment='user_review', sentiment='user_suggestion')
df_reddit = df_reddit.rename(columns=reddit_column_names)

In [198]:
df_reddit

Unnamed: 0,user_review,user_suggestion
0,dont think like game havent even played minute...,0
1,leave farm smaller creature either wait help k...,0
2,nothing beat feeling get see fall love like ye...,1
3,also theyre made paper edit tried make gif fai...,0
4,haha exactly brother tried get wow run raid ge...,1
...,...,...
21807,unless figure make face without dot edit give ...,0
21811,pa love child cousin equally jezzabell,1
21817,human free developed relationship esteemed fellow,1
21819,sorry going edit beauty mark face,1


## 3. Main dataset

In [199]:
# Stack the Steam rows on top of the Reddit rows and renumber the combined
# frame with a fresh 0..n-1 index.
df = pd.concat([df_steam, df_reddit], ignore_index=True)

In [200]:
df

Unnamed: 0,user_review,user_suggestion
0,i'm scared and hearing creepy voices. so i'll ...,1
1,"best game, more better than sam pepper's youtu...",1
2,"a littly iffy on the controls, but once you kn...",1
3,"great game, fun and colorful and all that.a si...",1
4,not many games have the cute tag right next to...,1
...,...,...
28719,unless figure make face without dot edit give ...,0
28720,pa love child cousin equally jezzabell,1
28721,human free developed relationship esteemed fellow,1
28722,sorry going edit beauty mark face,1


In [201]:
# Persist the combined dataset. With to_csv's default index=True, the row
# index is written as an extra, unnamed first column of the file.
df.to_csv('review_preprocessed.csv')

In [202]:
df['user_suggestion'].value_counts()

user_suggestion
1    17888
0    10836
Name: count, dtype: int64

Split train + valid + test (8:1:1)

In [203]:
def _stratified_split(frame, holdout_fraction):
    """Split `frame` in two, stratified on 'user_suggestion', with seed 42.

    Returns the two parts as produced by train_test_split; the second part
    holds `holdout_fraction` of the rows.
    """
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        random_state=42,
        stratify=frame['user_suggestion'],
    )


# Stage 1: keep 80% for training, set 20% aside.
train_df, temp_df = _stratified_split(df, 0.2)

# Stage 2: halve the 20% that was set aside into validation and test
# (10% of the full dataset each).
val_df, test_df = _stratified_split(temp_df, 0.5)

In [204]:
# Write each split to its own CSV file, in the order train, test, val.
split_outputs = (
    ('train.csv', train_df),
    ('test.csv', test_df),
    ('val.csv', val_df),
)
for csv_name, split_df in split_outputs:
    split_df.to_csv(csv_name)