# Part 1

In [1]:
import pandas as pd
import numpy as np

# Read the raw review table from main.csv into the working frame `df`.
df = pd.read_csv("main.csv")

# Report the row count, then preview the first three rows as the cell output.
print(f"Total reviews: {df.shape[0]}")
df.head(3)

Total reviews: 40000


Unnamed: 0,user,playtime,post_date,helpfulness,review_text,recommend,early_access_review,appid,game_name,release_date,genres
0,Pakistan warrior,47.8,"November 3, 2023",3911,ALT + F4 best feature in the game 10/10,True,,1938090,Call of Duty®,"Oct 27, 2022",Action
1,Zuvi,1969.8,"November 2, 2022",3154,"SPAWN DIE, SPAWN DIE, SPAWN DIE, SPAWN DIE.-Jev",True,,1938090,Call of Duty®,"Oct 27, 2022",Action
2,SƎXSƎN,1190.7,"August 5, 2023",2821,"My wife said if this review gets 100 likes, I ...",True,,1938090,Call of Duty®,"Oct 27, 2022",Action


## Task 1.1 – Pseudo-Labels (Question 1)

In [None]:
def token_length(text):
    """Count the whitespace-separated tokens in a single review.

    Parameters
    ----------
    text : object
        The review text. Values flagged by ``pd.isna`` (e.g. ``None`` or NaN)
        are treated as empty; anything else is converted with ``str`` first.

    Returns
    -------
    int
        Number of tokens produced by ``str.split()`` with no arguments
        (splits on runs of whitespace), or 0 for a missing value.
    """
    return 0 if pd.isna(text) else len(str(text).split())

# Attach a per-review token count to the main frame (adds a column to `df`).
df["token_length"] = df["review_text"].apply(token_length)
lengths = df["token_length"]

# Quartile cut-offs of the token-length distribution.
q25, q75 = lengths.quantile(0.25), lengths.quantile(0.75)
print(f"Q25 (25th percentile): {q25:.0f} tokens")
print(f"Q75 (75th percentile): {q75:.0f} tokens")

# Keep only the two tails: Short is <= q25, Long is >= q75 (both inclusive).
short_mask = lengths.le(q25)
long_mask = lengths.ge(q75)
df_task1 = df.loc[short_mask | long_mask].copy()

# Label every retained row; rows at or below q25 are "Short", all others "Long".
df_task1["pseudo_label"] = np.where(df_task1["token_length"].le(q25), "Short", "Long")

# Per-label row selectors, reused for both the counts and the averages.
label_is_short = df_task1["pseudo_label"].eq("Short")
label_is_long = df_task1["pseudo_label"].eq("Long")

n_retained = df_task1.shape[0]
n_short = label_is_short.sum()
n_long = label_is_long.sum()
avg_short = df_task1.loc[label_is_short, "token_length"].mean()
avg_long = df_task1.loc[label_is_long, "token_length"].mean()

print("\n--- Question 1 Report ---")
print(f"Number of reviews retained: {n_retained} (Short: {n_short}, Long: {n_long})")
print(f"Average token length - Short reviews: {avg_short:.2f}")
print(f"Average token length - Long reviews:  {avg_long:.2f}")