In [2]:
import pandas as pd

# Load the app metadata table and the per-review table.
apps = pd.read_csv("../data/googleplaystore.csv")
reviews = pd.read_csv("../data/googleplaystore_user_reviews.csv")

# Keep only reviews that have both text and a sentiment value, and whose
# sentiment is one of the three expected class names.
reviews = reviews.dropna(subset=["Translated_Review", "Sentiment"])
reviews = reviews[reviews["Sentiment"].isin(["Positive", "Negative", "Neutral"])]

# Distinct names of the apps filed under the HEALTH_AND_FITNESS category.
# (.loc with a mask + column label avoids chained indexing.)
health_apps = apps.loc[apps["Category"] == "HEALTH_AND_FITNESS", "App"].unique()

# Restrict the reviews to those apps and renumber the rows from 0 so that
# later positional/label lookups line up.
health_reviews = reviews[reviews["App"].isin(health_apps)].reset_index(drop=True)

# Sanity check: class balance, plus a small preview of the rows.
# The preview is seeded (same seed as the train/test split) so a fresh
# "Restart & Run All" prints the same rows, and n is capped so the cell does
# not raise when fewer than five rows survive the filtering.
print(health_reviews["Sentiment"].value_counts())
print(
    health_reviews
    .sample(n=min(5, len(health_reviews)), random_state=42)
    [["App", "Translated_Review", "Sentiment"]]
)


Sentiment
Positive    1754
Negative     257
Neutral      238
Name: count, dtype: int64
                                                  App  \
1939  Fooducate Healthy Weight Loss & Calorie Counter   
1673                    Down Dog: Great Yoga Anywhere   
4                               10 Best Foods for You   
1249                   Calorie Counter - MyFitnessPal   
867                    Calorie Counter & Diet Tracker   

                                      Translated_Review Sentiment  
1939                       Love explains product rating  Positive  
1673  This changed life saved tired back.. I can't a...  Positive  
4                                              Best way  Positive  
1249  I love app. I Samsung phone naturally I also S...  Positive  
867   The pop-up ads beyond annoying. Also I'd reall...  Negative  


In [5]:
from sklearn.preprocessing import LabelEncoder

# Turn the sentiment strings into integer class ids and store them in a
# new "label" column on the reviews frame.
le = LabelEncoder()
health_reviews["label"] = le.fit_transform(health_reviews["Sentiment"])

# Show which id each class received.
# With the three sentiment classes: Negative → 0, Neutral → 1, Positive → 2
print(
    {
        class_name: class_id
        for class_name, class_id in zip(le.classes_, le.transform(le.classes_))
    }
)


{'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


In [4]:
import re

def clean_text(text):
    """Normalise a review string for downstream text processing.

    Lower-cases the text, removes punctuation (underscores included) and
    digits, then collapses the whitespace left behind by those removals.

    Args:
        text: The raw review text. Non-string values (for example ``None``
            or a float ``NaN`` coming from a missing cell) are treated as
            empty instead of raising ``AttributeError``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # \w treats "_" as a word character, so it must be listed explicitly
    # for the punctuation strip to catch it.
    text = re.sub(r"[^\w\s]|_", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)        # Remove numbers
    # Deleting tokens can leave doubled or leading/trailing spaces.
    return " ".join(text.split())

# Store a normalised copy of each review next to the original text.
health_reviews["clean_review"] = health_reviews["Translated_Review"].map(clean_text)


In [None]:
from sklearn.model_selection import train_test_split

# Targets are the encoded sentiment ids; features are the raw review text
# (switch the column to "clean_review" to train on the normalised text).
y = health_reviews["label"]
X = health_reviews["Translated_Review"]

# Hold out 20% of the rows. stratify=y asks the splitter to keep the label
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {X_train.shape[0]}")
print(f"Test size: {X_test.shape[0]}")
