In [None]:
!pip install google_play_scraper



In [None]:
from google_play_scraper import reviews, Sort
import pandas as pd

In [None]:
apps = {
    "Amazon": "com.amazon.mShop.android.shopping",
    "Shein": "com.zzkko",
    "AliExpress": "com.alibaba.aliexpresshd",
    "Noon": "com.noon.buyerapp",
    "Trendyol": "trendyol.com"
}


In [None]:
def label_sentiment(rating):
    if rating <= 2:
        return "Negative"
    elif rating == 3:
        return "Neutral"
    else:
        return "Positive"

In [None]:
all_reviews = []

for app_name, app_id in apps.items():
    print(f"Fetching reviews for {app_name}...")

    result, _ = reviews(
        app_id,
        lang="en",
        country="us",
        sort=Sort.NEWEST,
        count=3000
    )

    for r in result:
        all_reviews.append({
            "app_name": app_name,
            "review_text": r["content"],
            "rating": r["score"],
            "sentiment_label": label_sentiment(r["score"])
        })

Fetching reviews for Amazon...
Fetching reviews for Shein...
Fetching reviews for AliExpress...
Fetching reviews for Noon...
Fetching reviews for Trendyol...


In [None]:
df = pd.DataFrame(all_reviews)
df.head()


Unnamed: 0,app_name,review_text,rating,sentiment_label
0,Amazon,love the app used the website for years but th...,5,Positive
1,Amazon,I've been using Amazon since I was a little ki...,5,Positive
2,Amazon,like Amazon pretty well,4,Positive
3,Amazon,my order shows delivered but there's nothing a...,1,Negative
4,Amazon,best shopping app for buy overseas products,5,Positive


In [None]:
import re



def clean_text(text):

    text = re.sub(r"http\S+", "", str(text))

    text = re.sub(r"[^a-zA-Z\s]", "", text)

    text = text.lower()

    return text



df["clean_review"] = df["review_text"].apply(clean_text)

In [None]:
df.to_csv("google_play_app_reviews.csv", index=False)

In [None]:
df[["review_text", "clean_review"]].head(10)

Unnamed: 0,review_text,clean_review
0,love the app used the website for years but th...,love the app used the website for years but th...
1,I've been using Amazon since I was a little ki...,ive been using amazon since i was a little kid...
2,like Amazon pretty well,like amazon pretty well
3,my order shows delivered but there's nothing a...,my order shows delivered but theres nothing at...
4,best shopping app for buy overseas products,best shopping app for buy overseas products
5,Gets majority of what I want/need easier.,gets majority of what i wantneed easier
6,بعتولي منتج غلط والفلوس مبترجعش لازم تشتري بيه...,
7,"Efficient and user-friendly, for the most part...",efficient and userfriendly for the most part w...
8,awful,awful
9,have kpop things,have kpop things


In [None]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
df["Tokens"]= df["clean_review"].apply(word_tokenize)
df[[ "clean_review","Tokens"]].head(5)

Unnamed: 0,clean_review,Tokens
0,love the app used the website for years but th...,"[love, the, app, used, the, website, for, year..."
1,ive been using amazon since i was a little kid...,"[ive, been, using, amazon, since, i, was, a, l..."
2,like amazon pretty well,"[like, amazon, pretty, well]"
3,my order shows delivered but theres nothing at...,"[my, order, shows, delivered, but, theres, not..."
4,best shopping app for buy overseas products,"[best, shopping, app, for, buy, overseas, prod..."


In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

r
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df["Tokens_No_Stopwords"] = df["Tokens"].apply(remove_stopwords)


def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df["Tokens_No_Stopwords_Lemmatized"] = df["Tokens_No_Stopwords"].apply(lemmatize_tokens)


df[["Tokens", "Tokens_No_Stopwords", "Tokens_No_Stopwords_Lemmatized"]].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,Tokens,Tokens_No_Stopwords,Tokens_No_Stopwords_Lemmatized
0,"[love, the, app, used, the, website, for, year...","[love, app, used, website, years, app, much, s...","[love, app, used, website, year, app, much, si..."
1,"[ive, been, using, amazon, since, i, was, a, l...","[ive, using, amazon, since, little, kid, love,...","[ive, using, amazon, since, little, kid, love,..."
2,"[like, amazon, pretty, well]","[like, amazon, pretty, well]","[like, amazon, pretty, well]"
3,"[my, order, shows, delivered, but, theres, not...","[order, shows, delivered, theres, nothing, doo...","[order, show, delivered, there, nothing, doors..."
4,"[best, shopping, app, for, buy, overseas, prod...","[best, shopping, app, buy, overseas, products]","[best, shopping, app, buy, overseas, product]"


In [None]:
df['tokens_str'] = df['Tokens_No_Stopwords_Lemmatized'].apply(lambda x: ' '.join(x))
df = df.drop_duplicates(subset='tokens_str', keep='first')

In [None]:
df = df.drop(columns=['tokens_str'])

In [None]:
df.to_csv("google_play_app_reviews.csv", index=False)

In [None]:
from google.colab import files
files.download("google_play_app_reviews.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

df['final_text'] = df['Tokens_No_Stopwords_Lemmatized'].apply(lambda x: ''.join(x))
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['final_text'])
print(type(X))
y = df['sentiment_label']

<class 'scipy.sparse._csr.csr_matrix'>


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# BoW
bow_vectorizer = CountVectorizer(max_features=5000)

X_bow = bow_vectorizer.fit_transform(df["final_text"])
y = df["sentiment_label"]

# Split
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    X_bow,y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("BoW Shape:", X_bow.shape)

BoW Shape: (10190, 5000)
