# Keyword detection + analysis in reviews

In [60]:
# Imports
from keywordExtraction import KeyWordExtractor
from os import walk
import pandas as pd
import matplotlib.pyplot as plt

Fetching all datasets we have --- For now just using one: Edit this later

In [61]:
dataset_dir = "../review_dataset/"

files = []
set_paths = []
categories = []
for (dirpath, dirnames, filenames) in walk(dataset_dir):
    for file in filenames:
        set_paths.append(dirpath +"/"+ file)

print(set_paths)
print("\n----> TOTAL: ", len(set_paths), "sets")

['../review_dataset/Entertainment/max-stream-hbo-tv-movies.csv', '../review_dataset/Entertainment/Netflix.csv', '../review_dataset/Entertainment/ticketmaster-buy-sell-tickets.csv', '../review_dataset/Finance/cash-app.csv', '../review_dataset/Finance/paypal-send-shop-manage.csv', '../review_dataset/Finance/venmo.csv', '../review_dataset/Games/among-us.csv', '../review_dataset/Games/clash-of-clans.csv', '../review_dataset/Games/monopoly-go.csv', '../review_dataset/Health/bettersleep-relax-and-sleep.csv', '../review_dataset/Health/planet-fitness-workouts.csv', '../review_dataset/Health/yuka-foo-cosmetic-scanner.csv']

----> TOTAL:  12 sets


In [86]:
current_set = pd.read_csv(set_paths[6]) ## currently just using the first one

## Exploring the set -- and performing EDA + cleaning if any

In [87]:
current_set.head()

Unnamed: 0,Score,Date,Content,Title
0,4,2023-01-22 16:20:42,This is really fun I do not even hate it one b...,Fun!🌈💎✨❤️🌼🦄🧸🥰
1,5,2022-12-14 23:38:14,Hi! So I love the game! I love the detail ad...,Reasons of Among Us
2,5,2023-05-25 02:44:13,So I love among us but I have ideas for it so ...,Cool ideas for among us
3,3,2021-07-17 23:13:48,I used to bolt to my phone once I had the oppo...,The Updates
4,5,2023-07-16 17:06:38,Ok so I love this game soooooo much!!! but eve...,Awesome and really addicting


In [88]:
null_mask = current_set.isnull().any(axis=1)
null_rows = current_set[null_mask]
current_set = current_set[null_mask == False]
print("Rows with null values in any col:", len(null_rows))

### We have a clean dataset

Rows with null values in any col: 0


#### Getting summary stats -- and cleaning duplicates

In [89]:
print("TOTAL REVIWS:", current_set.count()["Content"])
print("UNIQUE REVIEWS:", len(current_set["Content"].unique()))
current_set = current_set[current_set["Content"].duplicated() == False]
print(current_set.dtypes)
current_set.head()

TOTAL REVIWS: 10000
UNIQUE REVIEWS: 10000
Score       int64
Date       object
Content    object
Title      object
dtype: object


Unnamed: 0,Score,Date,Content,Title
0,4,2023-01-22 16:20:42,This is really fun I do not even hate it one b...,Fun!🌈💎✨❤️🌼🦄🧸🥰
1,5,2022-12-14 23:38:14,Hi! So I love the game! I love the detail ad...,Reasons of Among Us
2,5,2023-05-25 02:44:13,So I love among us but I have ideas for it so ...,Cool ideas for among us
3,3,2021-07-17 23:13:48,I used to bolt to my phone once I had the oppo...,The Updates
4,5,2023-07-16 17:06:38,Ok so I love this game soooooo much!!! but eve...,Awesome and really addicting


#### Cleaning review content + converting to datetime objects

In [90]:
current_set["Content"] = current_set["Content"].str.lower()
current_set["Content"] = current_set["Content"].str.strip()
current_set["Content"] = current_set["Content"].str.replace("’", "'")
current_set['Date'] = pd.to_datetime(current_set['Date'], format='%Y-%m-%d %H:%M:%S')
print("MAX DATE:", current_set['Date'].max())
print("MIN DATE:", current_set['Date'].min())
print(current_set.dtypes)
current_set

MAX DATE: 2023-11-11 19:58:40
MIN DATE: 2018-10-30 14:51:06
Score               int64
Date       datetime64[ns]
Content            object
Title              object
dtype: object


Unnamed: 0,Score,Date,Content,Title
0,4,2023-01-22 16:20:42,this is really fun i do not even hate it one b...,Fun!🌈💎✨❤️🌼🦄🧸🥰
1,5,2022-12-14 23:38:14,hi! so i love the game! i love the detail ad...,Reasons of Among Us
2,5,2023-05-25 02:44:13,so i love among us but i have ideas for it so ...,Cool ideas for among us
3,3,2021-07-17 23:13:48,i used to bolt to my phone once i had the oppo...,The Updates
4,5,2023-07-16 17:06:38,ok so i love this game soooooo much!!! but eve...,Awesome and really addicting
...,...,...,...,...
9995,3,2020-12-25 02:22:21,i really enjoy playing among us! it's so creat...,Good but a few kinks
9996,5,2020-12-23 16:36:08,i love this game me and my brother like to pla...,New colors or modes
9997,5,2020-12-23 15:29:17,hi there who ever is reading this! i love this...,Amazing! But some problems.
9998,3,2020-12-23 14:52:20,hello i have had this game for about a month a...,Somethings I like and dislike


In [94]:
#### one function to clean df
def clean_dataset(df):
    null_mask = df.isnull().any(axis=1)
    null_rows = df[null_mask]
    df = df[null_mask == False]
    print("Rows with null values in any col:", len(null_rows))
    
    print("TOTAL REVIWS:", df.count()["Content"])
    print("UNIQUE REVIEWS:", len(df["Content"].unique()))
    df = df[df["Content"].duplicated() == False]

    df["Content"] = df["Content"].str.lower()
    df["Content"] = df["Content"].str.strip()
    df["Content"] = df["Content"].str.replace("’", "'")
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H:%M:%S')
    print("MAX DATE:", df['Date'].max())
    print("MIN DATE:", df['Date'].min())
    print("UPDATED DATATYPES:")
    print(df.dtypes)
    
    return df
    

#### Keyword Frequency -- total

In [20]:
########### EXTRACTING KEYWORDS AND COUNTING #############
def getTopKeywords(raw_reviews):
    extractor = KeyWordExtractor("")

    keywords_count = {}
    for review in raw_reviews:
        review = review.strip()
        if not len(review):
            continue
        extractor.setText(review)
        keywords = extractor.yakeExtract()

        for kw, score in keywords:
            if kw in keywords_count:
                keywords_count[kw] += 1
            else:
                keywords_count[kw] = 1
    
    print("...Sorting")
    tuples = zip(keywords_count.keys(), keywords_count.values())
    sorted_by_second = sorted(tuples, reverse=True, key=lambda tup: tup[1])
    return sorted_by_second


### getting keywords and their counts as list of tuples --- DISABLED FOR NOW
# reviews = list(current_set["Content"])
# print("TOTAL REVIEWS:", len(reviews))
# keywords_tuples = getTopKeywords(reviews)
# for kw, count in keywords_tuples:
#     print(kw, count)

TOTAL REVIEWS: 9307
...Sorting
app 3483
max 2600
hbo 2021
hbo max 1326
shows 1253
watch 990
movies 930
love 771
show 736
time 712
watching 673
great 621
back 618
good 596
content 565
streaming 549
episode 468
max app 451
movie 399
work 381
change 377
service 345
fix 335
phone 312
screen 295
download 295
update 294
subscription 256
continue 255
hbo max app 244
make 241
find 240
series 240
continue watching 239
annoying 238
sign 228
fine 215
lot 207
thing 206
play 206
issues 204
apps 202
apple 200
works 195
error 195
ads 194
favorite 193
episodes 192
log 191
hbomax 176
start 174
terrible 173
ipad 170
netflix 166
audio 166
pay 165
issue 163
amazing 163
streaming service 159
left 156
bring 156
made 153
account 151
times 151
quality 148
discovery 147
reason 147
shareplay 146
cast 145
add 145
downloaded 144
option 143
watched 143
things 141
provider 141
stuff 141
worse 139
day 138
open 137
selection 136
login 136
platform 134
video 130
worked 128
problem 128
enjoy 127
bad 127
changed 127
peo

In [21]:
# keyword_freq_1gram =  dict((x, y) for x, y in keywords_tuples if " " not in x)
# keyword_freq_2gram = dict((x, y) for x, y in keywords_tuples if " " in x) # phrases
# print(keyword_freq_1gram)
# print()
# print(keyword_freq_2gram)




Run through these, identify interesting terms manually: \
-> for AMAZON
- suspicious
- glitches
- money
- scammed
- customer service
- return
- made in china

#### Checking summary for each data set -- Grouping temporally

In [93]:
for data_set in set_paths:
    print("READING:", data_set)
    try:
        this_set = pd.read_csv(data_set)
    except:
        print("Failed to read:", data_set)
    
    this_set = clean_dataset(this_set)
    
    df_year = this_set.copy()
    df_year['Date'] = pd.to_datetime(df_year["Date"].dt.strftime('%Y'))
    df_year = (df_year.groupby(['Date'])).count()
    print(df_year)

READING: ../review_dataset/Entertainment/max-stream-hbo-tv-movies.csv
Rows with null values in any col: 0
TOTAL REVIWS: 10010
UNIQUE REVIEWS: 9307
DATA TYPES:
Score       int64
Date       object
Content    object
Title      object
dtype: object
MAX DATE: 2023-11-12 08:23:43
MIN DATE: 2023-05-23 03:09:22
UPDATED DATATYPES:
Score               int64
Date       datetime64[ns]
Content            object
Title              object
dtype: object
            Score  Content  Title
Date                             
2023-01-01   9307     9307   9307
READING: ../review_dataset/Entertainment/Netflix.csv
Failed to read: ../review_dataset/Entertainment/Netflix.csv
Rows with null values in any col: 0
TOTAL REVIWS: 9307
UNIQUE REVIEWS: 9277
DATA TYPES:
Score               int64
Date       datetime64[ns]
Content            object
Title              object
dtype: object
MAX DATE: 2023-11-12 08:23:43
MIN DATE: 2023-05-23 03:09:22
UPDATED DATATYPES:
Score               int64
Date       datetime64[ns]
Conten