### Step 1 Obatain and process data

In [1]:
import pandas as pd

In [2]:
file_path="https://huggingface.co/api/datasets/amazon_us_reviews/parquet/Music_v1_00/train/4.parquet"

In [3]:
df=pd.read_parquet(file_path)

In [4]:
# Size of original dataset 
len(df)

441000

##### We will use 100,000 rows of data for this project

In [5]:
df=df[:100000]

In [6]:
df.columns #check which columns wont be needed

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='object')

In [7]:
df.head(5)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,49094403,R6W7O55HFSPNK,B00004T9W4,609783476,Teaser And The Firecat (Remastered),Music,5,6,6,0,1,"Wonderful Music, Wonderful Memories",The music of Cat Stevens inspired my college d...,2007-01-04
1,US,32977150,RU8NU5PYIHSQP,B0009299LU,682021445,Rock Of Ages: The Definitive Collection [2 CD],Music,5,3,4,0,1,This band is the best!,Love these guys and hope they harmonize my lif...,2007-01-04
2,US,50412145,R1YSZAEB9NHE4D,B000HDR7OW,208493278,Rest of My Life/Dont Fall Asleep in the Subwa...,Music,5,0,0,0,1,"Buy it, all your friends did, do it.",Less Than Jake deliver another new B-side. Com...,2007-01-04
3,US,40118896,RPICJ3NV6D6Q9,B000HC2PDQ,81891824,Face the Promise (CD+DVD),Music,5,0,0,0,1,Bob Seger - Face the Promise,I have loved Bob Seger since the 70's! This n...,2007-01-04
4,US,34491626,RFZ43852RSUPR,B00008PX90,604640879,Essential Shawn Mullins,Music,4,0,0,0,1,A must have in your collection,This is a great CD. There are at least 7 or 8...,2007-01-04


In [8]:
df['marketplace'].unique()

array(['US'], dtype=object)

In [9]:
# Keep necessary columns
columns_to_delete = ['marketplace', 'product_category', 'star_rating', 'helpful_votes', 'total_votes', 'vine', 'verified_purchase', 'review_date']

df.drop(columns=columns_to_delete, inplace=True)

df.head(5)

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,review_headline,review_body
0,49094403,R6W7O55HFSPNK,B00004T9W4,609783476,Teaser And The Firecat (Remastered),"Wonderful Music, Wonderful Memories",The music of Cat Stevens inspired my college d...
1,32977150,RU8NU5PYIHSQP,B0009299LU,682021445,Rock Of Ages: The Definitive Collection [2 CD],This band is the best!,Love these guys and hope they harmonize my lif...
2,50412145,R1YSZAEB9NHE4D,B000HDR7OW,208493278,Rest of My Life/Dont Fall Asleep in the Subwa...,"Buy it, all your friends did, do it.",Less Than Jake deliver another new B-side. Com...
3,40118896,RPICJ3NV6D6Q9,B000HC2PDQ,81891824,Face the Promise (CD+DVD),Bob Seger - Face the Promise,I have loved Bob Seger since the 70's! This n...
4,34491626,RFZ43852RSUPR,B00008PX90,604640879,Essential Shawn Mullins,A must have in your collection,This is a great CD. There are at least 7 or 8...


In [10]:
product_title=df['product_title'].unique() # How many distinct products
len(product_title)

46607

In [11]:
len(df['product_id'].unique()) # Since the output will be recommended music title so we will use product_title instead of id going forward

51084

### Step 2 Sentiment Analysis using textblob

In [12]:
!pip3 install textblob



In [13]:
from textblob import TextBlob

In [14]:
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

In [15]:
df['sentiment'] = df['review_body'].apply(get_sentiment) # Add sentiment column to store sentiment score

In [16]:
df['sentiment_label'] = df['sentiment'].apply(lambda x: 'positive' if x > 0.3 else 'negative' if x < 0.3 else 'neutral') # Define sentiment label of each row

In [17]:
df[df['sentiment_label']=='negative']

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,review_headline,review_body,sentiment,sentiment_label
2,50412145,R1YSZAEB9NHE4D,B000HDR7OW,208493278,Rest of My Life/Dont Fall Asleep in the Subwa...,"Buy it, all your friends did, do it.",Less Than Jake deliver another new B-side. Com...,-0.019394,negative
6,36312418,R1XDJ4V0MDZUCM,B000GRTQX4,605685678,My Fair Lady (20th Anniversary Broadway Cast),Still Loverly,The only reason I purchased this recording was...,0.119974,negative
7,47765663,R2N08FPRCHRM85,B000006SW2,557411594,The Complete Beyond The Fringe (1961 Original ...,What could have been,Such a shame this comedy troupe broke up. Humo...,0.000000,negative
9,44056578,R2CNMCDTJH1OQF,B000HDRARQ,919993860,Living Like A Refugee,A music Lovers Dream,This CD is unlike any other! It was compiled a...,0.037109,negative
12,24024980,R335FOIS77XNXO,B000I5X82O,624410935,An Other Cup,An other and an other and an other....,I could keep have an other cup of this music p...,0.193750,negative
...,...,...,...,...,...,...,...,...,...
99994,47461597,R1J3DHYRXGGWMN,B000A6T2EY,469022814,The Essential Artie Shaw,"Great music, great remastering, bad track order","First off, the bad...The tracks are not in chr...",0.164680,negative
99995,27221470,R58SI22TLEGWD,B000EPR7NE,501078419,The Da Vinci Code,Da Vinci Code Soundtrack,This is the best soundtrack I have heard in ye...,0.217222,negative
99997,52569116,R16LXICAXWMVYV,B00065TZUU,496394500,Everything You've Heard Is True,Uneven but interesting solo album from former ...,Tom Johnston's first solo album has a strong R...,0.237549,negative
99998,13617020,R1WSDDZP22W4CR,B0000253X1,278426851,Superlife,Track listing,1. Believe <br />2. Overdue <br />3. Too Littl...,-0.122917,negative


In [18]:
df.to_csv('sentiment_data.csv',index=False)

### Filter out overall positive feedback music

In [19]:
# Count the negative neutral positive reviews of each music
sentiment_counts=df.groupby('product_title')['sentiment_label'].value_counts().unstack(fill_value=0).sort_values(by='positive',ascending=False)

In [20]:
sentiment_counts

sentiment_label,negative,neutral,positive
product_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A Thousand Different Ways,182,0,180
Daughtry,129,3,140
Taylor Hicks,72,1,95
Modern Times,177,0,90
20 Y.O.,157,1,87
...,...,...,...
Jam on Me,1,0,0
Jam Track Learn To Play Country Guitar Cd,1,0,0
Jam Session at the Tower,1,0,0
Jake in a Box: Emi Recordings 1967-1976,1,0,0


In [21]:
# Filter the music with only the most positive feedback to a new list
overall_positive = []
overall_negative=[]
for title in df['product_title'].unique():
    positive_count = sentiment_counts.loc[title, 'positive']
    negative_count = sentiment_counts.loc[title, 'negative']
    neutral_count = sentiment_counts.loc[title, 'neutral']
    if positive_count == max(positive_count,negative_count,neutral_count):
        overall_positive.append(title)
    else:
        overall_negative.append(title)

print(len(overall_positive))
print(len(overall_negative))

17105
29502


In [22]:
# Finalize the filtered dataframe with only music with positive feedbacks
filtered_df = df[(df['product_title'].isin(overall_positive)) & (df['sentiment_label'] == 'positive')]
len(filtered_df)


23497

In [23]:
len(filtered_df['product_title'].unique()) # Size of distinct music remained

17105

In [24]:
filtered_df['review_headline']

0        Wonderful Music, Wonderful Memories
1                     This band is the best!
3               Bob Seger - Face the Promise
4             A must have in your collection
5                                 Phantastic
                        ...                 
99962                             TRACK LIST
99964                         Kinks live on!
99966                   Quintessential Crims
99969                      Excellent Project
99982            This is Judy Jacob's best!!
Name: review_headline, Length: 23497, dtype: object

### Step 3 Positive reviews summarization with TextRank

In [25]:
# Combine all review body based on their product title
combined_reviews = filtered_df.groupby('product_title')['review_body'].apply(lambda x: '.'.join(x)).reset_index()

In [26]:
combined_reviews[70:90]

Unnamed: 0,product_title,review_body
70,12 Years of Tears - Live,If you like soft cell / Marc Almond then this ...
71,"12"" Essentials: The Seventies",Got it mostly for Bionic Boogies Risky Changes...
72,120 Days of Genitorture,Looks to be the goth / metal band's first CD r...
73,122 Ellsworth,This is the best album I've heard in years. I...
74,13 & God,I can not stop listening to this album. I love...
75,14 Shades Of Grey,"13 tracks from Staind and only 3 are any good,..."
76,1492 - Conquest of Paradise,I first heard this music while watching a demo...
77,15 Crucial Reggae Cuts,Great Raggae CD here is the list of music <br...
78,15 Exitos,I loved this CD. There were a couple of the so...
79,15 Exitos De,On this CD you not only get some great tracks ...


In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
import pyspark.pandas as ps
from sumy.summarizers.text_rank import TextRankSummarizer

In [36]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("AppName") \
    .getOrCreate()
    
# Convert pandas DataFrame column to Spark DataFrame
spark_df = spark.createDataFrame(combined_reviews[['review_body']], schema=["sentence"])

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [37]:
# Use textrank to summarize the reviewbody

def summarize_textrank(reviews):
    NUM_SENTS = 1
    summary = str()
    parser = PlaintextParser.from_string(reviews, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    sentences = summarizer(parser.document, NUM_SENTS)  # Summarize the document with 1 sentences
    for sentence in sentences:
        summary += (sentence.__unicode__())
    return summary

In [38]:
summarize_reviews_udf = udf(summarize_textrank, StringType())

In [39]:
# Add a new column with summarized reviews
spark_df=spark_df.withColumn("summary", summarize_reviews_udf(spark_df["sentence"]))

In [40]:
spark_df.show()

23/08/10 16:26:53 WARN TaskSetManager: Stage 2 contains a task of very large size (1502 KiB). The maximum recommended task size is 1000 KiB.


[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|            sentence|             summary|
+--------------------+--------------------+
|If you're a fight...|If you're a fight...|
|I liked this cd. ...|Its a good compil...|
|I love his voice!...|Trust me when you...|
|Good music for ba...|Good music for ba...|
|Indie label 4 men...|many will want to...|
|I purchased the V...|If you're looking...|
|This is the best ...|This is the best ...|
|A spectacular alb...|Tommy Shaw has ne...|
|AM A GREAT BELLAM...|AM A GREAT BELLAM...|
|GRAB THIS ONE IF ...|It was surprising...|
|This is a great p...|At times, she wou...|
|This album is jus...|<br /> <br />Ther...|
|This is one of Ch...|This is one of Ch...|
|First off, let me...|<br />There first...|
|as with all music...|the good is reall...|
|Although it does ...|Although it does ...|
|This boxed set is...|I love 80's music...|
|I bought this CD ...|I think it is one...|
|This CD is remast...|This CD is remast...|
|This collection w...|This is ou

                                                                                

In [41]:
summary=spark_df.toPandas()

23/08/10 16:26:55 WARN TaskSetManager: Stage 3 contains a task of very large size (1502 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [42]:
summary

Unnamed: 0,sentence,summary
0,"If you're a fighter pilot, are with a fighter ...","If you're a fighter pilot, are with a fighter ..."
1,I liked this cd. Its a good compilation of son...,"Its a good compilation of songs, however I wis..."
2,I love his voice!!!!! My boyfriend knows that ...,"Trust me when you hear JP sing his OWN music, ..."
3,Good music for ballroom dancing -- both for th...,Good music for ballroom dancing -- both for th...
4,Indie label 4 men with Beards gets permission ...,many will want to get the turn table out of mo...
...,...,...
17100,"its like Tom Petty, remixed by David Lynch. <b...","<br /> <br />1) \\""la news\\"" is a great start..."
17101,"Very good studio moe. Plane crash, Happy Hour...","Plane crash, Happy Hour Hero, Nebraska are som..."
17102,"Sorry for the mixed metaphor of a title, but M...",Spear is one of reggae's all-time most importa...
17103,Compare to the others Estopa CD's this is not ...,Compare to the others Estopa CD's this is not ...


In [43]:
combined_reviews['summary']=summary['summary']

In [44]:
print(len(combined_reviews['review_body'][7]))
print(len(combined_reviews['summary'][7]))


3542
177


In [45]:
combined_reviews

Unnamed: 0,product_title,review_body,summary
0,"""2""","If you're a fighter pilot, are with a fighter ...","If you're a fighter pilot, are with a fighter ..."
1,"""DLG (Dark, Latin Groove) - Greatest Hits""",I liked this cd. Its a good compilation of son...,"Its a good compilation of songs, however I wis..."
2,"""If I Go Away"" / ""Man Like Me""",I love his voice!!!!! My boyfriend knows that ...,"Trust me when you hear JP sing his OWN music, ..."
3,"""John Skinner's Second Complete Ballroom Dance""",Good music for ballroom dancing -- both for th...,Good music for ballroom dancing -- both for th...
4,"""Metal Box 3x 12"""" 45 Rpm Lp in Metal Box""",Indie label 4 men with Beards gets permission ...,many will want to get the turn table out of mo...
...,...,...,...
17100,this is the fire,"its like Tom Petty, remixed by David Lynch. <b...","<br /> <br />1) \\""la news\\"" is a great start..."
17101,tin cans & car tires,"Very good studio moe. Plane crash, Happy Hour...","Plane crash, Happy Hour Hero, Nebraska are som..."
17102,travelling,"Sorry for the mixed metaphor of a title, but M...",Spear is one of reggae's all-time most importa...
17103,¿La Calle Es Tuya?,Compare to the others Estopa CD's this is not ...,Compare to the others Estopa CD's this is not ...


In [46]:
combined_reviews.to_csv('data_w_summaries.csv',index=False)