In [None]:
import pandas as pd
import math
import numpy as np
import nltk
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet as wn
from nltk.corpus import names as nameTypes

# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Fetch the NLTK corpora behind the `stopwords`, `wordnet` and `names` imports above.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('names')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!


True

In [None]:
# Reference date (1 September 2015, midnight) against which review ages are measured.
end_date = datetime(2015, 9, 1)


def get_beta(alpha):
  """Return ``e ** (-(ln(alpha) / alpha))`` for the given ``alpha``.

  The notebook uses this value as the ``beta`` base that accompanies a chosen
  ``alpha`` in the scoring functions.
  """
  exponent = -(1. / alpha * math.log(alpha, math.e))
  return math.e ** exponent

# Shared text-processing helpers used by the analysis cells below.
stop_words = set(stopwords.words('english'))
bad_words = {'aed', 'oed', 'eed'}
porter_stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

# Load each tab-separated review file and blank out missing values so that
# text columns can be concatenated without NaN checks.
df_hair, df_microwave, df_pacifier = (
  pd.read_csv(path, sep='\t').replace(np.nan, '', regex=True)
  for path in ('hair_dryer.tsv', 'microwave.tsv', 'pacifier.tsv')
)

# Dataset name -> review frame, in the order the later cells iterate over.
df_all = dict(hair_dryer=df_hair, microwave=df_microwave, pacifier=df_pacifier)

In [None]:
# Show the first hair-dryer review to see which columns each record carries.
df_hair.iloc[0]

marketplace                                                         US
customer_id                                                   34678741
review_id                                                R9T1FE2ZX2X04
product_id                                                  B003V264WW
product_parent                                               732252283
product_title        remington ac2015 t|studio salon collection pea...
product_category                                                Beauty
star_rating                                                          5
helpful_votes                                                        0
total_votes                                                          0
vine                                                                 N
verified_purchase                                                    Y
review_headline                                            Works great
review_body                                               Works great!
review

In [None]:
## part a functions
def informitivity(length, num_help, num_unhelp, time, verified, vine,
                  alpha, beta, gamma, delta):
  """Score how informative a single review is.

  The score is the product of five factors, multiplied left to right:
    * ``length``                             -- size of the review text
    * ``log_alpha(num_help / time + alpha)`` -- helpful votes per unit time
    * ``beta ** (num_unhelp / time)``        -- unhelpful votes per unit time
    * ``gamma ** verified``                  -- verified-purchase flag (0 or 1)
    * ``delta ** vine``                      -- vine flag (0 or 1)
  """
  helpful_factor = math.log(num_help / time + alpha, alpha)
  unhelpful_factor = beta ** (num_unhelp / time)
  verified_factor = gamma ** verified
  vine_factor = delta ** vine
  return length * helpful_factor * unhelpful_factor * verified_factor * vine_factor

def importance(data, alpha, beta, gamma, delta):
  """Score every review in ``data`` and rank them, most informative first.

  Args:
    data: review frame with the columns ``review_body``, ``helpful_votes``,
      ``total_votes``, ``review_date`` (``%m/%d/%Y``), ``verified_purchase``
      and ``vine``.
    alpha, beta, gamma, delta: weights forwarded unchanged to ``informitivity``.

  Returns:
    A list of ``(index_label, score)`` tuples covering *all* rows, sorted by
    descending score. Callers slice it to take the top N.
  """
  reviews = {}

  for key in data.index:
    length = len(data['review_body'][key])
    num_help = data['helpful_votes'][key]
    num_unhelp = data['total_votes'][key] - num_help
    posted = datetime.strptime(data['review_date'][key], '%m/%d/%Y')
    # Review age in 30-day months, floored at one month so the per-month vote
    # rates never divide by a value below 1.
    time = max((end_date - posted).days / 30, 1)
    # Compare flag values with ==; `is 'Y'` tests object identity, which only
    # matches when the interpreter happens to reuse the same string object.
    verified = 1 if data['verified_purchase'][key] == 'Y' else 0
    vine = 1 if data['vine'][key] == 'Y' else 0
    reviews[key] = informitivity(length, num_help, num_unhelp, time, verified,
                                 vine, alpha, beta, gamma, delta)

  # Full ranking by score, best first (stable for equal scores).
  return sorted(reviews.items(), key=lambda item: item[1], reverse=True)

In [None]:
## part a execution
alpha = math.e
beta = get_beta(alpha)
reviewSave = []
for key in df_all:
  reviews = importance(df_all[key], alpha, beta, 1.1, 5)
  # Keep the three best-scoring reviews of each dataset together with their
  # full record. `product` is the row label returned by importance().
  for rank, (product, score) in enumerate(reviews[:3], start=1):
    record = df_all[key].iloc[product]
    reviewSave.append([key, product, rank, score] + list(record.values))
reviewPD = pd.DataFrame(
  reviewSave,
  columns=['dataset', 'product_id', 'rank', 'score'] + list(df_hair.keys()))
reviewPD.to_csv('partA.csv')

In [None]:
## part a criteria
# Stability check: what fraction of the top `total_test` reviews stays in the
# top list for every tested (alpha, beta) pair?
alphas = [2, math.e, 3]
betas = [get_beta(a) for a in alphas]
total_test = 100

top_same = []
for key in df_all:
  # One set of top-ranked row labels per parameter setting.
  top_sets = []
  for test_alpha, test_beta in zip(alphas, betas):
    reviews = importance(df_all[key], test_alpha, test_beta, 1.1, 5)
    top_sets.append({p[0] for p in reviews[:total_test]})
  # Intersect all settings at once. Seeding on "is the running set empty?"
  # would restart the comparison whenever the intersection legitimately
  # became empty and so overstate the stability.
  same_set = set.intersection(*top_sets)
  top_same.append(len(same_set)/total_test)

# top_same follows df_all's insertion order: hair_dryer, microwave, pacifier.
print(f'hair dryer dataset top review stability: {top_same[0]*100}%')
print(f'microwave dataset top review stability: {top_same[1]*100}%')
print(f'pacifier dataset top review stability: {top_same[2]*100}%')

In [None]:
## part b definition
def reputation(rating, num_help, num_unhelp, time, verified, vine,
                  alpha, beta, gamma, delta):
  """Signed reputation contribution of a single review.

  ``rating - 3`` centres the star rating so that 3 stars is neutral; the
  remaining factors are the same helpful-vote, unhelpful-vote, verified and
  vine weights used by the informativeness score, multiplied left to right.
  """
  centred_rating = rating - 3
  helpful_factor = math.log(num_help / time + alpha, alpha)
  unhelpful_factor = beta ** (num_unhelp / time)
  verified_factor = gamma ** verified
  vine_factor = delta ** vine
  return centred_rating * helpful_factor * unhelpful_factor * verified_factor * vine_factor

def _running_average_series(stamps, values, rate):
  """Downsample a chronological series, emitting a point each time more than ``rate`` has elapsed.

  Each emitted value is the mean of *all* values seen so far (a running
  average), not just those since the previous emitted point.
  NOTE(review): the accumulator is never reset, exactly as before; confirm
  that a cumulative rather than per-window average is what is wanted.
  An empty series yields two empty lists.
  """
  new_time = []
  new_trend = []
  if not stamps:
    return new_time, new_trend
  start = stamps[0]
  seen = []
  for stamp, value in zip(stamps, values):
    seen.append(value)
    if stamp - start > rate:
      new_time.append(stamp)
      new_trend.append(sum(seen) / len(seen))
      start = stamp
  return new_time, new_trend


def trend(data, alpha, beta, gamma, delta, diff_weight, dataset):
  """Compute a reputation value per product plus plot series for three products.

  For each product the per-review ``reputation`` scores (discounted by review
  age in days) are summed and divided by ``ln(days since first review + 1)``.
  A second term, scaled by ``diff_weight``, adds the difference between the
  mean score of the reviews within 30 days of the product's last review and
  the mean score of those within 30 days of its first review.

  NOTE(review): the first row seen for a product is taken as its latest review
  and the last row seen as its earliest, i.e. rows are presumed to be ordered
  newest first -- confirm against the data files.

  Args:
    data: review frame with ``product_id``, ``star_rating``, ``helpful_votes``,
      ``total_votes``, ``verified_purchase``, ``vine`` and ``review_date``.
    alpha, beta, gamma, delta: weights forwarded to ``reputation``.
    diff_weight: multiplier of the last-30-days vs first-30-days term.
    dataset: dataset name; ``'pacifier'`` plots the 10th-worst product in
      place of the very worst one.

  Returns:
    ``(total_rep, times, trends)`` where ``total_rep`` maps product id to its
    reputation, and ``times`` / ``trends`` are three-element lists holding the
    resampled series of the best, worst and second-ranked product, in that
    order. For an empty ``data`` the result is ``({}, [[], [], []], [[], [], []])``.
  """
  # Parse each review date once instead of on every use.
  review_dates = {key: datetime.strptime(data['review_date'][key], '%m/%d/%Y')
                  for key in data.index}

  products = {}       # product id -> sum of age-discounted review scores
  product_start = {}  # product id -> date of the last row seen for it
  product_end = {}    # product id -> date of the first row seen for it
  scores = {}         # row label -> that row's age-discounted score

  for key in data.index:
    productID = data['product_id'][key]
    posted = review_dates[key]
    if productID not in products:
      products[productID] = 0
      product_end[productID] = posted
    product_start[productID] = posted
    rating = data['star_rating'][key]
    num_help = data['helpful_votes'][key]
    num_unhelp = data['total_votes'][key] - num_help
    # Flags are compared by value; `is 'Y'` would test object identity.
    verified = 1 if data['verified_purchase'][key] == 'Y' else 0
    vine = 1 if data['vine'][key] == 'Y' else 0
    # Floor the age at one day so a review dated on end_date cannot divide by zero.
    age_days = max((end_date - posted).days, 1)
    score = reputation(rating, num_help, num_unhelp, age_days, verified, vine,
                       alpha, beta, gamma, delta)
    products[productID] += score
    scores[key] = score

  if not products:
    return {}, [[], [], []], [[], [], []]

  # Normaliser: natural log of the product's time on the market in days (+1).
  length = {pid: math.log((end_date - product_start[pid]).days + 1, math.e)
            for pid in products}

  # Scores of the reviews within 30 days of each product's first / last review.
  first30 = {pid: [] for pid in products}
  last30 = {pid: [] for pid in products}
  for key in data.index:
    pid = data['product_id'][key]
    posted = review_dates[key]
    if (posted - product_start[pid]).days <= 30:
      first30[pid].append(scores[key])
    if (product_end[pid] - posted).days <= 30:
      last30[pid].append(scores[key])

  # Final reputation: weighted recent-vs-early difference plus normalised total.
  total_rep = {}
  for pid in products:
    first_avg = sum(first30[pid]) / len(first30[pid])
    last_avg = sum(last30[pid]) / len(last30[pid])
    diff_score = (last_avg - first_avg) / length[pid] * diff_weight
    total_rep[pid] = diff_score + products[pid] / length[pid]

  ranked = sorted(total_rep, key=total_rep.get, reverse=True)

  # Products to plot: best, worst, and the one at rank `middleRank` (second).
  # Indices are clamped so that small catalogues do not raise IndexError.
  middleRank = 1
  bestKey = ranked[0]
  worstKey = ranked[-1]
  middleKey = ranked[min(middleRank, len(ranked) - 1)]
  # pacifier worst key doesn't work
  if dataset == 'pacifier':
    worstKey = ranked[-min(10, len(ranked))]

  bestTime, bestTrend = [], []
  worstTime, worstTrend = [], []
  middleTime, middleTrend = [], []
  for key in data.index:
    product_id = data['product_id'][key]
    # Product ids are matched by value (==), not by object identity.
    if product_id == bestKey:
      stamps, values = bestTime, bestTrend
    elif product_id == worstKey:
      stamps, values = worstTime, worstTrend
    elif product_id == middleKey:
      stamps, values = middleTime, middleTrend
    else:
      continue
    rating = data['star_rating'][key]
    num_help = data['helpful_votes'][key]
    num_unhelp = data['total_votes'][key] - num_help
    verified = 1 if data['verified_purchase'][key] == 'Y' else 0
    vine = 1 if data['vine'][key] == 'Y' else 0
    # The plotted score uses time = 1, i.e. no age discount.
    score = reputation(rating, num_help, num_unhelp, 1, verified, vine,
                       alpha, beta, gamma, delta)
    stamps.append(review_dates[key])
    values.append(score)

  times = []
  trends = []
  # Emit one running-average point whenever more than 10 days have elapsed.
  rate = timedelta(10)
  for stamps, values in [(bestTime, bestTrend), (worstTime, worstTrend),
                         (middleTime, middleTrend)]:
    # Rows were collected in file order; reverse them before resampling.
    new_time, new_trend = _running_average_series(stamps[::-1], values[::-1], rate)
    times.append(new_time)
    trends.append(new_trend)

  return total_rep, times, trends

In [None]:
## part b execution
plt.figure(figsize=(50,75))
plt.rcParams.update({'font.size': 25})

# (title suffix, index into trend()'s series) for the three columns.
# trend() returns its series in the order best, worst, second-ranked.
panels = [(' First Rank vs reputation', 0),
          (' Second Rank vs reputation', 2),
          (' Worst Rank vs reputation', 1)]

i = 0  # grid row: one per (dataset, diff_weight) combination
for key in df_all:
  for diff_weight in [0,100]:
    # overall reputation and weighted difference
    _, times, trends = trend(df_all[key], alpha, beta, 1.1, 5, diff_weight, key)
    # Compare the number by value; `is 0` is an identity check on an int literal.
    weighted = '' if diff_weight == 0 else ' weighted'

    for column, (label, series) in enumerate(panels):
      plt.subplot2grid((6,3), (i,column))
      plt.title(key + weighted + label)
      plt.ylabel('reputation')
      plt.grid()
      plt.plot(times[series], trends[series])

    i += 1

plt.savefig('partB.png')
plt.close()

In [None]:
## part c graph
# target features:
#   type of people
#     categorize by gender
# Words taken as a mention of a male / female person in a review.
male = {'he', 'his', 'him', 'husband', 'son', 'father', 'dad', 'grandpa',
        'grandfather', 'boyfriend'}
female = {'she', 'her', 'wife', 'daughter', 'mother', 'mom', 'grandma',
          'grandmother', 'girlfriend'}
#     categorize by age
# Each age group has a word list and an integer age range (upper bound exclusive).
young = {'kid', 'child', 'children', 'son', 'daughter', 'teenager', 'boy',
         'girl', 'teen', 'young'}
youngAge = set(range(0, 19))
middle = {'husband', 'wife', 'boyfriend', 'girlfriend'}
middleAge = set(range(19, 61))
old = {'father', 'dad', 'grandpa', 'grandfather', 'mother', 'mom', 'grandma',
       'grandmother'}
oldAge = set(range(61, 100))

# histogram by different rating/gender
# total review number by month from different gender
def freqByGender(data):
  """Count reviews that mention a male or a female person, by rating and by month.

  A review counts for a gender when any token of its headline or body, or the
  Porter stem of that token, is in the ``male`` / ``female`` word set. A review
  may count for both genders, and counts at most once per gender.

  Args:
    data: review frame with ``review_headline``, ``review_body``,
      ``star_rating`` (1-5) and ``review_date`` (month first, '/'-separated).

  Returns:
    ``(total_review, boys, girls)``: ``total_review[0]`` / ``total_review[1]``
    are 12-element per-month counts for male / female mentions; ``boys`` and
    ``girls`` are 5-element per-star-rating counts.
  """
  total_review = [[0] * 12, [0] * 12]
  boys = [0] * 5
  girls = [0] * 5

  for key in data.index:
    line = data['review_headline'][key] + ' ' + data['review_body'][key]
    rating = data['star_rating'][key]
    month = int(data['review_date'][key].split('/')[0])
    words = set(tokenizer.tokenize(line.lower()))
    # Stem every distinct token once. The raw token is looked up as well,
    # because the stemmer may rewrite a listed word itself (e.g. 'his'), in
    # which case the stem alone never matches the unstemmed word list.
    forms = words | {porter_stemmer.stem(word) for word in words}
    if not forms.isdisjoint(male):
      boys[rating-1] += 1
      total_review[0][month-1] += 1
    if not forms.isdisjoint(female):
      girls[rating-1] += 1
      total_review[1][month-1] += 1

  return total_review, boys, girls

# histogram by different rating/age
# total review number by month from different age
def freqByAge(data):
  """Count reviews that mention a young, middle-aged or elderly person, by rating and by month.

  A review counts for an age group when any token (or its Porter stem) is in
  the group's word set, or when a purely numeric token falls inside the
  group's integer age range. A review may count for several groups, and counts
  at most once per group.

  Args:
    data: review frame with ``review_headline``, ``review_body``,
      ``star_rating`` (1-5) and ``review_date`` (month first, '/'-separated).

  Returns:
    ``(total_review, low, center, high)``: ``total_review[g]`` is the
    12-element per-month count for group ``g`` (0 young, 1 middle, 2 old);
    ``low`` / ``center`` / ``high`` are 5-element per-star-rating counts.
  """
  total_review = [[0] * 12, [0] * 12, [0] * 12]
  low = [0] * 5
  center = [0] * 5
  high = [0] * 5
  groups = [(young, youngAge, low), (middle, middleAge, center), (old, oldAge, high)]

  for key in data.index:
    line = data['review_headline'][key] + ' ' + data['review_body'][key]
    rating = data['star_rating'][key]
    month = int(data['review_date'][key].split('/')[0])
    words = set(tokenizer.tokenize(line.lower()))
    # Stem every distinct token once and also keep the raw token, because the
    # stemmer may rewrite a listed word itself (e.g. 'teenager').
    forms = words | {porter_stemmer.stem(word) for word in words}
    # The age ranges hold ints while tokens are strings, so a direct
    # `token in youngAge` test can never succeed; convert numeric tokens first.
    # NOTE(review): every number in the text is treated as an age -- confirm
    # that this is the intended heuristic.
    numbers = {int(word) for word in words if word.isdecimal()}

    for slot, (vocab, ages, by_rating) in enumerate(groups):
      if not forms.isdisjoint(vocab) or not numbers.isdisjoint(ages):
        by_rating[rating-1] += 1
        total_review[slot][month-1] += 1

  return total_review, low, center, high

plt.figure(figsize=(50,50))
plt.rcParams.update({'font.size': 25})
ratings = np.arange(1,6)
months = np.arange(1,13)

# plot classify by gender: grid row 0 is per rating, grid row 1 is per month
for counter, key in enumerate(df_all):
  total_review, boys, girls = freqByGender(df_all[key])
  gender_panels = [
    (0, ratings, [boys, girls], 'rating', ' reviews by gender and rating'),
    (1, months, total_review, 'month', ' reviews by gender and month'),
  ]
  for grid_row, xs, series, axis_name, title in gender_panels:
    plt.subplot2grid((4,3), (grid_row,counter))
    # Two bars side by side around each tick: male in blue, female in red.
    for shift, heights, colour in zip([-0.1, 0.1], series, ['b', 'r']):
      plt.bar(xs+shift, heights, width=0.2, color=colour, align='center')
    plt.legend(['male', 'female'])
    plt.grid()
    plt.xlabel(axis_name)
    plt.ylabel('number of reviews')
    plt.title(key + title)

# plot classify by age: grid row 2 is per rating, grid row 3 is per month
for counter, key in enumerate(df_all):
  total_review, low, center, high = freqByAge(df_all[key])
  age_panels = [
    (2, ratings, [low, center, high], 'rating', ' reviews by age and rating'),
    (3, months, total_review, 'month', ' reviews by age and month'),
  ]
  for grid_row, xs, series, axis_name, title in age_panels:
    plt.subplot2grid((4,3), (grid_row,counter))
    # Three bars around each tick: young (blue), middle age (green), elder (red).
    for shift, heights, colour in zip([-0.2, 0, 0.2], series, ['b', 'g', 'r']):
      plt.bar(xs+shift, heights, width=0.2, color=colour, align='center')
    plt.legend(['young', 'middle age', 'elder'])
    plt.grid()
    plt.xlabel(axis_name)
    plt.ylabel('number of reviews')
    plt.title(key + title)

plt.savefig('partC.png')
plt.close()

In [None]:
## part c feature extraction function
def wordFreq(data, alpha, beta, gamma, delta, diff_weight, dataset):
  """Weight the words used in reviews of the best and worst products by product reputation.

  The ten products with the highest and the ten with the lowest reputation
  (as computed by ``trend``) are selected. Every non-stop-word token in a
  review of such a product adds that product's reputation to the word's total.

  Args:
    data: review frame with ``product_id``, ``review_headline`` and
      ``review_body`` (plus the columns ``trend`` needs).
    alpha, beta, gamma, delta, diff_weight, dataset: forwarded to ``trend``.

  Returns:
    ``(goodWords, badWords)``: lists of ``(word, total)`` tuples. ``goodWords``
    covers the top products, sorted by descending total; ``badWords`` covers
    the bottom products, sorted by ascending total (lowest totals first).
  """
  reps, _, _ = trend(data, alpha, beta, gamma, delta, diff_weight, dataset)
  sortedKeys = sorted(reps, key=reps.get, reverse=True)
  # Sets give constant-time membership tests in the per-row loop below.
  goodKeys = set(sortedKeys[:10])
  badKeys = set(sortedKeys[-10:])

  # Local names chosen so the module-level `bad_words` set is not shadowed.
  good_totals = {}
  bad_totals = {}

  for key in data.index:
    product_id = data['product_id'][key]
    in_good = product_id in goodKeys
    in_bad = product_id in badKeys
    if not (in_good or in_bad):
      continue
    line = data['review_headline'][key] + ' ' + data['review_body'][key]
    weight = reps[product_id]
    for word in tokenizer.tokenize(line.lower()):
      if word in stop_words:
        continue
      # With fewer than 20 products a product can be in both groups.
      if in_good:
        good_totals[word] = good_totals.get(word, 0) + weight
      if in_bad:
        bad_totals[word] = bad_totals.get(word, 0) + weight

  goodWords = sorted(good_totals.items(), key=lambda item: item[1], reverse=True)
  badWords = sorted(bad_totals.items(), key=lambda item: item[1])
  return goodWords, badWords

In [None]:
## part c feature extraction execution
# Run once without (0) and once with (100) the recent-vs-early weighting,
# writing one CSV per dataset for each run.
for diff_weight, suffix in [(0, '_wordReps.csv'), (100, '_WeightedwordReps.csv')]:
  for key in df_all:
    goodWords, badWords = wordFreq(df_all[key], alpha, beta, 1.1, 5, diff_weight, key)
    # goodWords is sorted highest total first and badWords lowest total first,
    # so the most significant words of both lists are at the front. Taking
    # badWords[-100:] would select the bad-product words with the weakest
    # (least negative) totals instead.
    meaningful = goodWords[:100] + badWords[:100]
    wordPD = pd.DataFrame(meaningful)
    wordPD.to_csv(key + suffix)

In [None]:
## part d functions
def mapRating(rating):
  """Map an average rating onto a whole star bucket from 1 to 5.

  Buckets are 0.8 wide with inclusive upper bounds 1.8, 2.6, 3.4 and 4.2;
  anything that is not at or below 4.2 falls into bucket 5.
  """
  upper_bounds = (1.8, 2.6, 3.4, 4.2)
  for stars, bound in enumerate(upper_bounds, start=1):
    if rating <= bound:
      return stars
  return 5

def ratingTrend(data):
  """Compare products' average rating before and after their biggest jump in review volume.

  Reviews are grouped per product into consecutive windows: a window is closed
  as soon as a row is more than 30 days older than the row that opened it.
  NOTE(review): this presumes rows are ordered newest first -- confirm. The
  window still open when the scan ends is not counted, as before.

  For every product with at least five closed windows, the pair of
  neighbouring windows (in reversed, i.e. presumably chronological, order) with
  the largest increase in review count is located; the first such pair wins
  ties. The mean of the window-average ratings up to and including the
  earlier window, and the mean of those from the later window on, are each
  mapped to a star bucket with ``mapRating``.

  Args:
    data: review frame with ``product_id``, ``review_date`` (``%m/%d/%Y``)
      and ``star_rating``.

  Returns:
    ``(before, after)``: two 5-element numpy arrays; element ``s - 1`` is the
    number of products whose mean rating before / after the jump maps to
    bucket ``s``.
  """
  num_review = {}    # product id -> review count of each closed window
  ratingAvg = {}     # product id -> average rating of each closed window
  window_start = {}  # product id -> date of the row that opened the current window
  open_ratings = {}  # product id -> ratings collected in the current window

  for key in data.index:
    pid = data['product_id'][key]
    date = datetime.strptime(data['review_date'][key], '%m/%d/%Y')
    rating = data['star_rating'][key]
    if pid not in num_review:
      num_review[pid] = []
      ratingAvg[pid] = []
      window_start[pid] = date
      open_ratings[pid] = []
    if (window_start[pid] - date).days > 30:
      # Close the current window (it always holds at least the row that
      # opened it) and start a new one at this row.
      closed = open_ratings[pid]
      num_review[pid].append(len(closed))
      ratingAvg[pid].append(sum(closed) / len(closed))
      window_start[pid] = date
      open_ratings[pid] = []
    open_ratings[pid].append(rating)

  before = [0] * 5
  after = [0] * 5
  for pid, counts in num_review.items():
    if len(counts) < 5:
      continue
    # Windows were collected in file order; reverse them before comparing.
    counts = counts[::-1]
    averages = ratingAvg[pid][::-1]
    # Change in review count between each pair of neighbouring windows.
    jumps = [later - earlier for earlier, later in zip(counts, counts[1:])]
    split = jumps.index(max(jumps)) + 1
    early = averages[:split]
    late = averages[split:]
    # Use the mean of the window averages: their plain sum grows with the
    # number of windows and would push almost every product into bucket 5.
    before[mapRating(sum(early) / len(early)) - 1] += 1
    after[mapRating(sum(late) / len(late)) - 1] += 1

  return np.array(before), np.array(after)

In [None]:
## part d execution
plt.figure(figsize=(30,10))
plt.rcParams.update({'font.size': 25})
ratings = np.arange(1,6)
# One panel per dataset on a single row of three.
for counter, key in enumerate(df_all, start=1):
  before, after = ratingTrend(df_all[key])
  # Give rows, columns and index as integers; a '13N' string is not a valid
  # position spec for plt.subplot in current Matplotlib releases.
  plt.subplot(1, 3, counter)
  plt.bar(ratings-0.2, before, width=0.2, color='b', align='center')
  plt.bar(ratings, after, width=0.2, color='g', align='center')
  plt.bar(ratings+0.2, after-before, width=0.2, color='r', align='center')
  plt.legend(['before', 'after', 'difference'])
  plt.grid()
  plt.xlabel('average rating')
  plt.ylabel('number of brand')
  plt.title(key)
plt.savefig('partD.png')
plt.close()

In [None]:
## part e functions
def isAdj(word):
  """Return True when WordNet lists at least one adjective sense for ``word``.

  Both head adjectives (pos ``'a'``) and satellite adjectives (pos ``'s'``)
  count. Checking ``'a'`` alone misses words whose adjective senses are all
  satellites.
  """
  return any(synset.pos() in ('a', 's') for synset in wn.synsets(word))

def descriptors(data):
  """Count adjective occurrences in review headlines and bodies, per star rating.

  Args:
    data: review frame with ``review_headline``, ``review_body`` and
      ``star_rating`` (1-5).

  Returns:
    A dict mapping each rating 1..5 to a ``{word: count}`` dict covering every
    non-stop-word token that ``isAdj`` accepts.
  """
  rateDict = {stars: {} for stars in range(1, 6)}
  # isAdj is a WordNet lookup; remember the answer for each distinct word.
  adjective_cache = {}

  for key in data.index:
    rating = data['star_rating'][key]
    # Headline and body are handled independently, so a non-string headline
    # no longer causes the body of the same review to be skipped.
    for column in ('review_headline', 'review_body'):
      line = data[column][key]
      if not isinstance(line, str):
        continue
      for word in tokenizer.tokenize(line.lower()):
        if word in stop_words:
          continue
        if word not in adjective_cache:
          adjective_cache[word] = isAdj(word)
        if not adjective_cache[word]:
          continue
        counts = rateDict[rating]
        counts[word] = counts.get(word, 0) + 1

  return rateDict

In [None]:
## part e execution
rateDictHair = descriptors(df_hair)
rateDictMicrowave = descriptors(df_microwave)
rateDictPacifier = descriptors(df_pacifier)

alltypes = {'hair_dryer':rateDictHair, 'microwave':rateDictMicrowave, 'pacifier':rateDictPacifier}
# One output row per (dataset, rating): the 50 most frequent adjectives as
# (word, count) pairs, most frequent first.
freqs = []
for key, by_rating in alltypes.items():
  for rating, currDict in by_rating.items():
    top_words = sorted(currDict, key=currDict.get, reverse=True)[:50]
    freqs.append([key, rating] + [(k, currDict[k]) for k in top_words])
freqsPD = pd.DataFrame(freqs, columns=['product', 'rating'] + ['wordFreq'] * 50)
freqsPD.to_csv('partE.csv')