### Data Cleaning

In [1]:
import numpy as np
import json
import pandas as pd

import warnings

warnings.filterwarnings("ignore")

import urllib
import scipy.optimize
import random
from collections import defaultdict  # Dictionaries with default values
from math import log10
import string
from nltk.stem.porter import *
from nltk import bigrams
from sklearn import linear_model
import ast

In [2]:
# read the data
path = "renttherunway_final_data.json"
f = open(path, "r")

dataset_ = []
for i in f:
    dataset_.append(json.loads(i))

# build dataset dataframe
dataset = pd.DataFrame()
dataset = dataset.append(dataset_, ignore_index=True)
dataset = dataset.drop(["bust size", "weight", "body type", "height", "size"], axis=1)

In [3]:
# Since some review data do not have age features, we calculate the average age from the remaining data
age = []
for i in dataset_:
    if "age" in i.keys():
        age.append(int(i["age"]))
avgAge = sum(age) / len(age)

dataset[["age"]] = dataset[["age"]].fillna(avgAge)
dataset[["age"]] = dataset[["age"]].astype(int)

In [4]:
# Since some data does not have rating info (82), we remove these data
for i in range(len(dataset["rating"])):
    if dataset["rating"][i] == None:
        dataset = dataset.drop(i)
dataset[["rating"]] = dataset[["rating"]].astype(int)
dataset["rating"] = dataset["rating"].map(lambda x: x / 2)

In [5]:
# segment data as positive and negative
def partition(x):
    if x <= 3:
        return "negative"
    return "positive"


# actualScore = filtered_data['Score']
# positiveNegative = actualScore.map(partition)
dataset["P/N"] = dataset["rating"].map(partition)

### shuffle the data and split data

In [6]:
from sklearn.utils import shuffle

dataset = shuffle(dataset)
random.seed(1234)

Ntrain, Nvalid, Ntest = (
    round(len(dataset) * 0.8),
    round(len(dataset) * 0.1),
    round(len(dataset) * 0.1),
)
# split the data
data_train = dataset[:Ntrain]
data_valid = dataset[Ntrain : Nvalid + Ntrain]
data_test = dataset[Nvalid + Ntrain :]

### naive classifier 
f (t) = α The main idea for this classifier is that we generates predictions by computing averages for each user, or return the global average if we've never seen the user before

In [7]:
# calculate global average and each user average in training data
y_train = data_train[["rating"]]
globalAverage = y_train.mean()

UserAvg = data_train.groupby("user_id")["rating"].mean()

In [8]:
# define mse function
def MSE(pred, data):
    differences = [(x - y) ** 2 for x, y in zip(pred, data)]
    mse = sum(differences) / len(differences)
    return mse

In [9]:
# define predictor
def predict(user_id):
    if user_id in UserAvg.keys():
        pred = UserAvg[user_id]
    else:
        pred = globalAverage
    return pred

In [10]:
# test on validation set
y_valid = data_valid["rating"]
y_pred_valid = [predict(i) for i in data_valid["user_id"]]
MSE_valid = MSE(y_pred_valid, y_valid)
print("MSE on validation set is", str(MSE_valid))

MSE on validation set is rating    0.614557
dtype: float64


In [11]:
# test on testing set
y_test = data_test["rating"]
y_pred_test = [predict(i) for i in data_test["user_id"]]
MSE_test = MSE(y_pred_test, y_test)
print("MSE on testing set is", str(MSE_test))

MSE on testing set is rating    0.602288
dtype: float64


### Similarity prediction
using  similarity 

In [12]:
# allRatings = []
# userRatings = defaultdict(list)

# for user,book,r in readCSV("train_Interactions.csv.gz"):
#   r = int(r)
#   allRatings.append(r)
#   userRatings[user].append(r)

# globalAverage = sum(allRatings) / len(allRatings)
# userAverage = {}
# for u in userRatings:
#   userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

In [13]:
dataset["P/N"].value_counts().plot(kind="bar")

<matplotlib.axes._subplots.AxesSubplot at 0x7f9f042946d8>

In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import bigrams

nltk.download("stopwords")

# build text cleaning funcion
punct = string.punctuation
stemmer = PorterStemmer()
stop = set(stopwords.words("english"))  # set of stopwords


def cleanText(text):
    t = text.lower()  # lowercase string
    t = [c for c in t if not (c in punct)]  # non-punct characters
    t = [c for c in t if not (c.isdigit())]
    t = "".join(t)  # convert back to string
    t = t.strip().split()  # tokenizes
    words = [c for c in t if not (c in stop)]  # remove stopwords
    return words

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Linear regression with bag of words

#### C.1. Unigram

In [15]:
# List the 10 most-frequently-occurring unigrams along with their number of occurrences in the corpus
# build unigramCount dictionary
unigramCount = defaultdict(int)
for d in data_train["review_text"]:
    words = cleanText(d)
    for w in words:
        unigramCount[w] += 1

In [16]:
unigramCounts = [(unigramCount[w], w) for w in unigramCount]
unigramCounts.sort()
unigramCounts.reverse()

In [17]:
print("Top 10 words for unigram."), unigramCounts[:10]

Top 10 words for unigram.


(None,
 [(204844, 'dress'),
  (74597, 'fit'),
  (62852, 'size'),
  (54231, 'would'),
  (50345, 'wear'),
  (46439, 'wore'),
  (44208, 'great'),
  (44073, 'little'),
  (41767, 'perfect'),
  (38196, 'comfortable')])

#### C.2. bigram

In [18]:
# List the 10 most-frequently-occurring bigrams along with their number of occurrences in the corpus
# build bigramCount dictionary

bigramCount = defaultdict(int)
for d in data_train["review_text"]:
    words = cleanText(d)
    for w in range(len(words) - 1):
        bigram = words[w] + " " + words[w + 1]
        bigramCount[bigram] += 1

bigramCounts = [(bigramCount[w], w) for w in bigramCount]
bigramCounts.sort()
bigramCounts.reverse()

In [19]:
print("Top 10 words for bigram."), bigramCounts[:10]

Top 10 words for bigram.


(None,
 [(13777, 'true size'),
  (12487, 'many compliments'),
  (11052, 'loved dress'),
  (9613, 'dress fit'),
  (9086, 'fit perfectly'),
  (7054, 'would definitely'),
  (6313, 'received many'),
  (6055, 'wore dress'),
  (5494, 'dress perfect'),
  (5466, 'great dress')])

#### Using the 2000 unigrams to train a regressor

In [20]:
# Using the 1000 unigrams to train a regressor
uniwords = [x[1] for x in unigramCounts[:2000]]
uniwordId = dict(zip(uniwords, range(len(uniwords))))
uniwordSet = set(uniwords)


def feature_uni(datum):
    feat = [0] * len(uniwordSet)
    words = cleanText(datum)
    for w in words:
        if not (w in uniwordSet):
            continue
        feat[uniwordId[w]] += 1
    feat.append(1)
    return feat

In [21]:
X_train = [feature_uni(d) for d in data_train["review_text"]]
y_train = [d for d in data_train["rating"]]
X_valid = [feature_uni(d) for d in data_valid["review_text"]]
y_valid = [d for d in data_valid["rating"]]
X_test = [feature_uni(d) for d in data_test["review_text"]]
y_test = [d for d in data_test["rating"]]

In [22]:
# train regression model
from sklearn import linear_model

clf = linear_model.Ridge(2.0, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [23]:
theta[:20]

array([-0.02629944,  0.03034688,  0.01027751, -0.03141446, -0.02492395,
        0.04781551,  0.08056337,  0.03427384,  0.14210758,  0.0980875 ,
        0.12559584,  0.1195039 , -0.04282727,  0.00641369,  0.00547121,
        0.00032832,  0.05089087,  0.05811224, -0.06673966, -0.02427626])

In [24]:
weights = list(zip(theta, uniwords + ["constant_feat"]))
weights.sort()
print("Top 10 positive Coefficient ", weights[-10:])
print("Top 10 negative coefficient ", weights[:10])

Top 10 positive Coefficient  [(0.171619349540983, 'buying'), (0.17956885248039733, 'nicole'), (0.17974411851324626, 'glove'), (0.18315833765492484, 'flatters'), (0.19171917352113724, 'adore'), (0.19247067246109556, 'perfection'), (0.1936580392940384, 'dream'), (0.20662049199311994, 'movie'), (0.23952263966875728, 'happier'), (4.4396186177602575, 'constant_feat')]
Top 10 negative coefficient  [(-0.7750650090470079, 'disappointing'), (-0.5770152143900348, 'unflattering'), (-0.36766105290605716, 'unable'), (-0.36029858990415753, 'frumpy'), (-0.35712876231527463, 'disappointed'), (-0.3570379010055114, 'odd'), (-0.3478392448445271, 'cheap'), (-0.34737844869288337, 'awkwardly'), (-0.3468383231176177, 'strange'), (-0.34456867352735676, 'bulky')]


In [25]:
# predict on validation set and testing set
y_pred_valid = clf.predict(X_valid)
y_pred_test = clf.predict(X_test)

In [26]:
# calculate MSE for unigrams regressor
from sklearn.metrics import mean_squared_error

print("MSE on validation set:", mean_squared_error(y_valid, y_pred_valid))
print("MSE on testing set:", mean_squared_error(y_test, y_pred_test))

MSE on validation set: 0.38447959854428265
MSE on validation set: 0.3613198470884725


#### Using the 1000 bigrams to train a regressor

In [27]:
# Using the 1000 bigrams to train a regressor
biwords = [x[1] for x in bigramCounts[:2000]]
biwordId = dict(zip(biwords, range(len(biwords))))
biwordSet = set(biwords)


def feature_bi(datum):
    feat = [0] * len(biwordSet)
    words = cleanText(datum)
    for i in range(len(words) - 1):
        bigram = words[i] + " " + words[i + 1]
        if bigram in biwords:
            feat[biwordId[bigram]] += 1
    feat.append(1)
    return feat

In [28]:
X_train = [feature_bi(d) for d in data_train["review_text"]]
y_train = [d for d in data_train["rating"]]
X_valid = [feature_bi(d) for d in data_valid["review_text"]]
y_valid = [d for d in data_valid["rating"]]
X_test = [feature_bi(d) for d in data_test["review_text"]]
y_test = [d for d in data_test["rating"]]

In [29]:
clf = linear_model.Ridge(1.0, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [30]:
theta[:20]

array([ 0.09440102,  0.18640503,  0.12545135, -0.02535028,  0.19624789,
       -0.01453997, -0.04011656,  0.0553807 ,  0.09942871,  0.051852  ,
        0.12848685, -0.07776247,  0.01601269, -0.02314007,  0.04580174,
        0.19965921,  0.20557228, -0.04552579,  0.22480352, -0.0381269 ])

In [31]:
weights = list(zip(theta, biwords + ["constant_feat"]))
weights.sort()
print("Top 10 positive Coefficient ", weights[-10:])
print("Top 10 negative coefficient ", weights[:10])

Top 10 positive Coefficient  [(0.2257736257475605, 'loved much'), (0.22662000347304684, 'want buy'), (0.2336798109342331, 'like million'), (0.23794805613231781, 'fits perfectly'), (0.23879455637804056, 'like princess'), (0.24258767973148854, 'wish owned'), (0.2606347820470421, 'say enough'), (0.2909078665944556, 'like dream'), (0.31645701834340617, 'wanted keep'), (4.4465064025062935, 'constant_feat')]
Top 10 negative coefficient  [(-0.7416800999840821, 'unable wear'), (-0.7223609048190053, 'without wearing'), (-0.7103472524314987, 'wouldnt rent'), (-0.671610200650331, 'wasnt flattering'), (-0.549616788207872, 'way short'), (-0.5318084862915318, 'couldnt even'), (-0.5288910685092774, 'didnt work'), (-0.5277394190576751, 'end wearing'), (-0.515269552202804, 'gave stars'), (-0.4966872483622018, 'could barely')]


In [32]:
# predict on validation set and testing set
y_pred_valid = clf.predict(X_valid)
y_pred_test = clf.predict(X_test)

In [33]:
# calculate MSE for bigrams regressor
from sklearn.metrics import mean_squared_error

print("MSE on validation set:", mean_squared_error(y_valid, y_pred_valid))
print("MSE on testing set:", mean_squared_error(y_test, y_pred_test))

MSE on validation set: 0.40698806056531217
MSE on validation set: 0.3892037389634508


#### using the 1000 unigrams and bigrams to train a regressor

In [34]:
# combine unigrams and bigrams
mergeCount = unigramCounts + bigramCounts
mergeCount.sort()
mergeCount.reverse()
grams = [x[1] for x in mergeCount[:2000]]
gramID = dict(zip(grams, range(len(grams))))
gramSet = set(grams)

In [35]:
def feature_mer(datum):
    feat = [0] * len(gramSet)
    text = cleanText(datum)
    bg = list(bigrams(text)) # all bigrams in text
    for w in text + bg:
        if w in gramSet:
            feat[gramID[w]] += 1
    feat.append(1) #offset
    return feat

In [36]:
X_train = [feature_mer(d) for d in data_train["review_text"]]
y_train = [d for d in data_train["rating"]]
X_valid = [feature_mer(d) for d in data_valid["review_text"]]
y_valid = [d for d in data_valid["rating"]]
X_test = [feature_mer(d) for d in data_test["review_text"]]
y_test = [d for d in data_test["rating"]]

In [37]:
clf = linear_model.Ridge(0.05, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [38]:
theta[:20]

array([-0.02608017,  0.02984921,  0.00943985, -0.03253671, -0.02499468,
        0.05010504,  0.08144777,  0.03229155,  0.14493623,  0.10016506,
        0.13220991,  0.11999883, -0.04187307,  0.00454936,  0.00491307,
       -0.00134528,  0.05132664,  0.06120607, -0.0718312 , -0.02708323])

In [39]:
weights = list(zip(theta, grams + ["constant_feat"]))
weights.sort()
print("Top 10 positive Coefficient ", weights[-10:])
print("Top 10 negative coefficient ", weights[:10])

Top 10 positive Coefficient  [(0.16121282805028816, 'deal'), (0.16191371058230716, 'heartbeat'), (0.16328434332329486, 'holds'), (0.17314485054931322, 'buying'), (0.175122374441242, 'incredible'), (0.1764951679980558, 'princess'), (0.18390212555850283, 'glove'), (0.1976967621532812, 'dream'), (0.24385331527923879, 'happier'), (4.436817089192557, 'constant_feat')]
Top 10 negative coefficient  [(-0.5870434504405685, 'unflattering'), (-0.3879822541865254, 'unable'), (-0.3739239752013788, 'disappointed'), (-0.3667054335084634, 'odd'), (-0.36529467680870736, 'strange'), (-0.35514869356708845, 'cheap'), (-0.35236965653282926, 'awkward'), (-0.35103662323635587, 'unfortunately'), (-0.3374737853655394, 'bulky'), (-0.33323016019839613, 'returned')]


In [40]:
# predict on validation set and testing set
y_pred_valid = clf.predict(X_valid)
y_pred_test = clf.predict(X_test)

In [41]:
# calculate MSE for unigrams and bigrams regressor
from sklearn.metrics import mean_squared_error

print("MSE on validation set:", mean_squared_error(y_valid, y_pred_valid))
print("MSE on testing set:", mean_squared_error(y_test, y_pred_test))

MSE on validation set: 0.38856938953442993
MSE on validation set: 0.3648699201505146


In [None]:
MSE on validation set: 0.37746878280012636
MSE on validation set: 0.39209168978072867

### Linear regression with tf-idf
Firstly we extract every comment from data. Then calculate tf − idf for every segment in comment. After that, we use the logistic regression to assgin weight(coefficient) to words. The larger the coefficient, the better the rating.

In [42]:
wordCount = defaultdict(int)
for d in data_train['review_text']:   
    words = cleanText(d)
    for word in words:
        wordCount[word] += 1

In [43]:
# define function tfidf
from math import log
def tf(w,d):
    count = 0
    text = cleanText(d)
    for t in text:
        if t == w:
            count += 1
    return count/len(text)

def idf(w,dataset):
    return log(len(dataset) / wordCount[w]*1.0, 10)

def tfidf(w,d,dataset):
    return tf(w,d) * idf(w,dataset)  

##### regression model with tfidf on unigrams

In [44]:
# Using the 1000 unigrams to train a regressor
uniwords = [x[1] for x in unigramCounts[:1000]]
uniwordId = dict(zip(uniwords, range(len(uniwords))))
uniwordSet = set(uniwords)

In [45]:
# define features
def feature_tfidf_uni(datum,dataset):
    feat = [0]*len(uniwordSet)
    text = cleanText(datum)
    for w in text:
        if w in uniwordSet:
            feat[uniwordId[w]] = tfidf(w,datum,dataset)
    feat.append(1)
    return feat

In [46]:
X_train = [feature_tfidf_uni(d,data_train) for d in data_train['review_text']]
y_train = [d for d in data_train["rating"]]
X_valid = [feature_tfidf_uni(d,data_valid) for d in data_valid['review_text']]
y_valid = [d for d in data_valid["rating"]]
X_test = [feature_tfidf_uni(d,data_test) for d in data_test['review_text']]
y_test = [d for d in data_test["rating"]]

In [47]:
clf = linear_model.Ridge(1, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [48]:
theta[:20]

array([ 2.20007834,  0.15306657,  0.43383712, -1.66589775, -1.39375688,
        2.52257482,  2.02717433,  2.09942877,  2.71430203,  1.43278419,
        2.51535421,  1.8951484 , -1.90969393,  0.57134874,  0.52819009,
        0.64319079,  1.74354214,  2.72208036, -2.82347704, -0.66407166])

In [49]:
weights = list(zip(theta, uniwords + ["constant_feat"]))
weights.sort()
print("Top 10 positive Coefficient ", weights[-10:])
print("Top 10 negative coefficient ", weights[:10])

Top 10 positive Coefficient  [(2.099428770211516, 'little'), (2.107478356441701, 'nervous'), (2.2000783383951505, 'dress'), (2.4392897275003604, 'glove'), (2.5153542132586435, 'compliments'), (2.5225748236190912, 'wore'), (2.7143020326307563, 'perfect'), (2.7220803614821, 'bra'), (2.8482866689187465, 'worried'), (4.416813328340744, 'constant_feat')]
Top 10 negative coefficient  [(-4.814897000221017, 'unfortunately'), (-4.568198223141438, 'disappointed'), (-3.7437326248659515, 'unflattering'), (-3.6722893456438928, 'awkward'), (-3.64893690709843, 'excited'), (-3.4393210262893135, 'odd'), (-3.071351735108077, 'weird'), (-3.046249178070285, 'however'), (-2.846227354596591, 'returned'), (-2.823477037508764, 'didnt')]


In [50]:
# predict on validation set and testing set
y_pred_valid = clf.predict(X_valid)
y_pred_test = clf.predict(X_test)

In [51]:
# calculate MSE for unigrams and bigrams regressor
from sklearn.metrics import mean_squared_error

print("MSE on validation set:", mean_squared_error(y_valid, y_pred_valid))
print("MSE on testing set:", mean_squared_error(y_test, y_pred_test))

MSE on validation set: 0.5612942964050983
MSE on validation set: 0.5467620872967909


##### regression model with tfidf on bigrams

In [52]:
# Using the 1000 bigrams to train a regressor
biwords = [x[1] for x in bigramCounts[:1000]]
biwordId = dict(zip(biwords, range(len(biwords))))
biwordSet = set(biwords)

In [53]:
#define tfidf
def tf_bi(w,d):
    count = 0
    text = cleanText(d)
    for i in range(len(text)-1):
        bigram = text[i] + " " + text[i+1]
        if bigram == w:
            count += 1
    return count/int(len(text)/2)

def idf_bi(w,dataset):
    return log(len(dataset) / wordCount[w]*1.0, 10)

def tfidf(w,d,dataset):
    return tf(w,d) * idf(w,dataset)  

In [54]:
# define features
def feature_tfidf_bi(datum,dataset):
    feat = [0]*len(biwordSet)
    text = cleanText(datum)
    for i in range(len(text)-1):
        bi = text[i] + " " + text[i+1]
        if bi in biwords:
            feat[biwordId[w]] = tfidf(bi,datum,dataset)
    feat.append(1)
    return feat

In [None]:
X_train = [feature_tfidf_bi(d,data_train) for d in data_train['review_text']]
y_train = [d for d in data_train["rating"]]
X_valid = [feature_tfidf_bi(d,data_valid) for d in data_valid['review_text']]
y_valid = [d for d in data_valid["rating"]]
X_test = [feature_tfidf_bi(d,data_test) for d in data_test['review_text']]
y_test = [d for d in data_test["rating"]]

In [None]:
clf = linear_model.Ridge(1, fit_intercept=False)  # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_

In [None]:
# convert dataframe to dictionary
train_dict = data_train.to_dict(orient="records")