In [None]:
import gzip
from collections import defaultdict
from sklearn import svm
import numpy
import random
import string
from sklearn import linear_model
import pandas as pd
from numpy.random import normal
from sklearn.model_selection import GridSearchCV
import scipy
import tensorflow as tf
from fastFM import als
from scipy.spatial import distance
import json

In [None]:
# Gzipped JSON file that is read with readJSON further down.
FILEPATH = "renttherunway_final_data.json.gz"
# NOTE(review): META is not referenced in the visible cells — confirm it is still needed.
META = "endomondoMeta.json.gz"

In [None]:
def readJSON(path):
    """Yield one parsed JSON value per line of a gzip-compressed UTF-8 text file.

    The first line is yielded when it parses as a JSON object; otherwise it is
    treated as a header and skipped. Blank lines are ignored.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file to read.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If a line after the first is not valid JSON.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        # Do not throw the first line away unconditionally: in a JSON-lines
        # file it is a real record, and discarding it silently loses one row.
        first = f.readline()
        if first.strip():
            try:
                head = json.loads(first)
            except json.JSONDecodeError:
                head = None  # not JSON, so it is a header line: skip it
            if isinstance(head, dict):
                yield head
        for line in f:
            if not line.strip():
                continue  # json.loads would raise on an empty line
            yield json.loads(line)

In [None]:
# Materialise every record produced by readJSON(FILEPATH) into a list.
raw = list(readJSON(FILEPATH))

In [None]:
# Display the container type of the loaded records.
type(raw)

list

In [None]:
# Shallow-copy the loaded records, then build the working DataFrame from the copy.
raw_dict = list(raw)

df = pd.DataFrame(raw_dict)

In [None]:
# Drop the review text, review summary and review date columns.
df = df.drop(columns=["review_text", "review_summary", "review_date"])

In [None]:
# Boolean mask of missing cells, then the number of missing cells per column.
na_value = df.isna()
na_sum = na_value.sum(axis="index")
na_sum

fit               0
user_id           0
bust size     18411
item_id           0
weight        29982
rating           82
rented for       10
body type     14637
category          0
height          677
size              0
age             960
dtype: int64

In [None]:
# For categorical columns, replace missing values with the column's mode.
bustsize_mode = df['bust size'].mode()[0]
bodytype_mode = df['body type'].mode()[0]
# Assign the filled Series back to the frame. Calling fillna(inplace=True) on
# df[col] operates on a temporary selection and does not update df when pandas
# Copy-on-Write is active.
df['bust size'] = df['bust size'].fillna(bustsize_mode)
df['body type'] = df['body type'].fillna(bodytype_mode)

In [None]:
# Make sure the weight column uses a single unit: capture the three characters
# that follow the first run of digits and count each distinct capture.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence.
df["weight"].str.extract(r'\d+(...)').value_counts()

lbs    162561
dtype: int64

In [None]:
# For numerical columns, replace missing values with the global mean.
# Keep only the digits of each weight string and convert them to float; the
# pattern is a raw string because '\d' in a plain literal is an invalid escape.
df["weight"] = df['weight'].str.extract(r'(\d+)').astype(float)
weight_mean = df["weight"].mean()
# Assign the result back instead of fillna(inplace=True) on df[col], which
# acts on a temporary selection and does not update df under Copy-on-Write.
df["weight"] = df["weight"].fillna(weight_mean)

df["age"] = df["age"].astype(float)
age_mean = df["age"].mean()
df["age"] = df["age"].fillna(age_mean)

In [None]:
def feet_inches_to_cm(s):
    """Convert a height written as feet'inches" (e.g. ``5' 8"``) to centimetres.

    The text before the first apostrophe is read as whole feet; the text
    between the first and second apostrophe, with double quotes and
    surrounding whitespace removed, is read as whole inches.

    Returns None when ``s`` is not a string.
    """
    # Anything that is not a string is passed through as "no value".
    if not isinstance(s, str):
        return None
    pieces = s.split("'")
    whole_feet = int(pieces[0])
    extra_inches = int(pieces[1].replace('"', '').strip())
    return whole_feet * 30.48 + extra_inches * 2.54

In [None]:
# Convert each height value to centimetres with feet_inches_to_cm.
df['height_cm'] = df['height'].apply(feet_inches_to_cm)

# Cast to float, then replace the missing heights with the column mean.
df["height_cm"] = df["height_cm"].astype(float)
height_mean = df["height_cm"].mean()
# Assign the result back instead of fillna(inplace=True) on df[col], which
# acts on a temporary selection and does not update df under Copy-on-Write.
df["height_cm"] = df["height_cm"].fillna(height_mean)

In [None]:
# The original height column is dropped; height_cm is kept.
df = df.drop(columns="height")

In [None]:
# Recount the missing cells per column after the fill steps.
na_value = df.isna()
na_sum = na_value.sum(axis="index")
na_sum

fit            0
user_id        0
bust size      0
item_id        0
weight         0
rating        82
rented for    10
body type      0
category       0
size           0
age            0
height_cm      0
dtype: int64

In [None]:
# Drop the rows that still contain a missing value. The explicit copy makes
# `data` an independent frame, so assigning columns on it later does not raise
# SettingWithCopyWarning.
data = df.dropna().copy()

In [None]:
# Cast all columns in one call. astype returns a new frame, so nothing is
# assigned onto the result of dropna() and no SettingWithCopyWarning is raised.
data = data.astype({
    "fit": "category",
    "bust size": "category",
    "rating": float,
    "rented for": "category",
    "body type": "category",
    "category": "category",
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["fit"] = data["fit"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["bust size"] = data["bust size"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["rating"] = data["rating"].astype(float)
A value is trying to be set on a copy of a slice from a D

In [None]:
# Rebind `data` from a DataFrame to a list with one dict per row.
data = data.to_dict("records")

In [None]:
# Display the first record to check the cleaned fields.
data[0]

{'fit': 'fit',
 'user_id': '273551',
 'bust size': '34b',
 'item_id': '153475',
 'weight': 132.0,
 'rating': 10.0,
 'rented for': 'other',
 'body type': 'straight & narrow',
 'category': 'gown',
 'size': 12,
 'age': 36.0,
 'height_cm': 167.64000000000001}

In [None]:
# Shuffle in place with a fixed seed so the record order, and therefore the
# train/test split sliced from it, is the same on every run. A dedicated
# Random instance leaves the global random state untouched.
random.Random(0).shuffle(data)

In [None]:
# Give every distinct user_id and item_id an integer index in first-seen order.
userIDs, itemIDs = {}, {}

for record in data:
    # setdefault only stores the new index when the key has not been seen yet.
    userIDs.setdefault(record['user_id'], len(userIDs))
    itemIDs.setdefault(record['item_id'], len(itemIDs))

nUsers, nItems = len(userIDs), len(itemIDs)

In [None]:
# Display the number of distinct users and distinct items.
nUsers,nItems

(105504, 5850)

In [None]:
# Empty sparse matrix with one row per record and nUsers + nItems columns.
X = scipy.sparse.lil_matrix((len(data), nUsers + nItems))

In [None]:
# Each row gets two non-zero entries: columns [0, nUsers) one-hot encode the
# user, columns [nUsers, nUsers + nItems) one-hot encode the item.
for i, record in enumerate(data):
    user_col = userIDs[record['user_id']]
    item_col = nUsers + itemIDs[record['item_id']]
    X[i, user_col] = 1
    X[i, item_col] = 1

In [None]:
# Target vector: the rating of each record, in the same order as the rows of X.
y = numpy.array([record['rating'] for record in data])


In [None]:
# fastFM ALS regression model.
# NOTE(review): the hyper-parameter values are hard-coded — confirm how they were chosen.
fm = als.FMRegression(
    n_iter=1000,
    init_stdev=0.1,
    rank=5,
    l2_reg_w=0.1,
    l2_reg_V=0.5,
)

In [None]:
# Display the design-matrix dimensions.
X.shape

(192451, 111354)

In [None]:
# The first 75% of the rows are used for training, the remaining rows for testing.
n_train = int(X.shape[0] * 0.75)
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

In [None]:
# Fit the model on the training rows.
fm.fit(X_train, y_train)

In [None]:
# Predict ratings for the held-out rows.
y_pred = fm.predict(X_test)

In [None]:
# Display the first ten predictions.
y_pred[:10]

array([ 8.8681674 , 10.45177521,  9.08202903,  8.33118282,  9.16214453,
        9.3724546 ,  9.49521026,  9.64558977,  9.35608701, 10.16986752])

In [None]:
# Display the first ten true ratings for comparison with the predictions.
y_test[:10]

array([10., 10., 10.,  6., 10., 10., 10., 10.,  8., 10.])

In [None]:
def MSE(predictions, labels):
    """Return the mean of the squared differences between paired values.

    Pairs are formed with ``zip``, so iteration stops at the shorter input.

    Raises
    ------
    ZeroDivisionError
        If no pairs are produced (either input is empty).
    """
    total = 0
    count = 0
    for predicted, actual in zip(predictions, labels):
        total += (predicted - actual) ** 2
        count += 1
    return total / count

In [None]:
# Mean squared error of the model's predictions on the held-out rows.
MSE(y_pred, y_test)

2.5266488111047165

In [None]:
# Constant baseline: predict 10 for every held-out row.
test = [10 for _ in range(len(y_pred))]

In [None]:
# Mean squared error of the constant baseline, for comparison with the model.
MSE(test, y_test)

2.8218568785983