In [1]:
import urllib.request 
import pandas as pd
import numpy as np
import gzip
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Where the compressed dataset lives and the local name it is saved under.
DATA_URL = "http://jmcauley.ucsd.edu/data/renttherunway/renttherunway_final_data.json.gz"
file_name = "renttherunway_final_data.json.gz"

# Fetch the archive into the working directory.
urllib.request.urlretrieve(DATA_URL, file_name)

# function to parse file from original site listed above

def parse(file):
  """Yield one decoded record per line of a gzip-compressed JSON-lines file.

  Args:
    file: Path to the ``.gz`` file; each non-blank line holds one JSON value.

  Yields:
    The decoded Python object for each line (a dict for JSON objects).
    JSON ``null`` values decode to ``None``.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # json.loads is used instead of eval(): eval() would execute whatever
  # Python expression a line contains, which is unsafe for downloaded data.
  # The `with` block guarantees the gzip handle is closed, even if the
  # consumer stops iterating early or a line fails to decode.
  with gzip.open(file, "rt", encoding="utf-8") as handle:
    for entry in handle:
      entry = entry.strip()
      if not entry:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      yield json.loads(entry)
    
# Setting null name for generator to list conversion- throws error otherwise

# Any JSON `null` that ends up evaluated as a Python name resolves to this
# placeholder string instead of raising a NameError.
null = "Unknown"

# Materialise every parsed record, then build the frame and turn the
# placeholder string into a proper missing value.
fashion_dict_list = [record for record in parse(file_name)]
fashion_data = pd.DataFrame(fashion_dict_list).replace("Unknown", np.nan)

# Summarise the size of the loaded data.
n_rows, n_cols = fashion_data.shape
print("Rent-the-Runway Data Specs \n")
print("Rows (unique transactions): {row:,}".format(row=n_rows))
print("Columns: {0}".format(n_cols))

fashion_data.head()

Rent-the-Runway Data Specs 

Rows (unique transactions): 192,544
Columns: 15


Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36,"June 18, 2013"
2,fit,360448,,1063761,,10,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27,"September 26, 2016"


In [2]:
# Show the column labels of the loaded frame.
fashion_data.columns

Index(['fit', 'user_id', 'bust size', 'item_id', 'weight', 'rating',
       'rented for', 'review_text', 'body type', 'review_summary', 'category',
       'height', 'size', 'age', 'review_date'],
      dtype='object')

In [3]:
# Count the distinct users and distinct items across all transactions.
n_users = fashion_data["user_id"].nunique()
n_items = fashion_data["item_id"].nunique()

print("Number of unique users: {user:,}".format(user=n_users))
print("Number of unique items: {item:,}".format(item=n_items))

Number of unique users: 105,571
Number of unique items: 5,850


After seeing the first few rows and reviewing the missing data here, it's clear some variables require cleaning.

In [4]:
# Per-column count of missing values, shown as a one-column table.
fashion_data.isna().sum().to_frame(name="is_null_count")

Unnamed: 0,is_null_count
fit,0
user_id,0
bust size,18411
item_id,0
weight,29982
rating,82
rented for,10
review_text,0
body type,14637
review_summary,0


In [5]:
# (rows, columns) of the frame at this point in the notebook.
fashion_data.shape

(192544, 15)

## Data Cleaning and Exploration

### Cleaning approach

Cleaning steps here include:
- Adding underscores where white space exists in column names
- Dropping records without a rating since these are required for the analysis
- Converting height column from string to numeric in inches
- Variable type conversions (e.g. to numeric)
- Imputations (often with a median, or grouped median)
- Age values: the age column contains values of 0 and values over 100; these are assumed to be data-entry errors and are replaced with the median age
- Cleaning text fields by removing non-alphanumeric characters
- Converting rating scale to 1-5: There were only even numbers (i.e. 2, 4, 6, 8, 10) so condensing it to this scale seems reasonable

The final logic shows that following these steps, all the missing values have been addressed.

In [6]:
# Column names are kept exactly as loaded; the space-to-underscore rename
# (fashion_data.columns.str.replace(" ", "_")) is currently disabled.

# Keep only the transactions that carry a rating.
fashion_data = fashion_data.dropna(subset=["rating"])

In [8]:
# Build the user/item/rating table in a single expression. `.assign` returns
# a new frame, so nothing is written into a frame derived from
# `fashion_data` (the pattern that triggers SettingWithCopyWarning).
dataset = fashion_data[["user_id", "item_id", "rating"]].assign(
    # Convert to numeric, then halve so the even 2-10 ratings land on 1-5.
    rating=lambda frame: pd.to_numeric(frame["rating"]) / 2
)

In [13]:
# (distinct users, distinct items) remaining after unrated rows were dropped.
tuple(len(dataset[column].unique()) for column in ("user_id", "item_id"))

(105508, 5850)

In [19]:
# Write the user/item/rating table to CSV without the index column.
dataset.to_csv('rentrunway_preproc.csv',index=False)