# Opening Files

In [148]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd

In [149]:
# Function to process a chunk of data
def process_data(chunk, columns=None):
    # If columns is not None, keep only those columns
    if columns is not None:
        chunk = chunk[columns]
    return chunk

# Function to read data in chunks and process each chunk
def load_data(file_name, head = None, columns=None, chunksize = 1000):
    chunks = []
    count = 0
    with gzip.open(file_name) as fin:
        for chunk in pd.read_json(fin, lines=True, chunksize=chunksize):
            # Process the chunk
            processed_chunk = process_data(chunk, columns)
            chunks.append(processed_chunk)
            
            count += 1
            # break if reaches the head-th chunk
            if (head is not None) and (count > head):
                break

    # Combine all chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)
    
    return df

In [150]:
DIR = 'C:\\Users\\jesse\\Documents\\GitHub\\vanity_sizing_project\\data\\'

In [151]:
mod = load_data(DIR+'modcloth_final_data.json.gz')
mod.head(10)

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,
5,123373,27.0,11,5.0,c,41.0,36.0,new,,5ft 4in,Doreenajane,just right,small,162012,,,,
6,123373,26.0,5,1.0,b,,32.0,new,,5ft 3in,barbiejenks,just right,large,114843,,,,
7,123373,,11,5.0,d,42.0,38.0,new,,5ft 5in,brettloie,just right,small,58869,8.5,,,
8,123373,,30,4.0,d,50.0,42.0,new,,5ft 10in,francescaviola,just right,small,279568,11.0,wide,,
9,123373,,13,5.0,dd/e,41.0,36.0,new,39.0,5ft 6in,laurenpolzin,just right,fit,950172,9.0,,,


In [152]:
rtr = load_data(DIR+'renttherunway_final_data.json.gz')
rtr.head(10)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
5,fit,734848,32b,364092,138lbs,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,"5' 8""",8,45.0,"April 30, 2016"
6,fit,336066,34c,568429,112lbs,10.0,everyday,This dress is so sweet. I loved the print. The...,hourglass,LITERALLY THE CUTEST DRESS EVER,dress,"5' 3""",4,27.0,"December 7, 2017"
7,fit,86661,34d+,130259,118lbs,10.0,formal affair,Fit was great. Maybe a little tight under the ...,full bust,"Great dress, beautifully made. I received lot...",dress,"5' 3""",8,65.0,"January 1, 2013"
8,fit,166228,36d,1729232,,10.0,formal affair,I was nervous of it looking cheap when it arri...,full bust,Great for black tie event!,gown,"5' 6""",21,27.0,"June 27, 2016"
9,fit,154309,32b,1729232,114lbs,10.0,formal affair,The dress was very flattering and fit perfectl...,petite,This dress was everything! It was perfect for ...,gown,"5' 3""",1,33.0,"October 17, 2016"


# Rent The Runway

## General Overview

In [153]:
rtr.describe()

Unnamed: 0,user_id,item_id,rating,size,age
count,192544.0,192544.0,192462.0,192544.0,191584.0
mean,499494.100149,1045684.0,9.092371,12.245175,33.871017
std,289059.719328,805314.8,1.430044,8.494877,8.058083
min,9.0,123373.0,2.0,0.0,0.0
25%,250654.25,195076.0,8.0,8.0,29.0
50%,499419.0,948396.0,10.0,12.0,32.0
75%,750974.0,1678888.0,10.0,16.0,37.0
max,999997.0,2966087.0,10.0,58.0,117.0


In [154]:
rtr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             192544 non-null  object 
 1   user_id         192544 non-null  int64  
 2   bust size       174133 non-null  object 
 3   item_id         192544 non-null  int64  
 4   weight          162562 non-null  object 
 5   rating          192462 non-null  float64
 6   rented for      192534 non-null  object 
 7   review_text     192544 non-null  object 
 8   body type       177907 non-null  object 
 9   review_summary  192544 non-null  object 
 10  category        192544 non-null  object 
 11  height          191867 non-null  object 
 12  size            192544 non-null  int64  
 13  age             191584 non-null  float64
 14  review_date     192544 non-null  object 
dtypes: float64(2), int64(3), object(10)
memory usage: 22.0+ MB


In [155]:
#what is the most commmon weight - height - bust size combination
# Create a new column that combines weight, height, and bust size
rtr['summary'] = rtr['weight'].astype(str) + '-' + rtr['height'].astype(str) + '-' + rtr['bust size'].astype(str) + '-' + rtr['body type'].astype(str) + '-' + rtr['age'].astype(str)

# Find the most common combination excluding when weight, height, or bust size is missing
rtr['summary'].value_counts().head(10)

summary
125lbs-5' 3"-34c-full bust-31.0            437
122lbs-5' 7"-34b-straight & narrow-26.0    293
nan-5' 5"-34ddd/e-full bust-23.0           228
130lbs-5' 7"-34b-pear-26.0                 145
130lbs-5' 9"-34c-athletic-34.0             133
110lbs-5' 6"-32a-straight & narrow-24.0    124
125lbs-5' 2"-34d-full bust-32.0            105
nan-5' 4"-34b-hourglass-26.0               103
135lbs-5' 6"-34dd-hourglass-45.0            96
nan-5' 4"-nan-nan-23.0                      87
Name: count, dtype: int64

## General Cleaning

### Dropping duplicates

In [156]:
print(rtr.duplicated().sum())
rtr = rtr.drop_duplicates()

189


### Changing weight to numercial value

In [157]:
print(rtr['weight'].head(10))

0    137lbs
1    132lbs
2       NaN
3    135lbs
4    145lbs
5    138lbs
6    112lbs
7    118lbs
8       NaN
9    114lbs
Name: weight, dtype: object


In [158]:
rtr['weight'] = rtr['weight'].str.replace('lbs', '')
rtr['weight'] = pd.to_numeric(rtr['weight'], errors='raise')

### Changing height to numerical value -- to inches

In [159]:
print(rtr['height'].head(10))

0    5' 8"
1    5' 6"
2    5' 4"
3    5' 5"
4    5' 9"
5    5' 8"
6    5' 3"
7    5' 3"
8    5' 6"
9    5' 3"
Name: height, dtype: object


In [160]:
def convert_to_inches(height):
    height_parts = height.split("'")
    feet = int(height_parts[0].strip())
    inches = int(height_parts[1].strip().replace('"', ''))
    total_inches = (feet * 12) + inches
    return total_inches

In [161]:
rtr.loc[rtr['height'].notnull(), 'height'] = rtr.loc[rtr['height'].notnull(), 'height'].apply(convert_to_inches)
rtr['height'].head(10)

0    68
1    66
2    64
3    65
4    69
5    68
6    63
7    63
8    66
9    63
Name: height, dtype: object

### Restructuring Bust Size

In [162]:
rtr['bust size'].value_counts()

bust size
34b    27255
34c    23117
34d    18013
36c    13502
32d    11188
       ...  
28h        1
46f        1
42j        1
28i        1
44h        1
Name: count, Length: 106, dtype: int64

In [163]:
#split the bust size into band and cup
rtr['band_size'] = rtr['bust size'].str.extract('(\d+)').astype(float)
rtr['band_size'].value_counts()

band_size
34.0    81968
32.0    41122
36.0    39485
38.0     9538
30.0      832
40.0      450
28.0      285
42.0      193
44.0       77
46.0        8
48.0        5
Name: count, dtype: int64

In [164]:
rtr['cup_size'] = rtr['bust size'].str.extract('([A-Za-z]+\+?)')
rtr['cup_size'].value_counts()

cup_size
c      49586
b      49067
d      44967
a      15271
dd      6690
d+      3950
ddd     2598
aa       710
f        458
g        440
h        156
i         40
j         30
Name: count, dtype: int64

In [165]:
rtr = rtr.drop(columns='bust size')

### Changing data types where necessary

In [166]:
rtr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192355 entries, 0 to 192543
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   fit             192355 non-null  object 
 1   user_id         192355 non-null  int64  
 2   item_id         192355 non-null  int64  
 3   weight          162400 non-null  float64
 4   rating          192274 non-null  float64
 5   rented for      192345 non-null  object 
 6   review_text     192355 non-null  object 
 7   body type       177730 non-null  object 
 8   review_summary  192355 non-null  object 
 9   category        192355 non-null  object 
 10  height          191680 non-null  object 
 11  size            192355 non-null  int64  
 12  age             191395 non-null  float64
 13  review_date     192355 non-null  object 
 14  summary         192355 non-null  object 
 15  band_size       173963 non-null  float64
 16  cup_size        173963 non-null  object 
dtypes: float64(4), 

In [167]:
rtr['review_date'] = pd.to_datetime(rtr['review_date'])
rtr['review_date'].dt.year.value_counts()

review_date
2017    67798
2016    51298
2015    30550
2014    28266
2013     9617
2012     3362
2018     1254
2011      209
2010        1
Name: count, dtype: int64

#### Category coding

In [168]:
def to_category_w_dict(df, column):
    df[column] = df[column].astype('category')
    category_dict = {category: code for code, category in enumerate(df[column].cat.categories)}
    return category_dict

In [169]:
#apply the function to the category columns, and store the dictionaries
category_dicts = {}
for column in ['fit','rating','rented for', 'body type','category','size','cup_size','band_size','summary']:
    category_dicts[column] = to_category_w_dict(rtr, column)

rtr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 192355 entries, 0 to 192543
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   fit             192355 non-null  category      
 1   user_id         192355 non-null  int64         
 2   item_id         192355 non-null  int64         
 3   weight          162400 non-null  float64       
 4   rating          192274 non-null  category      
 5   rented for      192345 non-null  category      
 6   review_text     192355 non-null  object        
 7   body type       177730 non-null  category      
 8   review_summary  192355 non-null  object        
 9   category        192355 non-null  category      
 10  height          191680 non-null  object        
 11  size            192355 non-null  category      
 12  age             191395 non-null  float64       
 13  review_date     192355 non-null  datetime64[ns]
 14  summary         192355 non-null  category

In [170]:
category_dicts

{'fit': {'fit': 0, 'large': 1, 'small': 2},
 'rating': {2.0: 0, 4.0: 1, 6.0: 2, 8.0: 3, 10.0: 4},
 'rented for': {'date': 0,
  'everyday': 1,
  'formal affair': 2,
  'other': 3,
  'party': 4,
  'party: cocktail': 5,
  'vacation': 6,
  'wedding': 7,
  'work': 8},
 'body type': {'apple': 0,
  'athletic': 1,
  'full bust': 2,
  'hourglass': 3,
  'pear': 4,
  'petite': 5,
  'straight & narrow': 6},
 'category': {'ballgown': 0,
  'blazer': 1,
  'blouse': 2,
  'blouson': 3,
  'bomber': 4,
  'buttondown': 5,
  'caftan': 6,
  'cami': 7,
  'cape': 8,
  'cardigan': 9,
  'coat': 10,
  'combo': 11,
  'crewneck': 12,
  'culotte': 13,
  'culottes': 14,
  'down': 15,
  'dress': 16,
  'duster': 17,
  'for': 18,
  'frock': 19,
  'gown': 20,
  'henley': 21,
  'hoodie': 22,
  'jacket': 23,
  'jeans': 24,
  'jogger': 25,
  'jumpsuit': 26,
  'kaftan': 27,
  'kimono': 28,
  'knit': 29,
  'legging': 30,
  'leggings': 31,
  'maxi': 32,
  'midi': 33,
  'mini': 34,
  'overalls': 35,
  'overcoat': 36,
  'pant': 

# Modcloth

In [32]:
mod.describe()

Unnamed: 0,item_id,waist,size,quality,hips,bra size,user_id,shoe size
count,82790.0,2882.0,82790.0,82722.0,56064.0,76772.0,82790.0,27915.0
mean,469325.22917,31.319223,12.661602,3.949058,40.358501,35.972125,498849.564718,8.145818
std,213999.803314,5.302849,8.271952,0.992783,5.827166,3.224907,286356.969459,1.336109
min,123373.0,20.0,0.0,1.0,30.0,28.0,6.0,5.0
25%,314980.0,28.0,8.0,3.0,36.0,34.0,252897.75,7.0
50%,454030.0,30.0,12.0,4.0,39.0,36.0,497913.5,8.0
75%,658440.0,34.0,15.0,5.0,43.0,38.0,744745.25,9.0
max,807722.0,50.0,38.0,5.0,60.0,48.0,999972.0,38.0


In [33]:
mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup size        76535 non-null  object 
 5   hips            56064 non-null  float64
 6   bra size        76772 non-null  float64
 7   category        82790 non-null  object 
 8   bust            11854 non-null  object 
 9   height          81683 non-null  object 
 10  user_name       82790 non-null  object 
 11  length          82755 non-null  object 
 12  fit             82790 non-null  object 
 13  user_id         82790 non-null  int64  
 14  shoe size       27915 non-null  float64
 15  shoe width      18607 non-null  object 
 16  review_summary  76065 non-null  object 
 17  review_text     76065 non-null 