# Opening RTR Data

In [1]:
#read in pkl
import pickle
import numpy as np

# Deserialize the raw reviews object from the local pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this should only ever be pointed at a pickle from a trusted source.
with open('rtr_reviews_uncleaned.pkl', mode='rb') as pkl_file:
    data = pickle.load(pkl_file)
data

#open category dict json
import json
# JSON is UTF-8 by specification; pin the encoding so the read does not depend
# on the platform's default locale.
with open('category_dict.json', encoding='utf-8') as f:
    category_dict = json.load(f)
# json.load returns a plain dict here, and dicts have no .head() method
# (that is a pandas API), so display the mapping itself.
category_dict

{'fit': {'fit': 0, 'large': 1, 'small': 2},
 'rating': {'2': 0, '4': 1, '6': 2, '8': 3, '10': 4},
 'rented for': {'date': 0,
  'everyday': 1,
  'formal affair': 2,
  'other': 3,
  'party': 4,
  'vacation': 5,
  'wedding': 6,
  'work': 7},
 'body type': {'apple': 0,
  'athletic': 1,
  'full bust': 2,
  'hourglass': 3,
  'pear': 4,
  'petite': 5,
  'straight & narrow': 6},
 'category': {'dress': 0,
  'jumpsuit': 1,
  'other': 2,
  'outerwear': 3,
  'pants': 4,
  'skirt': 5,
  'top': 6},
 'subcategory': {'blazer': 0,
  'blouse': 1,
  'coat': 2,
  'dress': 3,
  'gown': 4,
  'jacket': 5,
  'jumpsuit': 6,
  'leggings': 7,
  'maxi_dress': 8,
  'mini_dress': 9,
  'other': 10,
  'other_dress': 11,
  'other_outer': 12,
  'other_pants': 13,
  'pant': 14,
  'pants': 15,
  'romper': 16,
  'sheath_dress': 17,
  'shift_dress': 18,
  'shirt': 19,
  'shorts': 20,
  'skirt': 21,
  'suit': 22,
  'sweater': 23,
  'sweatshirt': 24,
  'tank': 25,
  'top': 26},
 'size': {'0': 0,
  '1': 1,
  '2': 2,
  '3': 3,

In [15]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
import re

[nltk_data] Downloading package punkt to /Users/camille/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Pull out the free-text review column; everything below works on this Series.
customer_reviews = data['review_text']
customer_reviews

0         An adorable romper! Belt and zipper were a lit...
1         I rented this dress for a photo shoot. The the...
2         This hugged in all the right places! It was a ...
3         I rented this for my company's black tie award...
4         I have always been petite in my upper body and...
                                ...                        
192350                                    Fit like a glove!
192351    The pattern contrast on this dress is really s...
192352    Like the other DVF wraps, the fit on this is f...
192353    This dress was PERFECTION.  it looked incredib...
192354    This dress was wonderful! I had originally pla...
Name: review_text, Length: 192355, dtype: object

In [1]:
def preprocess_text(text):
    """Normalize one review string for n-gram vectorization.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``word_tokenize``, drop
    tokens found in ``ENGLISH_STOP_WORDS``, and lemmatize the remaining
    tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # Characters are deleted rather than replaced by a space, so an
    # apostrophe collapses its word ("wouldn't" -> "wouldnt").
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    lemmatize = WordNetLemmatizer().lemmatize
    kept = []
    for token in word_tokenize(letters_only):
        # Stop-word check happens on the raw token, before lemmatization.
        if token in ENGLISH_STOP_WORDS:
            continue
        kept.append(lemmatize(token))

    return ' '.join(kept)

# Preprocess the customer reviews
preprocessed_reviews = [preprocess_text(review) for review in customer_reviews]
# The comprehension builds a plain Python list, which has no .head() method;
# slice it to preview the first 10 cleaned reviews.
preprocessed_reviews[:10]

NameError: name 'customer_reviews' is not defined

In [23]:
# Bag-of-n-grams features: count 2- to 4-token phrases in the cleaned reviews,
# with the vocabulary capped at 1000 entries.
NGRAM_RANGE = (2, 4)
MAX_FEATURES = 1000

vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
fit = vectorizer.fit_transform(preprocessed_reviews)

In [24]:
import pandas as pd
# One row per review, one column per n-gram, values are occurrence counts.
# NOTE(review): .toarray() densifies the matrix; at 192355 rows x 1000 int64
# columns that is roughly 1.5 GB of memory.
ngram_columns = vectorizer.get_feature_names_out()
fit_df = pd.DataFrame(data=fit.toarray(), columns=ngram_columns)
fit_df

Unnamed: 0,able dance,able wear,able wear bra,able wear regular,able wear regular bra,absolutely beautiful,absolutely gorgeous,absolutely love,absolutely loved,absolutely loved dress,...,work event,worked fine,worked great,worked perfectly,worn dress,worn size,wouldnt recommend,wrap dress,year eve,zipper little
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
192352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
192353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Wrap the vocabulary in a Series so the .str accessor can be used to search it.
vocabulary = vectorizer.get_feature_names_out()
feat_names = pd.Series(data=vocabulary)
feat_names

0                 able dance
1                  able wear
2              able wear bra
3          able wear regular
4      able wear regular bra
               ...          
995                worn size
996        wouldnt recommend
997               wrap dress
998                 year eve
999            zipper little
Length: 1000, dtype: object

In [31]:
# Boolean mask over the vocabulary: n-grams whose text contains "size".
mentions_size = feat_names.str.contains('size')
feat_names[mentions_size]

21         backup size
36         bigger size
71           bust size
124           cup size
146    definitely size
            ...       
942    wear size dress
954       wearing size
962          went size
982          wore size
995          worn size
Length: 87, dtype: object

In [32]:
# Distribution of per-review occurrence counts for the bigram "bigger size".
fit_df.loc[:, 'bigger size'].value_counts()

0    191695
1       647
2        13
Name: bigger size, dtype: int64

In [33]:
# Distribution of per-review occurrence counts for the bigram "bit small".
fit_df.loc[:, 'bit small'].value_counts()

0    191280
1      1069
2         6
Name: bit small, dtype: int64