In [None]:
# https://www.kaggle.com/nicapotato/guided-numeric-and-text-exploration-e-commerce

In [1]:
# General
import numpy as np
import pandas as pd
import nltk
import random
import os
from os import path
from PIL import Image

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

# Set Plot Theme
# Make these five hex colours seaborn's default colour cycle for subsequent plots.
sns.set_palette([
    "#30a2da",
    "#fc4f30",
    "#e5ae38",
    "#6d904f",
    "#8b8b8b",
])
# Alternative: plt.style.use('fivethirtyeight')

# Pre-Processing
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer

# Modeling
import statsmodels.api as sm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk.util import ngrams
from collections import Counter
from gensim.models import word2vec

# Warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings from the libraries imported above; consider
# narrowing the filter to specific categories if warnings need to be seen.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the raw reviews CSV. Defaults to the original local path; set the
# REVIEWS_CSV environment variable to point at the file on another machine
# instead of editing this cell.
DATA_PATH = os.environ.get(
    "REVIEWS_CSV",
    "E:/Kaggle Practice/Womens Clothing E-Commerce Reviews/Womens Clothing E-Commerce Reviews.csv",
)

# Load the raw reviews and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [None]:
# Clothing ID: Integer Categorical variable that refers to the specific piece being reviewed.
# Age: Positive Integer variable of the reviewer's age.
# Title: String variable for the title of the review.
# Review Text: String variable for the review body.
# Rating: Positive Ordinal Integer variable for the product score granted by the customer from 1 Worst, to 5 Best.
# Recommended IND: Binary variable stating whether the customer recommends the product, where 1 is recommended and 0 is not recommended.
# Positive Feedback Count: Positive Integer documenting the number of other customers who found this review positive.
# Division Name: Categorical name of the product high level division.
# Department Name: Categorical name of the product department name.
# Class Name: Categorical name of the product class name.

In [3]:
# Show each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [4]:
# Number of missing values in each column
df.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [None]:
#Code Explanation and Reasoning:
#These packages are separated in four categories: General, Visualization, Pre-Processing, and Modeling.

# The General category includes the basic data manipulation tools for scientific computation (numpy), dataframes (pandas), Natural Language Processing (NLTK), path directory manipulation (os), and image saving (PIL).

# The Visualization section enables the creation of simple graphics (matplotlib, seaborn), as well as wordcloud's text frequency visualization.

# The Pre-Processing section extracts more specialized modules from the NLTK package such as tokenizers and stemmers to enable the preparation of text data for mathematical analysis.

# The Modeling section includes nltk’s sentiment analysis module, which can determine the mood of text, NLTK’s N-grams, and gensim.models’s word2vec. It also includes statsmodels.api which offers an array of linear models.

In [5]:
# Drop the leftover CSV index column. It is removed by name rather than by
# position so that re-running this cell is a no-op instead of silently
# dropping the next real column ("Clothing ID").
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Delete observations with a missing value in any of the following variables.
# Five columns contain missing values: "Title", "Division Name",
# "Department Name", "Class Name" and "Review Text". The review body is the
# key field, so rows without it are removed; "Title" carries little meaning
# and is left as is; rows missing any of the other three columns are removed.
required_columns = ["Division Name", "Department Name", "Class Name", "Review Text"]
df = df.dropna(subset=required_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22628 entries, 0 to 23485
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Clothing ID              22628 non-null  int64 
 1   Age                      22628 non-null  int64 
 2   Title                    19662 non-null  object
 3   Review Text              22628 non-null  object
 4   Rating                   22628 non-null  int64 
 5   Recommended IND          22628 non-null  int64 
 6   Positive Feedback Count  22628 non-null  int64 
 7   Division Name            22628 non-null  object
 8   Department Name          22628 non-null  object
 9   Class Name               22628 non-null  object
dtypes: int64(5), object(5)
memory usage: 1.9+ MB


In [6]:
# Number of missing values remaining in each column
df.isna().sum()

Clothing ID                   0
Age                           0
Title                      2966
Review Text                   0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                 0
Department Name               0
Class Name                    0
dtype: int64

In [7]:
# Per-column summary rows: [column name, number of distinct values, number of missing values].
# unique() keeps NaN as a value, so a column with missing data reports one more
# distinct value than describe() does.
unique_count = [
    [col, len(df[col].unique()), df[col].isnull().sum()]
    for col in df.columns
]
unique_count

[['Clothing ID', 1172, 0],
 ['Age', 77, 0],
 ['Title', 13984, 2966],
 ['Review Text', 22621, 0],
 ['Rating', 5, 0],
 ['Recommended IND', 2, 0],
 ['Positive Feedback Count', 82, 0],
 ['Division Name', 3, 0],
 ['Department Name', 6, 0],
 ['Class Name', 20, 0]]

In [8]:
# Total number of missing cells across the whole frame
print(
    "Missing Values: {}".format(
        df.isna().sum().sum()
    )
)

Missing Values: 2966


In [9]:
# Report the frame's size as rows x columns
print(
    "Dataframe Dimension: {} Rows, {} Columns".format(df.shape[0], df.shape[1])
)

Dataframe Dimension: 22628 Rows, 10 Columns


In [12]:
# Derived features

# Word Count: number of whitespace-separated tokens in each review
df["Word Count"] = df["Review Text"].str.split().str.len()

# Character Count: number of characters in each review
df["Character Count"] = df["Review Text"].str.len()

# Label: 1 when Rating is 3 or higher, otherwise 0
df["Label"] = (df["Rating"] >= 3).astype("int64")

In [13]:
# Peek at three random rows. The fixed seed keeps the displayed sample the
# same on every Restart & Run All.
df.sample(3, random_state=42)

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Word Count,Character Count,Label
9953,1035,33,Super comfortable,These may be my new go-to pair of jeans. they ...,4,1,0,General,Bottoms,Jeans,43,220,1
40,862,47,,Pretty and unique. great with jeans or i have ...,4,1,1,General,Tops,Knits,26,149,1
12154,1068,42,Great casual crop,Great casual pant for the summer. wish there w...,5,1,0,General,Bottoms,Pants,43,241,1


## Univariate Distribution
### To start off my analysis, I will first take a look at the distribution of individual variables. This is a good way to see what I am up against, and understand the context of the subsequent multi-variate analysis.

In [16]:
# Report the frame's size again now that the derived columns exist
print(
    "Dataframe Dimension: {} Rows, {} Columns".format(df.shape[0], df.shape[1])
)

Dataframe Dimension: 22628 Rows, 13 Columns


In [17]:
# Show the unique/missing counts as a wide table with one column per variable
(
    pd.DataFrame(unique_count, columns=["Column", "Unique", "Missing"])
    .set_index("Column")
    .transpose()
)

Column,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
Unique,1172,77,13984,22621,5,2,82,3,6,20
Missing,0,0,2966,0,0,0,0,0,0,0


## Interpretation
### There are approximately 3,000 missing values, all in the Title column (about 13% of rows, or roughly 1% of all cells in the dataset). The dataset will not be trimmed further, since the review text body is the only variable that must be complete.

### Amongst the categorical variables, the high unique count of Clothing ID and Class Names will require non-visual exploratory methods.

In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count,Word Count,Character Count,Label
count,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0,22628.0
mean,919.695908,43.28288,4.183092,0.818764,2.631784,60.21195,308.761534,0.895263
std,201.683804,12.328176,1.115911,0.385222,5.78752,28.533053,143.934126,0.306222
min,1.0,18.0,1.0,0.0,0.0,2.0,9.0,0.0
25%,861.0,34.0,4.0,1.0,0.0,36.0,186.0,1.0
50%,936.0,41.0,5.0,1.0,1.0,59.0,302.0,1.0
75%,1078.0,52.0,5.0,1.0,3.0,88.0,459.0,1.0
max,1205.0,99.0,5.0,1.0,122.0,115.0,508.0,1.0


In [19]:
# Same summary as the previous cell with rows and columns swapped (one row per variable)
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Clothing ID,22628.0,919.695908,201.683804,1.0,861.0,936.0,1078.0,1205.0
Age,22628.0,43.28288,12.328176,18.0,34.0,41.0,52.0,99.0
Rating,22628.0,4.183092,1.115911,1.0,4.0,5.0,5.0,5.0
Recommended IND,22628.0,0.818764,0.385222,0.0,1.0,1.0,1.0,1.0
Positive Feedback Count,22628.0,2.631784,5.78752,0.0,0.0,1.0,3.0,122.0
Word Count,22628.0,60.21195,28.533053,2.0,36.0,59.0,88.0,115.0
Character Count,22628.0,308.761534,143.934126,9.0,186.0,302.0,459.0,508.0
Label,22628.0,0.895263,0.306222,0.0,1.0,1.0,1.0,1.0


In [22]:
# Transposed numeric summary with the "count" column left out
df.describe().transpose().drop(columns="count")

Unnamed: 0,mean,std,min,25%,50%,75%,max
Clothing ID,919.695908,201.683804,1.0,861.0,936.0,1078.0,1205.0
Age,43.28288,12.328176,18.0,34.0,41.0,52.0,99.0
Rating,4.183092,1.115911,1.0,4.0,5.0,5.0,5.0
Recommended IND,0.818764,0.385222,0.0,1.0,1.0,1.0,1.0
Positive Feedback Count,2.631784,5.78752,0.0,0.0,1.0,3.0,122.0
Word Count,60.21195,28.533053,2.0,36.0,59.0,88.0,115.0
Character Count,308.761534,143.934126,9.0,186.0,302.0,459.0,508.0
Label,0.895263,0.306222,0.0,1.0,1.0,1.0,1.0


In [28]:
# df.describe() on its own reports mean, min, max, etc. for the numeric columns.
# Passing include=["O"] selects the object (string) columns instead and
# reports count, unique, top and freq for each of them.
df.describe(include=["O"]).transpose()

Unnamed: 0,count,unique,top,freq
Title,19662,13983,Love it!,136
Review Text,22628,22621,Perfect fit and i've gotten so many compliment...,3
Division Name,22628,3,General,13365
Department Name,22628,6,Tops,10048
Class Name,22628,20,Dresses,6145


In [29]:
# unique / top / freq for the title and the three category columns
(
    df[["Title", "Division Name", "Department Name", "Class Name"]]
    .describe()
    .transpose()
    .drop(columns="count")
)

Unnamed: 0,unique,top,freq
Title,13983,Love it!,136
Division Name,3,General,13365
Department Name,6,Tops,10048
Class Name,20,Dresses,6145


In [None]:
# Just an overview. I want to explore these numbers using visualizations.

### Age and Positive Feedback Count Distributions:

In [None]:
# Continuous Distributions

f, ax = plt.subplots(1,3,figsize=(12,4), sharey=False)
sns.distplot(df.Age, ax=ax[0])