# Transforming data into features

**Objective:**  Transform some of the features to make the data more useful for analysis.

* Transforming categorical data.
* Scaling the data.
* Work with date-time features.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Import the dataset
reviews = pd.read_csv("transform-data-into-features/reviews.csv")
reviews.head()

Unnamed: 0,clothing_id,age,review_title,review_text,recommended,division_name,department_name,review_date,rating
0,1095,39,"Cute,looks like a dress on",If you are afraid of the jumpsuit trend but li...,True,General,Dresses,2019-07-08,Liked it
1,1095,28,"So cute, great print!",I love fitted top dresses like this but i find...,True,General,Dresses,2019-05-17,Loved it
2,699,37,So flattering!,"I love these cozy, fashionable leggings. they ...",True,Initmates,Intimate,2019-06-24,Loved it
3,1072,36,Effortless,"Another reviewer said it best, ""i love the way...",True,General Petite,Dresses,2019-12-06,Loved it
4,1094,32,You need this!,Rompers are my fav so i'm biased writing this ...,True,General,Dresses,2019-10-04,Loved it


In [3]:
# Review the columns names
reviews.columns

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')

In [4]:
# Review the features data types
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB


In [5]:
# Get the values for the recommended feature - ideally it should be transformed to numerical
reviews["recommended"]

0        True
1        True
2        True
3        True
4        True
        ...  
4995     True
4996    False
4997     True
4998     True
4999     True
Name: recommended, Length: 5000, dtype: bool

In [6]:
# Transform the recommended feature data type

# Creating a binary dictionary
binary_dict = {True:1, False: 0}
# Use the binary dictionary to transform the column
reviews["recommended"] = reviews["recommended"].map(binary_dict)
reviews["recommended"]

0       1
1       1
2       1
3       1
4       1
       ..
4995    1
4996    0
4997    1
4998    1
4999    1
Name: recommended, Length: 5000, dtype: int64

In [7]:
# Get the values for the rating feature as well
reviews["rating"]

0       Liked it
1       Loved it
2       Loved it
3       Loved it
4       Loved it
          ...   
4995    Loved it
4996    Hated it
4997    Loved it
4998    Loved it
4999    Loved it
Name: rating, Length: 5000, dtype: object

In [8]:
reviews["rating"].value_counts()

rating
Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: count, dtype: int64

In [9]:
# Transform the rating feature data type
# loved it: 5, liked it: 4, was okay: 3, not great: 2, hated it: 1

# Creating the dictionary
rating_dict = {"Loved it":5, "Liked it":4, "Was okay":3, "Not great":2, "Hated it": 1}
# Transform the column
reviews["rating"] = reviews["rating"].map(rating_dict)
reviews["rating"]

0       4
1       5
2       5
3       5
4       5
       ..
4995    5
4996    1
4997    5
4998    5
4999    5
Name: rating, Length: 5000, dtype: int64

In [10]:
# Get the values for the department_name feature

reviews["department_name"].value_counts()

department_name
Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: count, dtype: int64

In [11]:
# Transform the feature data types using one-hot encode

# Perform get_dummies
ohe = pd.get_dummies(reviews["department_name"])
# Join the new columns onto the original
reviews = reviews.join(ohe)
reviews.columns

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating', 'Bottoms',
       'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
      dtype='object')

In [14]:
# Get the data for the reviews_date feature
reviews["review_date"]

0       2019-07-08
1       2019-05-17
2       2019-06-24
3       2019-12-06
4       2019-10-04
           ...    
4995    2019-05-26
4996    2019-10-21
4997    2019-10-18
4998    2019-11-24
4999    2019-10-31
Name: review_date, Length: 5000, dtype: object

In [18]:
# Transform review_date to date-time data
reviews["review_date"] = pd.to_datetime(reviews["review_date"])
reviews["review_date"]

0      2019-07-08
1      2019-05-17
2      2019-06-24
3      2019-12-06
4      2019-10-04
          ...    
4995   2019-05-26
4996   2019-10-21
4997   2019-10-18
4998   2019-11-24
4999   2019-10-31
Name: review_date, Length: 5000, dtype: datetime64[ns]

In [20]:
# Get numerical columns
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()
# Reset index
reviews = reviews.set_index(reviews["clothing_id"])

In [21]:
# Instantiate Standard Scaler
scaler = StandardScaler()
# Fit transform data
scaler.fit_transform(reviews)

array([[ 0.85669131, -0.34814459,  0.44742824, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [ 0.85669131, -1.24475223,  0.44742824, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-1.06545809, -0.51116416,  0.44742824, ..., -0.21656679,
        -0.88496718, -0.07504356],
       ...,
       [ 0.81300609, -0.59267395,  0.44742824, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [ 0.55574873, -1.24475223,  0.44742824, ..., -0.21656679,
        -0.88496718, -0.07504356],
       [-0.33251728,  1.68960003,  0.44742824, ..., -0.21656679,
         1.12998541, -0.07504356]])