In [32]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

### Importing the product reviews data.

In [2]:
# Load the raw review table; the path is relative to the notebook's working directory.
product_reviews = pd.read_csv('reviews.csv')

In [3]:
# Preview the first rows to see which columns are present and what they hold.
product_reviews.head()

Unnamed: 0,asin,name,date,rating,review
0,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-06,1,I bought this hair oil after viewing so many g...
1,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-14,5,Used This Mama Earth Newly Launched Onion Oil ...
2,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-10-19,1,So bad product...My hair falling increase too ...
3,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-16,1,Product just smells similar to navarathna hair...
4,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-18,5,I have been trying different onion oil for my ...


In [4]:
# (rows, columns) of the raw table, before any cleaning.
product_reviews.shape

(2782, 5)

In [5]:
# Column dtypes and non-null counts; a non-null count below the row count flags missing values.
product_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2782 entries, 0 to 2781
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   asin    2782 non-null   object
 1   name    2782 non-null   object
 2   date    2782 non-null   object
 3   rating  2782 non-null   int64 
 4   review  2778 non-null   object
dtypes: int64(1), object(4)
memory usage: 108.8+ KB


# Missing Values

In [6]:
# Per-column missing-value summary: absolute count and share of rows in percent.
null_flags = product_reviews.isnull()
missing = pd.DataFrame({'count': null_flags.sum(), '%': 100 * null_flags.mean()})
# Ascending order (the default): fully populated columns first, sparsest last.
missing.sort_values(by=['count', '%'])

Unnamed: 0,count,%
asin,0,0.0
name,0,0.0
date,0,0.0
rating,0,0.0
review,4,0.143781


In [7]:
# Drop every row that has a missing value in any column, then renumber the
# rows 0..n-1. `reset_index(drop=True)` discards the old index directly rather
# than materialising it as an 'index' column and dropping that column
# afterwards (which would remove the wrong data if the frame ever carried a
# real column named 'index'). Rebinding the name instead of `inplace=True`
# avoids mutating the frame in place.
product_reviews = product_reviews.dropna().reset_index(drop=True)
product_reviews.head()

Unnamed: 0,asin,name,date,rating,review
0,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-06,1,I bought this hair oil after viewing so many g...
1,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-14,5,Used This Mama Earth Newly Launched Onion Oil ...
2,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-10-19,1,So bad product...My hair falling increase too ...
3,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-16,1,Product just smells similar to navarathna hair...
4,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-18,5,I have been trying different onion oil for my ...


In [8]:
# Re-check (rows, columns) to see how many rows remain at this point.
product_reviews.shape

(2778, 5)

### List of unique products

In [9]:
# Number of distinct values in the `name` column (products are counted by name here, not by `asin`).
product_reviews['name'].nunique()

122

That is, we have a total of 122 unique products in the data set provided.


### Number of reviews for each product

In [11]:
# Reviews per product name; `value_counts` sorts by count in descending order,
# so `head()` shows the most-reviewed names.
product_reviews['name'].value_counts().head()

Tata-Tea-Gold-500g                               58
Dettol-Liquid-Refill-Original-1500               40
Mamaearth-Natural-Turmeric-Saffron-brightning    40
MYSORE-SANDAL-Mysore-Sandal-Talcum               40
Cinthol-Original-Soap-100g-Pack                  40
Name: name, dtype: int64

### Next, we find the unique brands these products belong to and the number of reviews each brand has. Product names follow a hyphen-separated pattern, so the first word of the name is used as the brand. This is an approximation: multi-word brand names are truncated to their first word (e.g. `Coca`).

In [12]:
# Derive a brand label from the product name: names are hyphen-separated and
# the text before the first hyphen is taken as the brand.
product_reviews['brandName'] = (
    product_reviews['name']
    .str.split('-', n=1)
    .str.get(0)
)
product_reviews.head()

Unnamed: 0,asin,name,date,rating,review,brandName
0,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-06,1,I bought this hair oil after viewing so many g...,Mamaearth
1,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-14,5,Used This Mama Earth Newly Launched Onion Oil ...,Mamaearth
2,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-10-19,1,So bad product...My hair falling increase too ...,Mamaearth
3,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-09-16,1,Product just smells similar to navarathna hair...,Mamaearth
4,B07W7CTLD1,Mamaearth-Onion-Growth-Control-Redensyl,2019-08-18,5,I have been trying different onion oil for my ...,Mamaearth


In [13]:
# List the distinct brand labels to spot duplicates that differ only in spelling or case.
product_reviews['brandName'].unique()

array(['Mamaearth', 'Godrej', 'Titan', 'Maaza', 'Paper', 'Indiana',
       'Coca', 'Natural', 'Maggi', 'Glucon', 'Amul', 'Patanjali',
       'PATANJALI', 'Dettol', 'Savlon', 'Cinthol', 'Britannia',
       'NutriChoice', 'Streax', 'Himalaya', 'Society', 'Tata', 'Fastrack',
       'Reflex', 'MYSORE', 'Mysore'], dtype=object)

In [14]:
# Merge spellings of the same brand that differ only in letter case.
# `Series.replace` with a mapping rewrites only values that equal a key
# exactly, whereas `.str.replace` substitutes substrings and its pattern
# handling (regex vs. literal) depends on the pandas version.
product_reviews['brandName'] = product_reviews['brandName'].replace(
    {'PATANJALI': 'Patanjali', 'MYSORE': 'Mysore'}
)

In [15]:
# Number of distinct brand labels currently in the `brandName` column.
product_reviews['brandName'].nunique()

24

#### There are a total of 24 unique brands with product reviews in the dataset.

In [16]:
# Reviews per brand label, largest counts first; `head()` keeps the top entries.
product_reviews['brandName'].value_counts().head()

Cinthol      200
Himalaya     200
Titan        200
Mamaearth    200
Godrej       200
Name: brandName, dtype: int64

In [30]:
# Persist the cleaned reviews. The row index is written as well (pandas'
# default), as an unnamed first column of clean.csv.
product_reviews.to_csv(path_or_buf='clean.csv', index=True)

# We will split our data into training and test sets and save them for future reference

In [18]:
# Model input (the review column) and target (the rating column) for the split.
X, y = product_reviews['review'], product_reviews['rating']

In [19]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [24]:
# Assemble the training frame in a single step instead of growing an empty
# DataFrame column by column across several cells. Both Series come from the
# same split and share an index, so the rows line up exactly as before.
train_df = pd.DataFrame({'review': X_train, 'rating': y_train})

In [28]:
# Assemble the held-out frame in a single step instead of growing an empty
# DataFrame column by column. Both Series come from the same split and share
# an index, so the rows line up exactly as before.
test_df = pd.DataFrame({'review': X_test, 'rating': y_test})

In [29]:
# Write both splits to the working directory. The row index is written into
# each file as an unnamed first column (pandas' default).
train_df.to_csv(path_or_buf='train.csv', index=True)
test_df.to_csv(path_or_buf='test.csv', index=True)