In [1]:
%matplotlib inline

# To reload external scripts automatically
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

# Importing external files
import sys
sys.path.append('scripts/')
from data_import import *
from similarities import *

# 0-Exploratory
This notebook contain the first study of the data to identify what is needed and what we do not need. It will only work with subsets of datasets as they will not fit into memory.

In [2]:
DATA_FOLDER = "../../Project-Data/"
META_FOLDER = DATA_FOLDER + "meta/"
REVIEWS_FOLDER = DATA_FOLDER + "reviews/"
CORE_FOLDER = DATA_FOLDER + "5_core/"
DUMP_FOLDER = DATA_FOLDER + "dump/"
CATEGORIES = ['Books','Movies_and_Tv','Electronics']
MAXCOUNT = 10

## 0.0-Content of the files

The different columns of the **metadata** files are : 
* ```asin``` : the unique identifier of the object
* ```brand```
* ```categories``` : the categories of the object
* ```description``` : the description of the object
* ```imUrl```  : the link toward the images related to the object
* ```price```
* ```related``` : a list of objects that are related to this object
* ```salesRank``` 
* ```title```

The different columns of the **reviews** and **5-core** files are :
* ```asin``` : the unique identifier of the object
* ```helpful``` : a list of 2 integers [x,y], the helpfulness score is x/y votes
* ```overall``` : the rating of the object
* ```reviewText```
* ```reviewTime```
* ```reviewerID```
* ```reviewerName```
* ```summary ``` : the title of the review
* ```unixReviewTime``` : in Unix format

Therefore we keep only the column that are of interest for our task

In [3]:
meta_interesting_cols = ['asin', 'title', 'salesRank', 'description']
review_interesting_cols = ['asin', 'overall', 'unixReviewTime']

## 0.1-Books

In [4]:
meta_books_path, review_books_path, core_book_path = get_paths(0, DATA_FOLDER, META_FOLDER,CORE_FOLDER,
                                               REVIEWS_FOLDER, CATEGORIES)

Paths : 
	 meta = ../../Project-Data/meta/meta_Books.json
	 review = ../../Project-Data/reviews/reviews_Books.json
	 core_path = ../../Project-Data/5_core/Books.json


### 0.1.1-Metadata

In [5]:
meta_books = import_interesting_cols(meta_books_path,DUMP_FOLDER,True,meta_interesting_cols,max_count=MAXCOUNT)
meta_books.head()

Retrieving from : ../../Project-Data/dump/meta_Books_asin_title_salesRank_description_10
It took 00:00:00.002 to import the data.


Unnamed: 0,asin,description,salesRank_Books,title
0,1048791,,6334800.0,"The Crucible: Performed by Stuart Pankin, Jero..."
1,1048775,William Shakespeare is widely regarded as the ...,13243226.0,Measure for Measure: Complete & Unabridged
2,1048236,"""One thing is certain, Sherlockians, put aside...",8973864.0,The Sherlock Holmes Audio Collection
3,401048,,6448843.0,The rogue of publishers' row;: Confessions of ...
4,1019880,,9589258.0,Classic Soul Winner's New Testament Bible


### 0.1.2-Reviews

In [6]:
review_books = import_interesting_cols(
    review_books_path, DUMP_FOLDER,False, review_interesting_cols, max_count=MAXCOUNT)
review_books.head()

Retrieving from : ../../Project-Data/dump/reviews_Books_asin_overall_unixReviewTime_10
It took 00:00:00.002 to import the data.


Unnamed: 0,asin,overall,unixReviewTime
0,116,4.0,2002-04-27
1,116,1.0,2014-03-24
2,868,4.0,2002-09-11
3,13714,4.0,2013-10-31
4,13714,5.0,2013-07-27


### 0.1.3 - 5-Core

In [7]:
core_books = import_interesting_cols(
    core_book_path,
    DUMP_FOLDER,
    False,
    review_interesting_cols,
    max_count=MAXCOUNT)
core_books.head()

Retrieving from : ../../Project-Data/dump/Books_asin_overall_unixReviewTime_10
It took 00:00:00.002 to import the data.


Unnamed: 0,asin,overall,unixReviewTime
0,000100039X,5.0,2012-12-16
1,000100039X,5.0,2003-12-11
2,000100039X,5.0,2014-01-18
3,000100039X,5.0,2011-09-27
4,000100039X,5.0,2002-10-07


## 0.2-Movies and TV

In [8]:
meta_movie_path, review_movie_path, core_movie_path = get_paths(
    1, DATA_FOLDER, META_FOLDER, CORE_FOLDER, REVIEWS_FOLDER, CATEGORIES)

Paths : 
	 meta = ../../Project-Data/meta/meta_Movies_and_Tv.json
	 review = ../../Project-Data/reviews/reviews_Movies_and_Tv.json
	 core_path = ../../Project-Data/5_core/Movies_and_Tv.json


### 0.2.1-Metadata

In [9]:
meta_movie = import_interesting_cols(
    meta_movie_path,DUMP_FOLDER, True, meta_interesting_cols, max_count=MAXCOUNT, dropna=False)
meta_movie.head()

It took 00:00:00.006 to import the data.
Saved at : ../../Project-Data/dump/meta_Movies_and_Tv_asin_title_salesRank_description_10


Unnamed: 0,asin,description,salesRank_Movies_&_TV,title
0,0000143561,"3Pack DVD set - Italian Classics, Parties and ...",376041,"Everyday Italian (with Giada de Laurentiis), V..."
1,0000589012,,1084845,Why Don't They Just Quit? DVD Roundtable Discu...
2,0000695009,,1022732,Understanding Seizures and Epilepsy DVD
3,000107461X,,954116,Live in Houston [VHS]
4,0000143529,Disc 1: Flour Power (Scones; Shortcakes; South...,463562,My Fair Pastry (Good Eats Vol. 9)


### 0.2.2-Reviews

In [11]:
review_movie = import_interesting_cols(
    review_movie_path,
    DUMP_FOLDER,
    False,
    review_interesting_cols,
    max_count=100,
    dropna=False)
review_movie.head()

Retrieving from : ../../Project-Data/dump/reviews_Movies_and_Tv_asin_overall_unixReviewTime_100
It took 00:00:00.001 to import the data.


Unnamed: 0,asin,overall,unixReviewTime
0,143502,5.0,2013-01-17
1,143529,5.0,2013-10-02
2,143561,2.0,2008-07-17
3,143588,5.0,2009-03-13
4,143588,5.0,2009-01-18


## 0.3-Electronics

In [12]:
meta_electronic_path, review_electronic_path, core_electronic_path = get_paths(
    2, DATA_FOLDER, CORE_FOLDER, META_FOLDER, REVIEWS_FOLDER, CATEGORIES)

Paths : 
	 meta = ../../Project-Data/5_core/meta_Electronics.json
	 review = ../../Project-Data/reviews/reviews_Electronics.json
	 core_path = ../../Project-Data/meta/Electronics.json


### 0.3.1-Metadata

In [13]:
meta_electronic = import_interesting_cols(
    meta_electronic_path,
    DUMP_FOLDER,
    True,
    meta_interesting_cols,
    max_count=MAXCOUNT,
    dropna=False)
meta_electronic.head(2)

Retrieving from : ../../Project-Data/dump/meta_Electronics_asin_title_salesRank_description_10
It took 00:00:00.001 to import the data.


Unnamed: 0,asin,description,salesRank_Electronics,title
0,132793040,The Kelby Training DVD Mastering Blend Modes i...,,Kelby Training DVD: Mastering Blend Modes in A...
1,321732944,,,Kelby Training DVD: Adobe Photoshop CS5 Crash ...


### 0.3.2-Review

In [14]:
review_electronic = import_interesting_cols(
    review_electronic_path,
    DUMP_FOLDER,
    False,
    review_interesting_cols,
    max_count=MAXCOUNT,
    dropna=False)
review_electronic.head()

It took 00:00:00.005 to import the data.
Saved at : ../../Project-Data/dump/reviews_Electronics_asin_overall_unixReviewTime_10


Unnamed: 0,asin,overall,unixReviewTime
0,132793040,5.0,2013-04-13
1,321732944,5.0,2012-07-01
2,439886341,1.0,2013-04-29
3,439886341,3.0,2013-07-22
4,439886341,1.0,2012-04-18


## 1-Detecting similar products
In this section we try to obtain the products that are similar among a given category and to develop a systematic way to detect those similar products.

## 1.1-Books

Some books are sold in different formats : hard cover, pocket, electronic, etc. Therefore in order to compare them we will focus on the title and description that are the only two attributes that should be highly similar between the two different yet similar books.

In [None]:
meta_books = import_interesting_cols(meta_books_path,DUMP_FOLDER,True,meta_interesting_cols,max_count=MAXCOUNT)
meta_books.head()

In [None]:
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk.download('punkt') # if necessary...

stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)


def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]


def normalize(text):
    '''remove punctuation, lowercase, stem'''
    toReturn = stem_tokens(
        nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))
    return toReturn


vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0, 1]

We first try to experiment with tf-idf weights to find common titles among books. Due to the large size of the database, this task cannot be executed on all the books at first.

In [None]:
# First we drop the row if it doesn't have a title
meta_books_complete = meta_books[['title','description']].dropna(how='any')
print("We have {} book titles to base our analysis on.".format(meta_books_complete['title'].size))
meta_books_titles.head()

In [None]:
# We pick a random title 
book_1 = meta_books_complete.sample(n=1)
book_1

In [None]:
book_1.iloc[0,0]

In [None]:
sub_sample = meta_books_titles.sample(frac=0.01)
sub_sample['title_similarity'] = sub_sample['title'].map(lambda x: cosine_sim(x,book_1.iloc[0,0]))
sub_sample['description_similarity'] = sub_sample['description'].map(lambda x: cosine_sim(x,book_1.iloc[0,1]))
sub_sample.sort_values(['title_similarity'],ascending=False,inplace=True)
sub_sample.head()

In [None]:
df = pd.DataFrame(np.column_stack([sub_sample,similarities]))
df.columns = ['title','cos_sim']
df.sort_values(['cos_sim'],ascending=False,inplace=True)
df.head(20)