In [50]:
%matplotlib inline

# To reload external scripts automatically
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

# Importing external files
import sys
sys.path.append('scripts/')
from data_import import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0-Exploratory
This notebook contains a first exploration of the data, to identify what we need and what we do not need. It only works with subsets of the datasets, as the full datasets do not fit into memory.

In [51]:
# Root of the project data, relative to this notebook.
DATA_FOLDER = "../../Project-Data/"

# Sub-folders holding the metadata files and the review files.
META_FOLDER = "{}meta/".format(DATA_FOLDER)
REVIEWS_FOLDER = "{}reviews/".format(DATA_FOLDER)

# Category names; the position in this list is what the get_paths calls
# further down pass as their first argument.
CATEGORIES = [
    'Books',
    'CDs_and_Vinyl',
    'Electronics',
    'Movies_and_Tv',
]

## 0.0-Content of the files

The different columns of the **metadata** files are : 
* ```asin``` : the unique identifier of the object
* ```brand```
* ```categories``` : the categories of the object
* ```description``` : the description of the object
* ```imUrl```  : the link to the images related to the object
* ```price```
* ```related``` : a list of objects that are related to this object
* ```salesRank``` 
* ```title```

The different columns of the **reviews** files are :
* ```asin``` : the unique identifier of the object
* ```helpful``` : a list of 2 integers [x, y]; the helpfulness score is x out of y votes
* ```overall``` : the rating of the object
* ```reviewText```
* ```reviewTime```
* ```reviewerID```
* ```reviewerName```
* ```summary``` : the title of the review
* ```unixReviewTime``` : in Unix format

Therefore, we keep only the columns that are of interest for our task.

In [52]:
# Columns kept from the metadata files.
meta_interesting_cols = [
    'asin',
    'title',
    'salesRank',
    'description',
]

# Columns kept from the review files.
review_interesting_cols = [
    'asin',
    'overall',
    'unixReviewTime',
]

## 0.1-Books

In [54]:
# Paths of the metadata file and the review file for the Books category.
# The category position is looked up by name rather than hard-coded; with the
# CATEGORIES list defined earlier in this notebook it evaluates to 0.
meta_books_path, review_books_path = get_paths(
    CATEGORIES.index('Books'),
    DATA_FOLDER,
    META_FOLDER,
    REVIEWS_FOLDER,
    CATEGORIES,
)

### 0.1.1-Metadata

In [36]:
# Load the columns of interest from the Books metadata file.
# The second positional argument is True for metadata files and False for
# review files throughout this notebook.
# NOTE(review): max_count presumably caps the number of records read - confirm
# against scripts/data_import.
meta_books = import_interesting_cols(
    meta_books_path,
    True,
    meta_interesting_cols,
    max_count=100,
)
meta_books.head()

Unnamed: 0,asin,description,salesRank_Books,title
0,1048791,,6334800.0,"The Crucible: Performed by Stuart Pankin, Jero..."
1,1048775,William Shakespeare is widely regarded as the ...,13243226.0,Measure for Measure: Complete &amp; Unabridged
2,1048236,"&#34;One thing is certain, Sherlockians, put a...",8973864.0,The Sherlock Holmes Audio Collection
3,401048,,6448843.0,The rogue of publishers' row;: Confessions of ...
4,1019880,,9589258.0,Classic Soul Winner's New Testament Bible


### 0.1.2-Reviews

In [37]:
# Load the columns of interest from the Books review file
# (second positional argument is False for review files in this notebook).
review_books = import_interesting_cols(
    review_books_path,
    False,
    review_interesting_cols,
    max_count=10000,
)
review_books.head()

Unnamed: 0,asin,overall,unixReviewTime
0,116,4.0,2002-04-27
1,116,1.0,2014-03-24
2,868,4.0,2002-09-11
3,13714,4.0,2013-10-31
4,13714,5.0,2013-07-27


## 0.2-CDs and Vinyls

In [56]:
# Paths of the metadata file and the review file for the CDs and Vinyl category.
# The category position is looked up by name rather than hard-coded; with the
# CATEGORIES list defined earlier in this notebook it evaluates to 1.
meta_cd_path, review_cd_path = get_paths(
    CATEGORIES.index('CDs_and_Vinyl'),
    DATA_FOLDER,
    META_FOLDER,
    REVIEWS_FOLDER,
    CATEGORIES,
)

### 0.2.1-Metadata

In [39]:
# Load the columns of interest from the CDs and Vinyl metadata file
# (second positional argument is True for metadata files in this notebook).
# dropna=False is passed explicitly here, unlike in the Books cells.
meta_cds = import_interesting_cols(
    meta_cd_path,
    True,
    meta_interesting_cols,
    max_count=100,
    dropna=False,
)
meta_cds.head()

Unnamed: 0,asin,description,salesRank_Movies_&_TV,salesRank_Music,title
0,1501348,"Lenny LeBlanc, Alex Acuna, Justo Almario, Tom ...",359265.0,,Lift Him Up With Ron Kenoly [VHS]
1,1393774,Audio CD,,41017.0,Songs for the Shepherd
2,5123909,18 Music Videos for Kids: Do Your Ears Hang Lo...,451209.0,,Silly Songs: 18 Wholesome Fun Songs for Kids [...
3,5072298,,,350804.0,Hymns: 16 Classic Hymns for Children
4,5224896,,,347825.0,"Voice of the Wind: Personal Worship, Vol. 1"


### 0.2.2-Reviews

In [40]:
# Load the columns of interest from the CDs and Vinyl review file
# (second positional argument is False for review files in this notebook).
review_cds = import_interesting_cols(
    review_cd_path, False, review_interesting_cols,
    max_count=100, dropna=False)
review_cds.head()

Unnamed: 0,asin,overall,unixReviewTime
0,1393774,5.0,2013-08-31
1,1393774,5.0,2013-07-02
2,1393774,5.0,2014-04-02
3,1393774,5.0,2014-02-15
4,1393774,5.0,2005-11-01


## 0.3-Electronics

In [57]:
# Paths of the metadata file and the review file for the Electronics category.
# The category position is looked up by name rather than hard-coded; with the
# CATEGORIES list defined earlier in this notebook it evaluates to 2.
meta_electronic_path, review_electronic_path = get_paths(
    CATEGORIES.index('Electronics'),
    DATA_FOLDER,
    META_FOLDER,
    REVIEWS_FOLDER,
    CATEGORIES,
)

### 0.3.1-Metadata

In [58]:
# Load the columns of interest from the Electronics metadata file
# (second positional argument is True for metadata files in this notebook).
meta_electronic = import_interesting_cols(
    meta_electronic_path, True, meta_interesting_cols,
    max_count=100, dropna=False)
# Only the first two rows are displayed for this frame.
meta_electronic.head(2)

Unnamed: 0,asin,description,salesRank_Cell_Phones_&_Accessories,salesRank_Electronics,salesRank_Software,salesRank_Sports_Outdoors,title
0,132793040,The Kelby Training DVD Mastering Blend Modes i...,,,,,Kelby Training DVD: Mastering Blend Modes in A...
1,321732944,,,,,,Kelby Training DVD: Adobe Photoshop CS5 Crash ...


### 0.3.2-Reviews

In [48]:
# Load the columns of interest from the Electronics review file
# (second positional argument is False for review files in this notebook).
review_electronic = import_interesting_cols(
    review_electronic_path, False, review_interesting_cols,
    max_count=100, dropna=False)
review_electronic.head()

Unnamed: 0,asin,overall,unixReviewTime
0,132793040,5.0,2013-04-13
1,321732944,5.0,2012-07-01
2,439886341,1.0,2013-04-29
3,439886341,3.0,2013-07-22
4,439886341,1.0,2012-04-18
