In [9]:
import pandas as pd
import numpy as np
import pickle

In [292]:
google_df = pd.read_csv('./datasets/Amazon-GoogleProducts/GoogleProducts.csv', encoding='unicode_escape')

In [293]:
google_df.columns = ['source_id', 'name', 'description', 'manufacturer', 'price']

In [294]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,38.99
1,http://www.google.com/base/feeds/snippets/1153...,superstart! fun with reading & writing!,fun with reading & writing! is designed to hel...,,8.49
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6.0 basic software,qb pos 6.0 basic retail mngmt software. for re...,intuit,637.99
3,http://www.google.com/base/feeds/snippets/1204...,math missions: the amazing arcade adventure (g...,save spectacle city by disrupting randall unde...,,12.95
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,805.99


In [295]:
google_df.isnull().sum(axis=0)

source_id          0
name               0
description      191
manufacturer    2994
price              0
dtype: int64

In [296]:
amazon_df = pd.read_csv('./datasets/Amazon-GoogleProducts/Amazon.csv', encoding='unicode_escape')

In [297]:
amazon_df.columns = ['source_id', 'name', 'description', 'manufacturer', 'price']

In [298]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 - premier image pack (dvd-rom),,broderbund,0.0
1,b0006zf55o,ca international - arcserve lap/desktop oem 30pk,oem arcserve backup v11.1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99


In [299]:
amazon_df.isnull().sum(axis=0)

source_id         0
name              0
description     115
manufacturer      0
price             0
dtype: int64

In [300]:
# Replace all NaN
string_cols = ['name', 'description', 'manufacturer']
for col in string_cols:
    google_df[col].fillna('Unknown', inplace=True)
    amazon_df[col].fillna('Unknown', inplace=True)

#     google_df[col] = [val and val.lower().strip() for val in list(google_df[col])]
#     amazon_df[col] = [val and val.lower().strip() for val in list(amazon_df[col])]

In [301]:
google_df.isnull().sum(axis=0)    

source_id       0
name            0
description     0
manufacturer    0
price           0
dtype: int64

In [302]:
amazon_df.isnull().sum(axis=0)

source_id       0
name            0
description     0
manufacturer    0
price           0
dtype: int64

In [303]:
# Convert price column to float
import re
amazon_df['price'].astype('float', errors='ignore')
google_df['price'] = [ float(re.sub('\D', '', val)) for val in list(google_df['price'])]

In [304]:
# Convert all text to lower
def to_lower(row):
    for col in string_cols:
        row[col] = row[col] and row[col].lower().strip() 
        row[col] = re.sub('\W', ' ', row[col])
        row[col] = re.sub('\s{2,}', ' ', row[col])
    return row

google_df = google_df.apply(to_lower, axis='columns')
amazon_df = amazon_df.apply(to_lower, axis='columns')

In [307]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.0
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.0
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.0
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.0
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.0


In [308]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 premier image pack dvd rom,unknown,broderbund,0.0
1,b0006zf55o,ca international arcserve lap desktop oem 30pk,oem arcserve backup v11 1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah s ark activity center jewel case ages 3 8,unknown,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited electronic learning pr...,carry a tune technologies,99.99


In [309]:
google_df['source'] = 'google'

In [310]:
amazon_df['source'] = 'amazon'

In [311]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.0,google
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.0,google
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.0,google
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.0,google
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.0,google


In [312]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,b000jz4hqo,clickart 950 000 premier image pack dvd rom,unknown,broderbund,0.0,amazon
1,b0006zf55o,ca international arcserve lap desktop oem 30pk,oem arcserve backup v11 1 win 30u for laptops ...,computer associates,0.0,amazon
2,b00004tkvy,noah s ark activity center jewel case ages 3 8,unknown,victory multimedia,0.0,amazon
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99,amazon
4,b0006se5bq,singing coach unlimited,singing coach unlimited electronic learning pr...,carry a tune technologies,99.99,amazon


In [313]:
df = pd.concat([google_df, amazon_df])

In [314]:
df

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.00,google
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.00,google
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.00,google
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.00,google
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.00,google
...,...,...,...,...,...,...
1358,b000cs3s2c,flash remoting 1 alp ret eng cd 2u,marketing information macromedia flash remoti...,adobe,3314.09,amazon
1359,b00005bigp,shapes,unknown,school zone,9.99,amazon
1360,b000h1df7w,dragon naturally speaking standard v9,dragon naturallyspeaking 9 standard edition gi...,nuance communications inc,99.99,amazon
1361,b000p9cr66,mediarecover,mediarecover gives you the ability to recover ...,aladdin systems,29.99,amazon
