In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
google_df = pd.read_csv('./datasets/Amazon-GoogleProducts/GoogleProducts.csv', encoding='unicode_escape')

In [3]:
google_df.columns = ['source_id', 'name', 'description', 'manufacturer', 'price']

In [4]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,38.99
1,http://www.google.com/base/feeds/snippets/1153...,superstart! fun with reading & writing!,fun with reading & writing! is designed to hel...,,8.49
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6.0 basic software,qb pos 6.0 basic retail mngmt software. for re...,intuit,637.99
3,http://www.google.com/base/feeds/snippets/1204...,math missions: the amazing arcade adventure (g...,save spectacle city by disrupting randall unde...,,12.95
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,805.99


In [5]:
google_df.isnull().sum(axis=0)

source_id          0
name               0
description      191
manufacturer    2994
price              0
dtype: int64

In [6]:
amazon_df = pd.read_csv('./datasets/Amazon-GoogleProducts/Amazon.csv', encoding='unicode_escape')

In [7]:
amazon_df.columns = ['source_id', 'name', 'description', 'manufacturer', 'price']

In [8]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 - premier image pack (dvd-rom),,broderbund,0.0
1,b0006zf55o,ca international - arcserve lap/desktop oem 30pk,oem arcserve backup v11.1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99


In [9]:
amazon_df.isnull().sum(axis=0)

source_id         0
name              0
description     115
manufacturer      0
price             0
dtype: int64

In [10]:
# Replace all NaN
string_cols = ['name', 'description', 'manufacturer']
for col in string_cols:
    google_df[col].fillna('Unknown', inplace=True)
    amazon_df[col].fillna('Unknown', inplace=True)

#     google_df[col] = [val and val.lower().strip() for val in list(google_df[col])]
#     amazon_df[col] = [val and val.lower().strip() for val in list(amazon_df[col])]

In [11]:
google_df.isnull().sum(axis=0)    

source_id       0
name            0
description     0
manufacturer    0
price           0
dtype: int64

In [12]:
amazon_df.isnull().sum(axis=0)

source_id       0
name            0
description     0
manufacturer    0
price           0
dtype: int64

In [13]:
# Convert price column to float
import re
amazon_df['price'].astype('float', errors='ignore')
google_df['price'] = [ float(re.sub('\D', '', val)) for val in list(google_df['price'])]

In [14]:
# Convert all text to lower
def to_lower(row):
    for col in string_cols:
        row[col] = row[col] and row[col].lower().strip() 
        row[col] = re.sub('\W', ' ', row[col])
        row[col] = re.sub('\s{2,}', ' ', row[col])
    return row

google_df = google_df.apply(to_lower, axis='columns')
amazon_df = amazon_df.apply(to_lower, axis='columns')

In [15]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.0
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.0
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.0
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.0
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.0


In [16]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price
0,b000jz4hqo,clickart 950 000 premier image pack dvd rom,unknown,broderbund,0.0
1,b0006zf55o,ca international arcserve lap desktop oem 30pk,oem arcserve backup v11 1 win 30u for laptops ...,computer associates,0.0
2,b00004tkvy,noah s ark activity center jewel case ages 3 8,unknown,victory multimedia,0.0
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99
4,b0006se5bq,singing coach unlimited,singing coach unlimited electronic learning pr...,carry a tune technologies,99.99


In [17]:
google_df['source'] = 'google'

In [18]:
amazon_df['source'] = 'amazon'

In [19]:
google_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.0,google
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.0,google
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.0,google
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.0,google
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.0,google


In [20]:
amazon_df.head()

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,b000jz4hqo,clickart 950 000 premier image pack dvd rom,unknown,broderbund,0.0,amazon
1,b0006zf55o,ca international arcserve lap desktop oem 30pk,oem arcserve backup v11 1 win 30u for laptops ...,computer associates,0.0,amazon
2,b00004tkvy,noah s ark activity center jewel case ages 3 8,unknown,victory multimedia,0.0,amazon
3,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99,amazon
4,b0006se5bq,singing coach unlimited,singing coach unlimited electronic learning pr...,carry a tune technologies,99.99,amazon


In [21]:
df = pd.concat([google_df, amazon_df])

In [22]:
df

Unnamed: 0,source_id,name,description,manufacturer,price,source
0,http://www.google.com/base/feeds/snippets/1112...,learning quickbooks 2007,learning quickbooks 2007,intuit,3899.00,google
1,http://www.google.com/base/feeds/snippets/1153...,superstart fun with reading writing,fun with reading writing is designed to help k...,unknown,849.00,google
2,http://www.google.com/base/feeds/snippets/1134...,qb pos 6 0 basic software,qb pos 6 0 basic retail mngmt software for ret...,intuit,63799.00,google
3,http://www.google.com/base/feeds/snippets/1204...,math missions the amazing arcade adventure gra...,save spectacle city by disrupting randall unde...,unknown,1295.00,google
4,http://www.google.com/base/feeds/snippets/1224...,production prem cs3 mac upgrad,adobe cs3 production premium mac upgrade from ...,adobe software,80599.00,google
...,...,...,...,...,...,...
1358,b000cs3s2c,flash remoting 1 alp ret eng cd 2u,marketing information macromedia flash remoti...,adobe,3314.09,amazon
1359,b00005bigp,shapes,unknown,school zone,9.99,amazon
1360,b000h1df7w,dragon naturally speaking standard v9,dragon naturallyspeaking 9 standard edition gi...,nuance communications inc,99.99,amazon
1361,b000p9cr66,mediarecover,mediarecover gives you the ability to recover ...,aladdin systems,29.99,amazon


In [23]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [24]:
doc = nlp("Tea is healthy and calming, don't you think?")

In [49]:
google_df.shape

(3226, 6)

In [50]:
amazon_df.shape

(1363, 6)

In [69]:
a = '123'

In [73]:
list(a).pop(0)

'1'

In [75]:
a = set()

In [76]:
len(a)

0

In [77]:
a = '123'

In [78]:
set(a)

{'1', '2', '3'}

In [93]:
from functools import reduce
dict_s = {'a': 2, 'b': 2, 'c': 1}
balance = reduce(lambda a, b: sum(list(dict_s.values()))/len(list(dict_s.values())) - b, list(dict_s.values()))

In [96]:
balance

0.6666666666666667

In [109]:
def isValid(s):
    # Write your code here
    dict_s = {}
    for c in s:
        if c in dict_s:
            dict_s[c] += 1
        else:
            dict_s[c] = 1
        
    inv_dict = defaultdict(list)
    for k, v in dict_s.items():
        inv_dict[v].append(k)
        
    set_values = list(set(dict_s.values()))
    if len(set_values) > 2:
        return 'NO'
    if len(set_values) == 1:
        return 'YES'
    if len(set_values) == 2:
        set_a = inv_dict[set_values[0]]
        set_b = inv_dict[set_values[1]]
        print(set_values[0], set_a)
        print(set_values[1], set_b)        
        if (set_values[0] == 1 and len(set_a) == 1) or (set_values[1] == 1 and len(set_b) == 1):
            return 'YES'
        elif abs(set_values[0] - set_values[1]) == 1:
            min_val_set = set_b if len(set_b) < len(set_a) else set_a
            return 'YES' if len(min_val_set) == 1 else 'NO'
        return 'NO' 

In [110]:
isValid('abcdefghhgfedecba')

2 ['a', 'b', 'c', 'd', 'f', 'g', 'h']
3 ['e']


'YES'

In [115]:
sub = '121'
math.ceil(len(sub)/2)

2

In [112]:
import math

In [84]:
import math
import os
import random
import re
import sys
from itertools import combinations

# Complete the substrCount function below.
def substrCount(n, s):
    def is_special(sub, l):
        if not sub:
            return False
        if l == 1:
            return True
        set_sub = set(sub)
        if len(set_sub) == 1:
            return True
        if l % 2 == 0:
            return False
        else:
            return len(set_sub) == 2 and sub[math.floor(l/2)] != sub[0] and sub[math.floor(l/2)] != sub[-1]
    
    special_strs = n
    for x in range(n):
        for y in range(x+2, n+1):
            if is_special(s[x:y], y-x):
                special_strs += 1
               
    return special_strs

In [85]:
substrCount(8, 'mnonopoo')

non
ono
opo
oo


12

In [99]:
s1 = 'HARRY'
s2 = 'SALLY'
print(sorted([s1, s2]))

['HARRY', 'SALLY']


In [100]:
def commonChild(s1, s2):
    # Write your code here
    s1, s2 = sorted([s1, s2])
    common = 0
    remember_jdx = 0   
    jdx = 0 
    start = 0
    idx = 0
    while (idx < len(s1)):
        if jdx == len(s2):
            jdx = remember_jdx
        while (jdx < len(s2)):
            if s1[idx] == s2[jdx]:
                common += 1
                jdx += 1
                remember_jdx = jdx
                break
            jdx += 1
        idx += 1
            
    return common

In [101]:
s1 = 'WEWOUCUIDGCGTRMEZEPXZFEJWISRSBBSYXAYDFEJJDLEBVHHKS'