In [0]:
!pip install eli5



In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [0]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_csv('data/men_shoes.csv' , low_memory=False)

In [0]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension', 'ean', 'features',
       'flavors', 'imageurls', 'isbn', 'keys', 'manufacturer',
       'manufacturernumber', 'merchants', 'name', 'prices_amountmin',
       'prices_amountmax', 'prices_availability', 'prices_color',
       'prices_condition', 'prices_count', 'prices_currency',
       'prices_dateadded', 'prices_dateseen', 'prices_flavor', 'prices_issale',
       'prices_merchant', 'prices_offer', 'prices_returnpolicy',
       'prices_shipping', 'prices_size', 'prices_source', 'prices_sourceurls',
       'prices_warranty', 'quantities', 'reviews', 'sizes', 'skus',
       'sourceurls', 'upc', 'vin', 'websiteids', 'weight'],
      dtype='object')

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  x = df[ feats ].values
  y = df['prices_amountmin' ].values

  scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [0]:
df['brand_cat'] = df[ 'brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [0]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'
literal_eval(str_dict)

[{'key': 'Gender', 'value': ['Men']},
 {'key': 'Shoe Size', 'value': ['M']},
 {'key': 'Shoe Category', 'value': ["Men's Shoes"]},
 {'key': 'Color', 'value': ['Multicolor']},
 {'key': 'Manufacturer Part Number', 'value': ['8190-W-NAVY-7.5']},
 {'key': 'Brand', 'value': ['Josmo']}]

In [0]:
test = {'key': 'value'}
test['key']
str(test)

"{'key': 'value'}"

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"', '"') )
  for item in features:
   
    key = item['key'].lower().strip()
    value = item['value'] [0].lower().strip()

    output_dict[key] = value
  

  return output_dict


df['features_parsed'] = df['features'].map(parse_features)

In [0]:
keys = set()
df['features_parsed'].map( lambda x: keys.update(x.keys()) )
len(keys)

476

In [0]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [0]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_era', 'feat_inv', 'feat_ring style',
       'feat_country of origin components:', 'feat_auto', 'feat_brand_cat',
       'feat_color_cat', 'feat_gender_cat',
       'feat_manufacturer part number_cat', 'feat_material_cat'],
      dtype='object', length=531)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0] / df.shape[0] *100

In [0]:
{k:v for k,v in keys_stat.items() if v > 30}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]

for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]
  

In [0]:
df['brand'] = df['brand'].map(lambda x: str(x).lower() )
df[ df.brand == df.feat_brand ].shape

(8846, 1002)

In [0]:
feats = ['']

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'], model)

(-57.32618304614882, 4.162376020850052)

In [0]:
feats_cat = [x for x in df.columns if 'cat' in x]
feats_cat

['categories',
 'brand_cat',
 'feat_recommended location',
 'feat_fabrication',
 'feat_catalog',
 'feat_shoe category',
 'feat_certifications and listings',
 'feat_clothing category',
 'feat_multi pack indicator',
 'feat_location - city/state',
 'feat_location - country',
 'feat_brand_cat',
 'feat_color_cat',
 'feat_gender_cat',
 'feat_manufacturer part number_cat',
 'feat_material_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_safety feature_cat',
 'feat_sleeve style_cat',
 'feat_video game platform_cat',
 'feat_bed size_cat',
 'feat_is waterproof_cat',
 'feat_clothing type_cat',
 'feat_light transmission (vlt)_cat',
 'feat_batteries required?_cat',
 'feat_mpn#_cat',
 'feat_age group_cat',
 'feat_applicable_cat',
 'feat_season_cat',
 'feat_number of items_cat',
 'feat_dial color_cat',
 'feat_number of compartments_cat',
 'feat_fabric care_cat',
 'feat_lens technology_cat',
 'feat_lens socket width_cat',
 'feat_material detail-1_cat',
 'feat_accessory type_cat',
 'feat_removable li

In [0]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_metal type_cat', 'feat_shape_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_style_cat', 'feat_sport_cat'  ]



model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(feats, model)

In [0]:
x = df[ feats ].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(x, y)

print(result)
perm = PermutationImportance(m, random_state =1).fit(x, y);
eli5.show_weights(perm, feature_names=feats)

(-57.19190872876932, 4.184551207644387)


Weight,Feature
0.2562  ± 0.0077,brand_cat
0.1035  ± 0.0075,feat_material_cat
0.0253  ± 0.0026,feat_gender_cat
0.0178  ± 0.0008,feat_brand_cat
0.0126  ± 0.0006,feat_shape_cat
0.0089  ± 0.0012,feat_metal type_cat
0.0031  ± 0.0011,feat_style_cat
0.0002  ± 0.0000,feat_sport_cat


In [0]:
df[ df['brand'] == 'nike'].features_parsed.sample(5).values

array([{'sport': 'soccer', 'main color': 'royal blue/white-bright blue', 'material': 'synthetic leather', 'gender': 'mens', 'country/region of manufacture': 'vietnam', 'country of manufacture': 'vietnam', 'type': 'cleats', 'condition': 'new with box'},
       {'type': 'cleats', 'condition': 'new with box'},
       {'sport': 'football', 'main color': 'purple & green', 'type': 'cleats'},
       {'gender': 'men', 'shoe size': '10', 'shoe category': "men's shoes", 'color': 'photo blue/laser orange-deep royal blue', 'brand': 'nike'},
       {'style': 'running, cross training', 'condition': 'new without box'}],
      dtype=object)

In [0]:
df['feat_age group'].value_counts()

adult               4563
men                  350
child                 77
men's                 33
unisex                 6
infant                 4
toddler                4
mens                   4
boys'                  3
women ,�� unisex       2
women                  2
youth                  2
men||women             2
adult ,�� teen         1
12 up                  1
Name: feat_age group, dtype: int64

In [0]:
# SPOSÓB NA WYSZUKIWANIE MOICH PLIKÓW i WRZUCANIE ICH DO GITHUB !

In [386]:
HOME= ROOT + '/My Drive/Colab Notebooks'
HOME

'/content/drive/My Drive/Colab Notebooks'

In [387]:
GITHUB_TOKEN = '40071ce1a694987a2ea3e7be1d8ce624122b4449'
GITHUB_URL = 'https://{0}@github.com/halfron78/dw_matrix.git'.format(GITHUB_TOKEN)
GITHUB_URL

'https://40071ce1a694987a2ea3e7be1d8ce624122b4449@github.com/halfron78/dw_matrix.git'

In [388]:
!git clone {GITHUB_URL}

Cloning into 'dw_matrix'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects:   5% (1/20)[Kremote: Counting objects:  10% (2/20)[Kremote: Counting objects:  15% (3/20)[Kremote: Counting objects:  20% (4/20)[Kremote: Counting objects:  25% (5/20)[Kremote: Counting objects:  30% (6/20)[Kremote: Counting objects:  35% (7/20)[Kremote: Counting objects:  40% (8/20)[Kremote: Counting objects:  45% (9/20)[Kremote: Counting objects:  50% (10/20)[Kremote: Counting objects:  55% (11/20)[Kremote: Counting objects:  60% (12/20)[Kremote: Counting objects:  65% (13/20)[Kremote: Counting objects:  70% (14/20)[Kremote: Counting objects:  75% (15/20)[Kremote: Counting objects:  80% (16/20)[Kremote: Counting objects:  85% (17/20)[Kremote: Counting objects:  90% (18/20)[Kremote: Counting objects:  95% (19/20)[Kremote: Counting objects: 100% (20/20)[Kremote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects:   6% (1/16)[Kremote

In [389]:
cd dw_matrix

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [390]:
ls

HelloGithub.ipynb  LICENSE  [0m[01;34mmatrix_one[0m/  README.md


In [391]:
!git add day5.ipynb

fatal: pathspec 'day5.ipynb' did not match any files


In [378]:
!git commit -m "add day5.ipynb file to GITHUB"

fatal: not a git repository (or any parent up to mount point /content)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


In [0]:
!git config --global user.email "t.laciak78@gmail.com"
!git config --global user.name "halfron78"

In [339]:
# !git clone 'https://github.com/halfron78/dw_matrix.git'
# Username: 'halfron78'
# Password: '40071ce1a694987a2ea3e7be1d8ce624122b4449'

fatal: destination path 'dw_matrix' already exists and is not an empty directory.


In [353]:
!git push -u origin master

fatal: could not read Password for 'https://e829c351f69bcc8f59edafe085daabd8987a169c@github.com': No such device or address
