In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
import dask.dataframe as dd
import pandas as pd
import numpy as np
import time
import re
from sklearn.externals import joblib
seed = 101


In [2]:


from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=10, threads_per_worker=20, dashboard_address=':0')
client = Client(cluster)
client



0,1
Client  Scheduler: tcp://127.0.0.1:37353  Dashboard: /user/lemonnn-8-dask-examples-gwn7xarg/proxy/44375/status,Cluster  Workers: 10  Cores: 200  Memory: 54.88 GB


In [3]:
def tokenizer(text):
    if text:
        result = re.findall('[a-z]{2,}', text.lower())
    else:
        result = []
    return result

In [4]:
df = dd.read_csv('train.tsv', sep='\t')
df = df.compute()

In [5]:
df['item_description'].fillna(value='Missing', inplace=True)
X = (df['name'] + ' ' + df['item_description']).values
y = np.log1p(df['price'].values)
with joblib.parallel_backend('dask'):
        X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.3, random_state=seed)


In [6]:
with joblib.parallel_backend('dask'):
    vect = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
    start = time.time()
    X_train_vect = vect.fit_transform(X_train)
    end = time.time()
    print('Time to train vectorizer and transform training text: %0.2fs' % (end - start))

Time to train vectorizer and transform training text: 0.03s


In [19]:
#from dask_ml.model_selection import GridSearchCV
model = SGDRegressor(loss='squared_loss', penalty='none', random_state=seed, max_iter=500000, tol=1e-6)
params = {'penalty':['none','l2','l1'],
          'alpha':[1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1]}
gs = GridSearchCV(estimator=model,
                  param_grid=params,
                  scoring='neg_mean_squared_error',
                  cv=5,
                  n_jobs=-1, verbose = 3)
start = time.time()
gs.fit(X_train_vect, y_train)
end = time.time()
print('Time to train model: %0.2fs' % (end -start))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] alpha=0.0001, penalty=none ......................................


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  alpha=0.0001, penalty=none, score=-0.7243991267042661, total=   2.1s
[CV] alpha=0.0001, penalty=none ......................................


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV]  alpha=0.0001, penalty=none, score=-0.6597777867822803, total=   1.9s
[CV] alpha=0.0001, penalty=none ......................................


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s


[CV]  alpha=0.0001, penalty=none, score=-0.5866548677506939, total=   2.0s
[CV] alpha=0.0001, penalty=none ......................................
[CV]  alpha=0.0001, penalty=none, score=-0.8116720767814963, total=   1.8s
[CV] alpha=0.0001, penalty=none ......................................
[CV]  alpha=0.0001, penalty=none, score=-0.8500709602066103, total=   1.8s
[CV] alpha=0.0001, penalty=l2 ........................................
[CV]  alpha=0.0001, penalty=l2, score=-0.685417377424021, total=   2.1s
[CV] alpha=0.0001, penalty=l2 ........................................
[CV]  alpha=0.0001, penalty=l2, score=-0.6235156975796732, total=   2.0s
[CV] alpha=0.0001, penalty=l2 ........................................
[CV]  alpha=0.0001, penalty=l2, score=-0.560044766818491, total=   2.0s
[CV] alpha=0.0001, penalty=l2 ........................................
[CV]  alpha=0.0001, penalty=l2, score=-0.7723449458165883, total=   2.0s
[CV] alpha=0.0001, penalty=l2 .............................

[CV]  alpha=0.001, penalty=l1, score=-0.6505940910531204, total=   1.7s
[CV] alpha=0.002, penalty=none .......................................
[CV]  alpha=0.002, penalty=none, score=-0.7243991267042661, total=   1.9s
[CV] alpha=0.002, penalty=none .......................................
[CV]  alpha=0.002, penalty=none, score=-0.6597777867822803, total=   1.8s
[CV] alpha=0.002, penalty=none .......................................
[CV]  alpha=0.002, penalty=none, score=-0.5866548677506939, total=   2.0s
[CV] alpha=0.002, penalty=none .......................................
[CV]  alpha=0.002, penalty=none, score=-0.8116720767814963, total=   2.0s
[CV] alpha=0.002, penalty=none .......................................
[CV]  alpha=0.002, penalty=none, score=-0.8500709602066103, total=   1.9s
[CV] alpha=0.002, penalty=l2 .........................................
[CV]  alpha=0.002, penalty=l2, score=-0.5331180019098414, total=   1.2s
[CV] alpha=0.002, penalty=l2 ...............................

[CV]  alpha=0.02, penalty=l1, score=-0.5971502344049845, total=   0.6s
[CV] alpha=0.02, penalty=l1 ..........................................
[CV]  alpha=0.02, penalty=l1, score=-0.6092398610572116, total=   0.7s
[CV] alpha=0.02, penalty=l1 ..........................................
[CV]  alpha=0.02, penalty=l1, score=-0.6514737094365116, total=   0.5s
[CV] alpha=0.05, penalty=none ........................................
[CV]  alpha=0.05, penalty=none, score=-0.7243991267042661, total=   2.0s
[CV] alpha=0.05, penalty=none ........................................
[CV]  alpha=0.05, penalty=none, score=-0.6597777867822803, total=   1.9s
[CV] alpha=0.05, penalty=none ........................................
[CV]  alpha=0.05, penalty=none, score=-0.5866548677506939, total=   1.9s
[CV] alpha=0.05, penalty=none ........................................
[CV]  alpha=0.05, penalty=none, score=-0.8116720767814963, total=   1.9s
[CV] alpha=0.05, penalty=none .......................................

[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.8min finished


Time to train model: 228.71s


In [20]:
with joblib.parallel_backend('dask'):
    model = gs.best_estimator_
print(gs.best_params_)
print(gs.best_score_)

{'alpha': 0.002, 'penalty': 'l2'}
-0.5368059524741735


In [21]:
with joblib.parallel_backend('dask'):
    pipe = Pipeline([('vect',vect),('model',model)])
    start = time.time()
    y_pred = pipe.predict(X_test)
end = time.time()
print('Time to generate predictions on test set: %0.2fs' % (end - start))

Time to generate predictions on test set: 0.01s


In [22]:
with joblib.parallel_backend('dask'):
    print(np.sqrt(mean_squared_log_error(np.exp(y_test)-1, np.exp(y_pred)-1)))

0.7132962188193676


In [23]:


df_test = dd.read_csv('test.tsv', sep='\t')
df_test=df_test.compute()



In [25]:


df_test['item_description'].fillna('Missing', inplace=True)
df_test['price'] = np.exp(pipe.predict((df_test['name'] + ' ' + df_test['item_description']).values))-1
df_test




Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description,price
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7,16.986330
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",12.767607
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...,18.752340
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...,14.054427
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...,14.676350
5,5,iPhone 6 Plus or 6s Plus Vodka pink case,1,"Electronics/Cell Phones & Accessories/Cases, C...",,1,One Absolut Vodka in Pink for iPhone 6 Plus an...,13.841336
6,6,Vintage Cameo Pendant & Brooch Pin,3,Women/Jewelry/Necklaces,Vintage,1,Two vintage Cameo pieces. 1. Silver metal Lock...,12.811613
7,7,Rose Gold Stainless Steel Quartz Watch,1,Women/Women's Accessories/Watches,,1,Brand new Price firm No trades Box included wi...,27.403316
8,8,Daisy Marc Jacobs 3.4oz,3,Beauty/Fragrance/Women,MARC JACOBS,0,Brand new No box 100% authentic Firm price NO ...,54.617207
9,9,Rose Brushes and Silicone Sponge,1,Beauty/Tools & Accessories/Makeup Brushes & Tools,,1,All new. 12 pcs makeup brushes and one Silicon...,13.398611
