In [1]:
# Import modules
import pandas as pd
import numpy as np
import pickle
import json

In [2]:
# Import sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin

In [3]:
from xgboost import XGBClassifier

In [4]:
# Define categories and category feature columns
categories = ['beauty', 'fashion', 'mobile']
category_feature_columns = {'beauty':[ 'Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type'], 
                   'fashion': ['Collar Type', 'Sleeves', 'Pattern', 'Fashion Trend', 'Clothing Material'],
                   'mobile': ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']}

# Define all utility functions here

In [5]:
def train_test_data(df, label, test_size):
    '''Prepare training and test data'''
    df = df[['title', label]]
    df = df.dropna()
    X = df['title']
    y = df[label]
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=test_size)
    return X_train,X_test,y_train,y_test

# Train SGD model

In [6]:
def build_SGD_model():
    """
    Build the SGD model
    """ 

    # Build pipeline
    text_clf = Pipeline([('vect', CountVectorizer(token_pattern='\\b\\w+\\b')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, class_weight='balanced'))])


    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                'tfidf__use_idf': (True, False),
                'clf__alpha': (1e-1, 1e-5)}


    # Grid search across our parameters, scoring by accuracy
    gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=ShuffleSplit(test_size=0.01, n_splits=1), verbose = 10, iid=False, n_jobs=-1, scoring='accuracy')

    return gs_clf

In [7]:

### Train the SGD model
def train_test_SGD_model(category, column, X_train, y_train):
    SGD_model = build_SGD_model()
    SGD_model.fit(X_train, y_train)
    # Evaluation
    best_params = SGD_model.best_params_
    predicted = SGD_model.predict(X_test)
    print('Best Parameters: ', best_params)
    print('Train Set Accuracy: ', round(SGD_model.best_score_, 2))
    print('Test Set Accuracy: ', round(np.mean(predicted == y_test), 2))
    
    with open ('SGD_clf_21032019_{}_{}.pkl'.format(category, column), 'wb') as f:
        pickle.dump(SGD_model, f)
        
    return SGD_model
#     print('Train set Accuracy (cross val): ', cross_val_score(SGD_model, X_train, y_train, cv=4, scoring='accuracy'))
#     print(classification_report(y_test, predicted))
#     print(confusion_matrix(y_test, predicted))
#     with open ('SGD_clf_28022019_{}.pkl'.format(label), 'wb') as f:
#         pickle.dump(SGD_model, f)

In [19]:
mobile_feature_columns = ['Operating System']

df_mobile_train = pd.read_csv('../data/mobile_data_info_train_competition.csv')



for column in mobile_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_mobile_train, column, 0.2)
    SGD_model = train_test_SGD_model('mobile', column, X_train, y_train)


Now processing for column: Operating System
147230                                     charger iphone 5
18850                                         vivo y81 3 16
32672                              iphone x show 4g ram 2gb
49917     pc lenovo ideacentre 510 core i5 2tb hdd win10...
4482      zuk z1 snapdragon ram 3gb rom 64gb 4100mah len...
53172                                google pixel 4gb 128gb
73582                                              oppo a83
60822                               oneplus 3 6gb 64gb grey
118163                                         coolpad e502
100109                                         redmi 6a tam
137001                                     advan tablet i10
32194                                           lg l70 sale
38322              asus zenfone go zb500kg 1 8 gb 5 inch 3g
121357                                          sony z3 3gb
56060           terbaik charger iphone usb light 6 s plus 5
34900                      goyang shopee iphone 6 16gb g



ValueError: could not convert string to float: 'wa 083136007555 beli 2 bonus 1 iphone 6 plus 64gb space gray'

In [17]:
def build_SGD_model1():
    """
    Build the SGD model
    """ 

    # Build pipeline
    text_clf = Pipeline([('extract_title', FunctionTransformer(get_title)),
                         ('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, class_weight='balanced'))])


    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                'tfidf__use_idf': (True, False),
                'clf__alpha': (1e-1, 1e-5)}


    # Grid search across our parameters, scoring by accuracy
    gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, iid=False, n_jobs=-1, scoring='accuracy')

    return gs_clf

In [95]:
def get_title(df):
    return df['title']

def get_non_title(df, feature_columns):
    return df[feature_columns]

In [96]:
text_clf = Pipeline([('extract_title', FunctionTransformer((lambda x: get_non_title(x, feature_columns = ['Operating System'])), validate = False)),
                         ('vect', CountVectorizer())])

X = text_clf.fit_transform(df_mobile_train).toarray()

In [104]:
df[['title']].values

array([['apple iphone 4s back glass spare part original replacement putih'],
       ['iphone 4s 64gb white'],
       ['samsung sm b310e piton dual sim'],
       ['samsung caramel gt e1272 dual sim 32 mb putih'],
       ['garskin sony experia z z1 z2 ultra'],
       ['lcd xiaomi redmi 4+touchscreen'],
       ['samsung caramel gt e1272 dual sim 32mb black'],
       ['iphone 4g 8gb'],
       ['blackberry torch 1 9800 gsm garansi distributor 2 tahun white'],
       ['samsung keystone 3 sm b109e'],
       ['samsung galaxy j5 j 500g 8 gb hitam'],
       ['samsung galaxy j1 mini sm j105 8gb white'],
       ['iphone 5 white 16gb fullset mulus'],
       ['lenovo a 6000 se 1 16 white'],
       ['samsung galaxy j1'],
       ['keypad blackberry 9360 hitam'],
       ['xiaomi redmi 3pro 3s'],
       ['iphone 6s 64gb'],
       ['iphone 5 garansi 1 thn platinum'],
       ['iphone 5s 32gb'],
       ['iphone 4s 16gb black baru garansi 1 tahun'],
       ['nokia 230 garansi resmi 1 tahun new'],
       ['s

In [61]:
class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()

In [66]:
text_clf = Pipeline([('extract_title', FunctionTransformer(get_title, validate = False)),
                         ('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer())])
text_clf2 = Pipeline([('extract_non_title', FunctionTransformer(get_non_title, validate = False))])

union = FeatureUnion(transformer_list = [("pp1", text_clf),("pp2", text_clf2)])
text_clf.fit_transform(df_mobile_train)

<1x1 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

## Train SGD for beauty 

In [12]:
beauty_feature_columns = [ 'Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type']

df_beauty_train = pd.read_csv('../data/processed/beauty_train_processed.csv')
df_beauty_train = df_beauty_train[df_beauty_train['language']=='id']

for column in beauty_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_beauty_train, column, 0.2)
    SGD_model = train_test_SGD_model('beauty', column, X_train, y_train)

Now processing for column: Brand
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    8.3s remaining:   24.9s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:   11.1s remaining:   18.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:   11.2s remaining:   11.2s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   12.3s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   12.3s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   15.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   15.3s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.98
Test Set Accuracy:  0.97
Now processing for column: Colour_group
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.9s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.9s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    1.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.2s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.92
Test Set Accuracy:  0.88
Now processing for column: Benefits
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.7s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.8s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.9s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.3s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.4s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.6s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.88
Test Set Accuracy:  0.87
Now processing for column: Product_texture
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.3s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.4s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.5s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.1s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.1s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.2s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.96
Test Set Accuracy:  0.97
Now processing for column: Skin_type
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.4s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.87
Test Set Accuracy:  0.86


In [11]:
df_beauty_train = pd.read_csv('../data/processed/beauty_train_processed.csv')
df_beauty_train.shape

(286583, 12)

## Train SGD for mobile

In [15]:
mobile_feature_columns = ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']

df_mobile_train = pd.read_csv('../data/processed/mobile_train_processed.csv')

df_mobile_train = df_mobile_train[df_mobile_train['language']=='id']

for column in mobile_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_mobile_train, column, 0.2)
    SGD_model = train_test_SGD_model('mobile', column, X_train, y_train)

Now processing for column: Operating System
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.1s remaining:    6.4s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.1s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.2s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.4s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.5s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.99
Test Set Accuracy:  0.97
Now processing for column: Features
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.4s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.6s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.8s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.76
Test Set Accuracy:  0.74
Now processing for column: Network Connections
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1906s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.2s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.3s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.5s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.92
Test Set Accuracy:  0.9
Now processing for column: Memory RAM
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.8s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.9s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    1.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.3s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.92
Test Set Accuracy:  0.9
Now processing for column: Brand
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    4.3s remaining:   12.8s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    5.4s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    5.4s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    6.2s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    6.5s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    7.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    7.6s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.99
Test Set Accuracy:  0.98
Now processing for column: Warranty Period
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.1s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.4s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.4s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.7s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    1.7s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.7s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.93
Test Set Accuracy:  0.92
Now processing for column: Storage Capacity
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.8s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.9s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.9s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    1.2s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.3s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.96
Test Set Accuracy:  0.95
Now processing for column: Color Family
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.4s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.8s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.8s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.2s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.3s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.81
Test Set Accuracy:  0.82
Now processing for column: Phone Model
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   11.6s remaining:   34.9s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:   15.2s remaining:   25.3s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:   15.5s remaining:   15.5s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   16.2s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   17.3s remaining:    5.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   22.0s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.97
Test Set Accuracy:  0.97
Now processing for column: Camera
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.4s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.6s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.6s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.8
Test Set Accuracy:  0.68
Now processing for column: Phone Screen Size
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    0.6s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    0.6s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.7s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.75
Test Set Accuracy:  0.77


In [21]:
mobile_feature_columns = ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']

df_mobile_train = pd.read_csv('../data/mobile_data_info_train_competition.csv')

for column in mobile_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_mobile_train, column, 0.2)
    SGD_model = train_test_SGD_model('mobile', column, X_train, y_train)

Now processing for column: Operating System
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    3.7s remaining:   11.1s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.9s remaining:    6.5s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    4.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    5.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    5.0s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.1s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.98
Test Set Accuracy:  0.97
Now processing for column: Features
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.8s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.7s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.8s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    3.1s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.5s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.78
Test Set Accuracy:  0.78
Now processing for column: Network Connections
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    0.8s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.1s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    1.2s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.4s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    1.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.5s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.97
Test Set Accuracy:  0.91
Now processing for column: Memory RAM
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.9s remaining:    8.8s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.8s remaining:    6.3s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    4.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    4.8s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    4.8s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.1s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.92
Test Set Accuracy:  0.91
Now processing for column: Brand
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:   19.5s remaining:   58.6s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:   25.4s remaining:   42.4s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:   25.4s remaining:   25.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   27.7s remaining:   16.6s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   27.8s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   33.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   33.3s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.99
Test Set Accuracy:  0.99
Now processing for column: Warranty Period
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    3.1s remaining:    9.4s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    5.6s remaining:    9.3s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    5.7s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    5.8s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    5.8s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.9s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.81
Test Set Accuracy:  0.8
Now processing for column: Storage Capacity
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    2.9s remaining:    8.8s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.6s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    3.6s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    4.5s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    4.6s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.9s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.97
Test Set Accuracy:  0.95
Now processing for column: Color Family
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    7.5s remaining:   22.5s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    9.5s remaining:   15.8s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    9.9s remaining:    9.9s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:   11.3s remaining:    6.8s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   11.4s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   12.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   12.4s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.85
Test Set Accuracy:  0.83
Now processing for column: Phone Model
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  1.5min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  2.0min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:  2.0min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  2.1min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:  2.2min remaining:   43.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.7min finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.98
Test Set Accuracy:  0.96
Now processing for column: Camera
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.8s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.7s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.8s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.8s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.8s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    3.2s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.77
Test Set Accuracy:  0.7
Now processing for column: Phone Screen Size
Fitting 1 folds for each of 8 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.3s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.3s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.4s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    2.5s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    2.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.9s finished


Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.79
Test Set Accuracy:  0.78


## Train SGD for fashion

In [11]:
fashion_feature_columns = ['Collar Type', 'Sleeves', 'Pattern', 'Fashion Trend', 'Clothing Material']


df_fashion_train = pd.read_csv('../data/fashion_data_info_train_competition.csv')

for column in fashion_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_fashion_train, column, 0.2)
    SGD_model = train_test_SGD_model('fashion', column, X_train, y_train)

Now processing for column: Collar Type
Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.94
Test Set Accuracy:  0.95
Now processing for column: Sleeves
Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.99
Test Set Accuracy:  0.99
Now processing for column: Pattern
Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.93
Test Set Accuracy:  0.94
Now processing for column: Fashion Trend
Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.92
Test Set Accuracy:  0.92
Now processing for column: Clothing Material




Best Parameters:  {'clf__alpha': 1e-05, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.86
Test Set Accuracy:  0.86


# Using SGD model to predict testing dataset

In [175]:
df_fashion_val = pd.read_csv('../data/fashion_data_info_val_competition.csv')
X_test = df_fashion_val['title']

for column in fashion_feature_columns:
    with open('SGD_clf_02032019_fashion_{}.pkl'.format(column), 'rb') as f:
        SGD_clf = pickle.load(f)
    predicted = SGD_clf.predict(X_test)
    df_fashion_val[column] = predicted
    

In [92]:
df_mobile_val = pd.read_csv('../data/mobile_data_info_val_competition.csv')
X_test = df_mobile_val['title']

for column in mobile_feature_columns:
    with open('SGD_clf_02032019_mobile_{}.pkl'.format(column), 'rb') as f:
        SGD_clf = pickle.load(f)
    predicted = SGD_clf.predict(X_test)
    df_mobile_val[column] = predicted

In [176]:
df_fashion_val.to_csv('fashion_result_0303.csv')

In [100]:
### Open the json file

with open('../data/beauty_profile_train.json') as f:
    beauty_json = json.load(f)
with open('../data/fashion_profile_train.json') as f:
    fashion_json = json.load(f)
with open('../data/mobile_profile_train.json') as f:
    mobile_json = json.load(f)

In [101]:
beauty_column_map = {}
for column in beauty_feature_columns:
    beauty_column_map[column] = {v:k for k,v in beauty_json[column].items()}

fashion_column_map = {}
for column in fashion_feature_columns:
    fashion_column_map[column] = {v:k for k,v in fashion_json[column].items()}

mobile_column_map = {}
for column in mobile_feature_columns:
    mobile_column_map[column] = {v:k for k,v in mobile_json[column].items()}

In [103]:
df_beauty_val_text = df_beauty_val.copy()
for column in beauty_feature_columns:
    df_beauty_val_text.loc[:, column] = df_beauty_val_text[column].map(beauty_column_map[column])
    
df_fashion_val_text = df_fashion_val.copy()
for column in fashion_feature_columns:
    df_fashion_val_text.loc[:, column] = df_fashion_val_text[column].map(fashion_column_map[column])

df_mobile_val_text = df_mobile_val.copy()
for column in mobile_feature_columns:
    df_mobile_val_text.loc[:, column] = df_mobile_val_text[column].map(mobile_column_map[column])

In [114]:
df_fashion_val_text

Unnamed: 0,itemid,title,image_path,Collar Type,Sleeves,Pattern,Fashion Trend,Clothing Material
0,670968580,women casual o neck long sleeve lace patchwork...,fashion_image/b0ca9645c5c3527882d935f17b4c9579...,o neck,long sleeve,patchwork,sexy,lace
1,475599610,zaful long dress vintage wanita v neck lengan ...,fashion_image/c74af2636e9fe2686981decce3ca5b6c...,v neck,long sleeve,print,retro vintage,chiffon
2,1652671018,dress skater swing o neck tanpa lengan desain ...,fashion_image/1f2a2d91f5eb2d80f1dfdcf0b1f50169...,o neck,sleeveless,patchwork,retro vintage,lace
3,1705634904,ok dress slim sexy deep v neck lengan panjang ...,fashion_image/fdb6309635ebb5eac5df9c61984ffce5...,v neck,long sleeve,plain,sexy,lace
4,1789421087,noa korean lace embroidery womens crew neck lo...,fashion_image/578e5c74e97128cd25378ef0f0c021cf...,o neck,long sleeve,embroidery,korean,lace
5,1818944472,vintage women christmas lace patchwork o neck ...,fashion_image/dde2d4b3626a6924ea612d32315a5797...,o neck,sleeveless,patchwork,retro vintage,lace
6,1341150340,ready stock dress v neck tanpa lengan desain p...,fashion_image/d951fbeb548a5a5a2a307b56aa791bfa...,v neck,sleeveless,patchwork,party,lace
7,1021592916,women patchwork long sleeve lace o neck beach ...,fashion_image/4a5fe5c2019b4e466e843dbbe011ef4b...,o neck,long sleeve,patchwork,party,lace
8,1252778162,fashion wanita dress bodycon bandage pensil la...,fashion_image/aefb671d09d8ad9004c37103d0f9e4a3...,v neck,sleeveless,plain,sexy,lace
9,1309386587,fashion bohemian women lady dress chiffon chro...,fashion_image/52a94a1c04a1f8b6b30f737521d20fdc...,v neck,sleeveless,stripe,bohemian,chiffon


# Training DecisionTree (DT)

In [17]:
def build_DT_model():
    """
    Build the Decision Tree model
    """ 


    # Build pipeline
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', DecisionTreeClassifier(random_state=0))])


    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                'tfidf__use_idf': (True, False),
                 'clf__max_depth':(1,2,3,4,5),
                 }


    # Grid search across our parameters, scoring by accuracy
    gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, iid=False, n_jobs=-1, scoring='accuracy')

    return gs_clf

In [22]:
def train_test_DT_model(category, column, X_train, y_train):
    DT_model = build_DT_model()
    DT_model.fit(X_train, y_train)
    # Evaluation
    best_params = DT_model.best_params_
    predicted = DT_model.predict(X_test)
    
    # Print the score
    best_params = best_params
    print('Best Parameters: ', best_params)
    train_acc = round(DT_model.best_score_, 2)
    print('Train Set Accuracy: ', train_acc)
    test_acc = round(np.mean(predicted == y_test), 2)
    print('Test Set Accuracy: ', test_acc)
    
    DT_train['result'][column] = {}
    DT_train['result'][column]['params'] = best_params
    DT_train['result'][column]['train_acc'] = train_acc
    DT_train['result'][column]['test_acc'] = test_acc
    
    with open ('DT_clf_07032019_DT_{}_{}.pkl'.format(category, column), 'wb') as f:
        pickle.dump(DT_model, f)
        
    return DT_model

In [21]:
##### Record the json training file for DT

DT_train = {}
DT_train['date'] = '07-03-2019'
DT_train['model'] = 'DecisionTree'
DT_train['model_details'] = {'split': 0.2, 'remarks':'None'}
DT_train['result'] = {}


In [23]:
fashion_feature_columns = ['Collar Type', 'Sleeves', 'Pattern', 'Fashion Trend', 'Clothing Material']


df_fashion_train = pd.read_csv('../data/fashion_data_info_train_competition.csv')

for column in fashion_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_fashion_train, column, 0.2)
    DT_model = train_test_DT_model('fashion', column, X_train, y_train)

Now processing for column: Collar Type
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.71
Test Set Accuracy:  0.71
Now processing for column: Sleeves
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.97
Test Set Accuracy:  0.97
Now processing for column: Pattern
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.76
Test Set Accuracy:  0.76
Now processing for column: Fashion Trend
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.79
Test Set Accuracy:  0.79
Now processing for column: Clothing Material
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.63
Test Set Accuracy:  0.64


In [27]:
DT_train

{'date': '07-03-2019',
 'model': 'DecisionTree',
 'model_details': {'split': 0.2, 'remarks': 'None'},
 'result': {'Collar Type': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': True,
    'vect__ngram_range': (1, 2)},
   'train_acc': 0.71,
   'test_acc': 0.71},
  'Sleeves': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': True,
    'vect__ngram_range': (1, 1)},
   'train_acc': 0.97,
   'test_acc': 0.97},
  'Pattern': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': True,
    'vect__ngram_range': (1, 1)},
   'train_acc': 0.76,
   'test_acc': 0.76},
  'Fashion Trend': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': True,
    'vect__ngram_range': (1, 1)},
   'train_acc': 0.79,
   'test_acc': 0.79},
  'Clothing Material': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': False,
    'vect__ngram_range': (1, 1)},
   'train_acc': 0.63,
   'test_acc': 0.64},
  'Brand': {'params': {'clf__max_depth': 5,
    'tfidf__use_idf': True,
    'vect__ngram_range': (1, 1)},
 

In [26]:
beauty_feature_columns = [ 'Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type']

df_beauty_train = pd.read_csv('../data/beauty_data_info_train_competition.csv')


for column in beauty_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_beauty_train, column, 0.2)
    DT_model = train_test_DT_model('beauty', column, X_train, y_train)

Now processing for column: Brand




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.29
Test Set Accuracy:  0.29
Now processing for column: Colour_group
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.67
Test Set Accuracy:  0.68
Now processing for column: Benefits




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.55
Test Set Accuracy:  0.55
Now processing for column: Product_texture
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.88
Test Set Accuracy:  0.88
Now processing for column: Skin_type
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.48
Test Set Accuracy:  0.49


In [29]:
mobile_feature_columns = ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']

df_mobile_train = pd.read_csv('../data/mobile_data_info_train_competition.csv')



for column in mobile_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_mobile_train, column, 0.2)
    SGD_model = train_test_DT_model('mobile', column, X_train, y_train)

Now processing for column: Operating System




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.94
Test Set Accuracy:  0.94
Now processing for column: Features
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.49
Test Set Accuracy:  0.49
Now processing for column: Network Connections
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.83
Test Set Accuracy:  0.82
Now processing for column: Memory RAM
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.63
Test Set Accuracy:  0.63
Now processing for column: Brand




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.74
Test Set Accuracy:  0.74
Now processing for column: Warranty Period
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.81
Test Set Accuracy:  0.81
Now processing for column: Storage Capacity
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.86
Test Set Accuracy:  0.86
Now processing for column: Color Family




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.67
Test Set Accuracy:  0.68
Now processing for column: Phone Model




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}
Train Set Accuracy:  0.21
Test Set Accuracy:  0.21
Now processing for column: Camera




Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.32
Test Set Accuracy:  0.32
Now processing for column: Phone Screen Size
Best Parameters:  {'clf__max_depth': 5, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Train Set Accuracy:  0.4
Test Set Accuracy:  0.4


In [31]:
# Save the result
with open('DT_0703.json', 'w') as outfile:  
    json.dump(DT_train, outfile)

In [116]:
df_fashion_leak = pd.read_csv('../data/fashion_data_info_val_submission.csv')

In [118]:
df_fashion_leak.rename(columns={"Clothing Material": "Clothing Material True", "Pattern": "Pattern True",
                              "Sleeves":"Sleeves True", "Collar Type":"Collar Type True", "Fashion Trend": "Fashion Trend True" })

Unnamed: 0,Id,Clothing Material True,Pattern True,Sleeves True,Collar Type True,Fashion Trend True
0,0,lace,patchwork,long sleeve,o neck,sexy
1,1,chiffon,print,long sleeve,v neck,retro vintage
2,2,lace,patchwork,sleeveless,o neck,retro vintage
3,3,lace,plain,long sleeve,v neck,sexy
4,4,lace,embroidery,long sleeve,o neck,korean
5,5,lace,patchwork,sleeveless,o neck,retro vintage
6,6,lace,patchwork,sleeveless,v neck,party
7,7,lace,patchwork,long sleeve,o neck,party
8,8,lace,plain,sleeveless,v neck,sexy
9,9,chiffon,stripe,sleeveless,o neck,bohemian


In [141]:
df_fashion_val_text

Unnamed: 0,itemid,title,image_path,Collar Type,Sleeves,Pattern,Fashion Trend,Clothing Material
0,670968580,women casual o neck long sleeve lace patchwork...,fashion_image/b0ca9645c5c3527882d935f17b4c9579...,o neck,long sleeve,patchwork,sexy,lace
1,475599610,zaful long dress vintage wanita v neck lengan ...,fashion_image/c74af2636e9fe2686981decce3ca5b6c...,v neck,long sleeve,print,retro vintage,chiffon
2,1652671018,dress skater swing o neck tanpa lengan desain ...,fashion_image/1f2a2d91f5eb2d80f1dfdcf0b1f50169...,o neck,sleeveless,patchwork,retro vintage,lace
3,1705634904,ok dress slim sexy deep v neck lengan panjang ...,fashion_image/fdb6309635ebb5eac5df9c61984ffce5...,v neck,long sleeve,plain,sexy,lace
4,1789421087,noa korean lace embroidery womens crew neck lo...,fashion_image/578e5c74e97128cd25378ef0f0c021cf...,o neck,long sleeve,embroidery,korean,lace
5,1818944472,vintage women christmas lace patchwork o neck ...,fashion_image/dde2d4b3626a6924ea612d32315a5797...,o neck,sleeveless,patchwork,retro vintage,lace
6,1341150340,ready stock dress v neck tanpa lengan desain p...,fashion_image/d951fbeb548a5a5a2a307b56aa791bfa...,v neck,sleeveless,patchwork,party,lace
7,1021592916,women patchwork long sleeve lace o neck beach ...,fashion_image/4a5fe5c2019b4e466e843dbbe011ef4b...,o neck,long sleeve,patchwork,party,lace
8,1252778162,fashion wanita dress bodycon bandage pensil la...,fashion_image/aefb671d09d8ad9004c37103d0f9e4a3...,v neck,sleeveless,plain,sexy,lace
9,1309386587,fashion bohemian women lady dress chiffon chro...,fashion_image/52a94a1c04a1f8b6b30f737521d20fdc...,v neck,sleeveless,stripe,bohemian,chiffon


In [122]:
fashion = pd.merge(df_fashion_val_text, df_fashion_leak, left_index=True, right_index=True)

In [126]:
fashion_result = {}

for column in fashion_feature_columns:
    fashion_result[column] = fashion[['{}_x'.format(column), '{}_y'.format(column)]]
    fashion_result[column] = fashion_result[column].loc[fashion_result[column]['{}_x'.format(column)]!=fashion_result[column]['{}_y'.format(column)]]

In [139]:
fashion_result['Clothing Material']

Unnamed: 0,Clothing Material_x,Clothing Material_y
19,polyester,rayon
485,cotton,
486,rayon,
489,cotton,
490,cotton,
491,nylon,polyester
492,cotton,
493,polyester,cotton
495,polyester,
499,cotton,


In [140]:
df_fashion_leak['Fashion Trend'].isna().sum()

32287

In [None]:
#### Evaluation

# Evaluation
best_params = SGD_model.best_params_
predicted = SGD_model.predict(X_test)
print('Best Parameters: ', best_params)
print('Train Set Accuracy: ', round(SGD_model.best_score_, 2))
print('Test Set Accuracy: ', round(np.mean(predicted == y_test), 2))
print('Train set Accuracy (cross val): ', cross_val_score(SGD_model, X_train, y_train, cv=5, scoring='accuracy'))
print(classification_report(y_test, predicted))
print(confusion_matrix(y_test, predicted))

In [147]:
df_mobile_val_text

Unnamed: 0,itemid,title,image_path,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,Color Family,Phone Model,Camera,Phone Screen Size
0,1520485457,new promo iphone 5s 16gb gold ex resmi ibox,mobile_image/876d4a1fe29e056855fa6f9643757b1c.jpg,ios,fingerprint sensor,4g,16gb,apple,3 months,32gb,gold,apple iphone 5s,dua slot,3.6 to 4 inches
1,1520516704,new promo xiaomi note 5a prime 3 32gb tam,mobile_image/e2b902b7cd35cd50f061d8e2a3ba7178.jpg,android,touchscreen,2g,3gb,xiaomi,1 year,32gb,gold,xiaomi note 5a prime,16mp,5.1 to 5.5 inches
2,1520726573,big produk super murah oppo f5 4gb 32gb fullset,mobile_image/785c03cb1a21a1efedb8c1bbbdb81035.jpg,android,fingerprint sensor,4g,4gb,oppo,1 year,32gb,gold,oppo f5,16mp,less than 3.5 inches
3,1523303826,pocophone xiaomi f1 ram 6gb rom 128gb,mobile_image/08bb7a43ae0fafd568ad54db908bac0b.jpg,android,touchscreen,4g,6gb,xiaomi,1 year,128gb,black,xiaomi pocophone f1,dua slot,4.6 to 5 inches
4,1524467616,advan g2,mobile_image/970134be2e5961bda177e9761e1cf6ab.jpg,android,fingerprint sensor,4g,3gb,advan,1 month,32gb,white,oppo f9,13mp,more than 5.6 inches
5,1524876429,samsung galaxy a6,mobile_image/ce06b7eb04ff32e0ba9bd8de93b47955.jpg,android,touchscreen,4g,3gb,samsung,18 months,32gb,black,samsung galaxy a6,16mp,more than 5.6 inches
6,1525744340,promo akhir bulan oppo f7 diamond black limite...,mobile_image/9c4ed6c4eb9fe1bbfdb873f527896506.jpg,android,gps,4g,4gb,oppo,2 month,64gb,black,oppo f7,16mp,4.6 to 5 inches
7,1529179598,vivo v9 pro ram 6gb rom 64gb 6,mobile_image/8af91359e6b7f04d8099748d8fbdd200.jpg,android,waterproof,4g,6gb,vivo,1 month,64gb,red,vivo v9,13mp,more than 5.6 inches
8,1529634640,unik telephone panasonic wireless kx tg1611 bl...,mobile_image/f50c21529d6d99a7aa9bafdb56fbe3a8.jpg,android,fingerprint sensor,3g,4gb,hp,1 year,64gb,black,apple iphone 7,single camera,4.1 to 4.5 inches
9,1529634708,promo telepon wireless panasonic kx tg1611 all...,mobile_image/a3f4f39dad3ec46faf58d9f0e8d4d0f5.jpg,android,fingerprint sensor,3g,4gb,hp,1 year,32gb,white,samsung galaxy note 8,single camera,4.1 to 4.5 inches


In [145]:
beauty_json['Brand']

{'selection': 0,
 'bio hair': 124,
 'katy perry': 2,
 'cw 666': 3,
 'random': 4,
 'wet n wild': 5,
 'chanel': 212,
 'pomparians': 6,
 'xl professionnel': 36,
 'herborist': 9,
 'cantiqa kemiri': 10,
 'catok haidi': 11,
 'dettol': 12,
 'hadalabo': 13,
 'pepsodent': 14,
 'pratista': 103,
 'dolce gabbana': 368,
 'evangeline': 16,
 'cocco': 17,
 'scarlett': 18,
 'colourpop': 19,
 'little baby': 20,
 'avoskin': 21,
 'moodmatcher': 22,
 'sasha': 23,
 'davidoff': 24,
 'makarizo viola': 25,
 'miniso': 108,
 'kemei': 28,
 'bebold': 140,
 'erto s': 30,
 'rudy hadisuwarno': 31,
 'kirkland signature': 32,
 'haidi': 33,
 'marc jacobs': 34,
 'jo malone': 35,
 'ellips': 7,
 'l oreal paris': 104,
 'real techniques': 38,
 'musk by lilian ashley': 39,
 'sonar': 40,
 'olay': 41,
 'ql': 42,
 'zwitsbaby': 271,
 'natur': 45,
 'satto': 46,
 'revlon': 47,
 'emina': 48,
 'envygreen': 49,
 'peripera': 50,
 'natrol': 51,
 'champagne': 63,
 'glamglow': 52,
 'elise': 53,
 'paris hilton': 376,
 'la rive': 54,
 'chri

In [None]:
df_beauty_train = pd.read_csv('./data/beauty_data_info_train_competition.csv')


# Train Random Forest

In [149]:
def train_RF_model():
    """
    Random forest model
    """
    X_train, X_test, y_train, y_test = train_test_arrays # Unpack data


    # Preprocess each sentence in the train and test sets
    X_train, X_test = np.array([preprocessor(x) for x in X_train]), np.array([preprocessor(x) for x in X_test])

    # Build pipeline
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', RandomForestClassifier(n_estimators=750))])


    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                'tfidf__use_idf': (True, False),}


    # Grid search across our parameters, scoring by accuracy
    gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, iid=False, n_jobs=-1, scoring='accuracy')
    gs_clf.fit(X_train, y_train)

    return gs_clf, X_test

In [237]:
df_sample

Unnamed: 0,id,tagging
0,370855998_Benefits,3 2
1,370855998_Brand,246 98
2,370855998_Colour_group,29 26
3,370855998_Product_texture,3 4
4,370855998_Skin_type,7 7
5,637234604_Benefits,4 2
6,637234604_Brand,338 248
7,637234604_Colour_group,42 15
8,637234604_Product_texture,7 4
9,637234604_Skin_type,5 0


# Train XGBoost

In [178]:
df_fashion_submit = pd.DataFrame(fashion_submit_dict)

In [10]:
for column in beauty_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_beauty_train, column, 0.2)
    X_train = X_train.as_matrix()
    y_train = y_train.as_matrix()
    print (X_train.shape)
    model = XGBClassifier()
    model.fit(X_train[:,np.newaxis],y_train)
    

Now processing for column: Brand
(190502,)


  """
  


ValueError: could not convert string to float: 'nars light reflecting translucent powder'

In [21]:
def build_XGB_model():
    """
    Random XGBoost model
    """
    
    # Build pipeline
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('xgb', XGBClassifier(n_estimators=2, silent = False, n_jobs = -1,
                                               objective = 'softmax'))])


    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                'tfidf__use_idf': (True, False),
                 'xgb__learning_rate': [0.01, 0.1]}
    


    # Grid search across our parameters, scoring by accuracy
    gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, cv=5, iid=False, n_jobs=-1, scoring='accuracy')
    gs_clf.fit(X_train, y_train)

    return gs_clf

In [24]:
# beauty_feature_columns = [ 'Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type']

beauty_feature_columns = ['Brand']
df_beauty_train = pd.read_csv('../data/beauty_data_info_train_competition.csv')

# df_beauty_train = df_beauty_train.head(n=1500)
for column in beauty_feature_columns:
    print ("Now processing for column:", column)
    # Unpack data
    X_train,X_test,y_train,y_test = train_test_data(df_beauty_train, column, 0.2)
    XGB_model = train_test_XGB_model('beauty', column, X_train, y_train)

Now processing for column: Brand




KeyboardInterrupt: 

In [12]:
def train_test_XGB_model(category, column, X_train, y_train):
    XGB_model = build_XGB_model()
    XGB_model.fit(X_train, y_train)
    # Evaluation
    best_params = XGB_model.best_params_
    predicted = XGB_model.predict(X_test)
    
    # Print the score
    best_params = best_params
    print('Best Parameters: ', best_params)
    train_acc = round(XGB_model.best_score_, 2)
    print('Train Set Accuracy: ', train_acc)
    test_acc = round(np.mean(predicted == y_test), 2)
    print('Test Set Accuracy: ', test_acc)
    
    with open ('XGB_clf_09032019_DT_{}_{}.pkl'.format(category, column), 'wb') as f:
        pickle.dump(XGB_model, f)
        
    return XGB_model

In [162]:
df_beauty_submit = pd.DataFrame(beauty_submit_dict)

In [210]:
df_fashion_submit['tagging'].dtypes

dtype('float64')

In [224]:
final_submit = df_fashion_submit.append([df_mobile_submit, df_beauty_submit])

In [240]:
final_submit.iloc[1]['tagging']

'1.0 1'

In [250]:
for index, row in final_submit.iterrows():
    row['tagging'] = row['tagging']+" 1"

In [251]:
final_submit['tagging']

0          3 1
1          1 1
2          1 1
3          0 1
4          8 1
5          0 1
6          8 1
7          8 1
8          1 1
9          3 1
10         0 1
11         1 1
12        13 1
13         8 1
14         8 1
15         3 1
16         8 1
17        13 1
18         1 1
19         1 1
20         8 1
21         3 1
22         1 1
23         1 1
24         8 1
25         8 1
26         1 1
27         3 1
28         8 1
29         8 1
          ... 
382695     0 1
382696     2 1
382697     0 1
382698     0 1
382699     0 1
382700     0 1
382701     0 1
382702     0 1
382703     0 1
382704     0 1
382705     0 1
382706     0 1
382707     0 1
382708     0 1
382709     2 1
382710     0 1
382711     3 1
382712     6 1
382713     0 1
382714     2 1
382715     0 1
382716     0 1
382717     4 1
382718     0 1
382719     0 1
382720     0 1
382721     0 1
382722     0 1
382723     0 1
382724     0 1
Name: tagging, Length: 1174802, dtype: object

str

In [33]:
df_1 = df_mobile_train.head(n=5)
df_1.to_csv('mobile_sample.csv')

# TO-DO

In [None]:
# Check invalid RAM