In [2]:
# Import modules
import pandas as pd
import numpy as np
import pickle
import json

In [3]:
# Import sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder

In [4]:
# Import Classifiers

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [5]:
# Define categories and category feature columns
categories = ['beauty', 'fashion', 'mobile']

# Target attribute columns to train a classifier for, keyed by product category.
category_feature_columns = {
    'beauty': [
        'Brand',
        'Colour_group',
        'Benefits',
        'Product_texture',
        'Skin_type',
    ],
    'fashion': [
        'Collar Type',
        'Sleeves',
        'Pattern',
        'Fashion Trend',
        'Clothing Material',
    ],
    'mobile': [
        'Operating System',
        'Features',
        'Network Connections',
        'Memory RAM',
        'Brand',
        'Warranty Period',
        'Storage Capacity',
        'Color Family',
        'Phone Model',
        'Camera',
        'Phone Screen Size',
    ],
}

In [6]:

class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse feature matrix into a dense array.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only changes the container type of ``X``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Sparse inputs (anything exposing ``toarray``) are densified; inputs
        that are already dense are passed through ``np.asarray``.
        """
        # toarray() yields a plain ndarray. todense() yields np.matrix, which
        # NumPy discourages and recent scikit-learn input validation rejects.
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

In [7]:
def train_test_data_title(df, label, test_size, random_state=None):
    '''Prepare training and test data

    Keeps only the ``title`` column and the ``label`` column of ``df``, drops
    rows where either is missing, and splits the remainder.

    Parameters
    ----------
    df : DataFrame containing a ``title`` column and a ``label`` column.
    label : name of the target column.
    test_size : forwarded to ``train_test_split``.
    random_state : optional seed forwarded to ``train_test_split`` so the
        split can be reproduced; ``None`` (default) keeps the unseeded split.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    '''
    labelled = df[['title', label]].dropna()
    X_train, X_test, y_train, y_test = train_test_split(
        labelled['title'],
        labelled[label],
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [8]:
def get_title(df):
    """Select the ``title_processed`` column of ``df``."""
    title_column = df['title_processed']
    return title_column
def get_language(df):
    """Return the ``language_processed`` column of ``df`` as a categorical Series."""
    # The dtype must be given as the string 'category'; the bare name
    # `category` is not defined and raised NameError.
    return df['language_processed'].astype('category')

In [9]:
### Train the voting model
def train_test_voting_model(category, target_column, X_train, y_train, X_test=None, y_test=None):
    """Fit the grid-searched voting model, report its scores and pickle it.

    Parameters
    ----------
    category : product category name; only used in the output file name.
    target_column : target column name; only used in the output file name.
    X_train, y_train : training inputs and labels passed to ``fit``.
    X_test, y_test : optional held-out inputs and labels used for the test
        accuracy. When omitted, the notebook-level globals ``X_test`` and
        ``y_test`` are used if they exist (the previous, implicit behaviour);
        if none are available the test evaluation is skipped.

    Returns
    -------
    The fitted search object returned by ``build_voting_model()``.

    Side effects
    ------------
    Prints the scores and writes
    ``voting_clf_23032019_<category>_<target_column>.pkl`` to the working
    directory.
    """
    # Legacy call sites pass only the training split and leave X_test / y_test
    # as notebook-level globals; fall back to those so existing cells keep working.
    if X_test is None:
        X_test = globals().get('X_test')
    if y_test is None:
        y_test = globals().get('y_test')
    has_test_split = X_test is not None and y_test is not None

    voting_model = build_voting_model()
    voting_model.fit(X_train, y_train)

    # Evaluation
    best_params = voting_model.best_params_
    predicted = voting_model.predict(X_test) if has_test_split else None
    print('Best Parameters: ', best_params)
    # best_score_ is the cross-validated score of the best candidate, not the
    # accuracy on the training rows; the label is kept for output compatibility.
    print('Train Set Accuracy: ', round(voting_model.best_score_, 2))
    if has_test_split:
        print('Test Set Accuracy: ', round(np.mean(predicted == y_test), 2))
    else:
        print('Test Set Accuracy: ', 'skipped (no test split supplied)')

    with open('voting_clf_23032019_{}_{}.pkl'.format(category, target_column), 'wb') as f:
        pickle.dump(voting_model, f)

    return voting_model


In [10]:
def build_voting_model():
    """
    Build the Voting model

    Returns an unfitted ``GridSearchCV`` around the text pipeline
    ``CountVectorizer -> TfidfTransformer -> DenseTransformer -> VotingClassifier``.
    The search has one candidate (unigrams + bigrams) and scores it by
    accuracy on a single 1% shuffle-split hold-out.
    """
    LOG_clf = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', random_state = 1)
    NB_clf = GaussianNB()
    MLP_clf = MLPClassifier(alpha = 1)
    SVC_clf = SVC() # Using RBF Kernel
    ADA_clf = AdaBoostClassifier()
    SGD_clf = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, class_weight='balanced')

    # The random-forest, QDA and XGBoost members were commented out of the
    # ensemble, so they are no longer instantiated here.
    vot_clf = VotingClassifier(estimators = [
        ('LOG_clf', LOG_clf),
        ('NB_clf', NB_clf),
        ('MLP_clf', MLP_clf),
        ('SVC_clf', SVC_clf),
        ('ADA_clf', ADA_clf),
        ('SGD_clf', SGD_clf)
    ])

    # NOTE(review): the 'to_dense' step materialises the whole TF-IDF matrix in
    # memory, and it is only there because GaussianNB needs dense input. With
    # bigrams on the full training sets this is presumably what exhausts memory.
    # TODO confirm, and consider a sparse-friendly Naive Bayes instead.
    text_clf = Pipeline([
        ('vect', CountVectorizer(token_pattern='\\b\\w+\\b')),
        ('tfidf', TfidfTransformer()),
        ('to_dense', DenseTransformer()),
        ('vot_clf', vot_clf)])

    # Define parameters for grid search
    parameters = {'vect__ngram_range': [(1, 2)]}

    # Grid search across our parameters, scoring by accuracy.
    # `iid` is no longer passed: it was removed from GridSearchCV and, with a
    # single split, had no effect on the averaged score.
    gs_clf = GridSearchCV(
        estimator=text_clf,
        param_grid=parameters,
        verbose=100,
        cv=ShuffleSplit(test_size=0.01, n_splits=1),
        n_jobs=2,
        pre_dispatch='2*n_jobs',
        scoring='accuracy',
    )

    return gs_clf

In [11]:
# Read the pre-processed training set of each product category.
df_beauty_train, df_fashion_train, df_mobile_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'beauty_train_processed.csv',
        'fashion_train_processed.csv',
        'mobile_train_processed.csv',
    )
)


# Train Beauty 

In [12]:

# Add language to title.
# Vectorised concatenation replaces the row-wise apply/join: it avoids a Python
# call per row, and a missing value gives NaN (dropped later by
# train_test_data_title) instead of raising inside ' '.join.
df_beauty_train['title'] = df_beauty_train['title_processed'] + ' ' + df_beauty_train['language_processed']


for column in category_feature_columns['beauty']:
    print ("Now processing for column:", column)
    # Unpack data; X_test / y_test stay at notebook level because
    # train_test_voting_model reads them for the test-set accuracy.
    X_train,X_test,y_train,y_test = train_test_data_title(df_beauty_train, column, 0.01)
    voting_model = train_test_voting_model('beauty', column, X_train, y_train)
    
    

Now processing for column: Brand
Fitting 1 folds for each of 1 candidates, totalling 1 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8858421951/942-5104804752-18004da9a0544893b23f1dc5ffa79b77.pkl
Pickling array (shape=(235746,), dtype=object).
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8858421951/942-5104804752-cf335b9952e344ff91337cb450d89db0.pkl
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8858421951/942-5104804752-45ca11da9ac14783900d0f988d9fa010.pkl
Memmapping (shape=(235746,), dtype=float64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8858421951/942-5104804752-051897289b064318921898568024fe12.pkl
Mem

KeyboardInterrupt: 

# Train Fashion

In [20]:

# Add language to title.
# Vectorised concatenation replaces the row-wise apply/join: it avoids a Python
# call per row, and a missing value gives NaN (dropped later by
# train_test_data_title) instead of raising inside ' '.join.
df_fashion_train['title'] = df_fashion_train['title_processed'] + ' ' + df_fashion_train['language_processed']


for column in category_feature_columns['fashion']:
    print ("Now processing for column:", column)
    # Unpack data; X_test / y_test stay at notebook level because
    # train_test_voting_model reads them for the test-set accuracy.
    X_train,X_test,y_train,y_test = train_test_data_title(df_fashion_train, column, 0.01)
    voting_model = train_test_voting_model('fashion', column, X_train, y_train)

Now processing for column: Collar Type
Fitting 1 folds for each of 1 candidates, totalling 1 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
Pickling array (shape=(112501,), dtype=int64).
Pickling array (shape=(112501,), dtype=object).
Pickling array (shape=(112501,), dtype=int64).
Pickling array (shape=(112501,), dtype=int64).
Pickling array (shape=(112501,), dtype=float64).
Pickling array (shape=(112501,), dtype=int64).
Pickling array (shape=(111375,), dtype=int64).
Pickling array (shape=(1126,), dtype=int64).
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   58.1s


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker. The exit codes of the workers are {SIGKILL(-9)}

# Train Mobile

In [None]:

# Add language to title.
# Vectorised concatenation replaces the row-wise apply/join: it avoids a Python
# call per row, and a missing value gives NaN (dropped later by
# train_test_data_title) instead of raising inside ' '.join.
df_mobile_train['title'] = df_mobile_train['title_processed'] + ' ' + df_mobile_train['language_processed']


for column in category_feature_columns['mobile']:
    print ("Now processing for column:", column)
    # Unpack data; X_test / y_test stay at notebook level because
    # train_test_voting_model reads them for the test-set accuracy.
    X_train,X_test,y_train,y_test = train_test_data_title(df_mobile_train, column, 0.01)
    voting_model = train_test_voting_model('mobile', column, X_train, y_train)