In [1]:
# Import modules
import pandas as pd
import numpy as np
import pickle
import json

In [2]:
# Import sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import TransformerMixin
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer,OneHotEncoder

In [3]:
# Import Classifiers

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [4]:
# Product categories, and for each one the target columns that the training
# cells iterate over.
categories = ['beauty', 'fashion', 'mobile']
category_feature_columns = {
    'beauty': [
        'Brand',
        'Colour_group',
        'Benefits',
        'Product_texture',
        'Skin_type',
    ],
    'fashion': [
        'Collar Type',
        'Sleeves',
        'Pattern',
        'Fashion Trend',
        'Clothing Material',
    ],
    'mobile': [
        'Operating System',
        'Features',
        'Network Connections',
        'Memory RAM',
        'Brand',
        'Warranty Period',
        'Storage Capacity',
        'Color Family',
        'Phone Model',
        'Camera',
        'Phone Screen Size',
    ],
}

In [5]:

class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense array.

    Intended to sit between a sparse feature extractor and an estimator that
    only accepts dense input.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Parameters
        ----------
        X : sparse matrix or array-like
            Input features.
        y, **fit_params
            Ignored; accepted for pipeline compatibility.
        """
        # toarray() produces a plain ndarray. The previous todense() produced
        # the deprecated np.matrix type, which newer scikit-learn estimators
        # refuse to accept.
        if hasattr(X, 'toarray'):
            return X.toarray()
        # Already-dense input is passed through instead of raising
        # AttributeError.
        return np.asarray(X)

In [6]:
def train_test_data_title(df, label, test_size, random_state=None):
    '''Prepare training and test data for one target column.

    Rows with a missing ``title_processed`` or a missing ``label`` value are
    dropped before splitting.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``title_processed`` and ``label``.
    label : str
        Name of the target column.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` so that the split can be
        reproduced. The default (``None``) keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    '''
    labelled = df[['title_processed', label]].dropna()
    X_train, X_test, y_train, y_test = train_test_split(
        labelled['title_processed'],
        labelled[label],
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [7]:
def get_title(df):
    """Return the ``title_processed`` column of ``df``."""
    title_column = 'title_processed'
    return df[title_column]
def get_language(df):
    """Return the ``language_processed`` column of ``df`` as a categorical Series."""
    # The dtype has to be given as the string 'category'; the bare name
    # `category` is not defined and raised NameError.
    return df['language_processed'].astype('category')

In [8]:
### Train the voting model
def train_test_voting_model(language, category, target_column, X_train, y_train,
                            X_test=None, y_test=None):
    """Fit the voting-model grid search, report its scores and pickle it.

    Parameters
    ----------
    language, category, target_column : str
        Only used to build the name of the pickle file.
    X_train, y_train
        Training titles and labels passed to ``fit``.
    X_test, y_test : optional
        Held-out titles and labels used for the test-accuracy report.
        When omitted, the notebook-level globals ``X_test`` / ``y_test`` are
        used (this is what the function always did implicitly); if those do
        not exist either, the test-accuracy report is skipped.

    Returns
    -------
    The fitted search object returned by ``build_voting_model()``.

    Side effects
    ------------
    Prints the scores and writes
    ``voting_clf_23032019_<category>_<target_column>_<language>.pkl`` to the
    current working directory.
    """
    # Backward compatibility: existing callers pass only the training data and
    # rely on the held-out split living in notebook scope.
    if X_test is None:
        X_test = globals().get('X_test')
    if y_test is None:
        y_test = globals().get('y_test')
    can_evaluate = X_test is not None and y_test is not None

    voting_model = build_voting_model()
    voting_model.fit(X_train, y_train)

    # Evaluation
    best_params = voting_model.best_params_
    predicted = voting_model.predict(X_test) if can_evaluate else None
    print('Best Parameters: ', best_params)
    # NOTE: best_score_ is the grid search's validation score for the best
    # candidate, not an accuracy measured on the full training set.
    print('Train Set Accuracy: ', round(voting_model.best_score_, 2))
    if can_evaluate:
        print('Test Set Accuracy: ', round(np.mean(predicted == y_test), 2))
    else:
        print('Test Set Accuracy: ', 'skipped (no held-out data available)')

    model_path = 'voting_clf_23032019_{}_{}_{}.pkl'.format(category, target_column, language)
    with open(model_path, 'wb') as f:
        pickle.dump(voting_model, f)

    return voting_model


In [9]:
def build_voting_model():
    """
    Build the (unfitted) grid search around the voting text classifier.

    The pipeline is CountVectorizer -> TfidfTransformer -> VotingClassifier,
    where the voting ensemble combines logistic regression, random forest,
    MLP, SVC, AdaBoost, XGBoost and SGD classifiers (VotingClassifier is left
    at its default voting mode).

    The grid search tries ``use_idf`` on/off with (1, 2)-grams and scores each
    candidate by accuracy on a single shuffled split that holds out 1% of the
    training data.

    Returns
    -------
    GridSearchCV
        Unfitted search object; call ``fit`` on raw title strings.
    """
    # One shared seed so repeated runs build identical models.
    seed = 1

    LOG_clf = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=seed)
    RF_clf = RandomForestClassifier(n_estimators=20, random_state=seed)
    MLP_clf = MLPClassifier(alpha=1, random_state=seed)
    SVC_clf = SVC()  # Using RBF Kernel (the SVC default)
    ADA_clf = AdaBoostClassifier(random_state=seed)
    # No explicit objective: 'softmax' is not a valid XGBoost objective name.
    # XGBClassifier chooses a binary or multi-class objective itself from the
    # labels it is fitted on.
    XGB_clf = XGBClassifier(n_estimators=20, silent=False, random_state=seed)
    SGD_clf = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,
                            class_weight='balanced', random_state=seed)

    # GaussianNB and QuadraticDiscriminantAnalysis are intentionally left out
    # of the ensemble -- presumably because they would need dense input
    # (a DenseTransformer step after 'tfidf'); confirm before re-enabling.
    vot_clf = VotingClassifier(estimators=[
        ('LOG_clf', LOG_clf),
        ('RF_clf', RF_clf),
        ('MLP_clf', MLP_clf),
        ('SVC_clf', SVC_clf),
        ('ADA_clf', ADA_clf),
        ('XGB_clf', XGB_clf),
        ('SGD_clf', SGD_clf),
    ])

    text_clf = Pipeline([
        ('vect', CountVectorizer(token_pattern='\\b\\w+\\b')),
        ('tfidf', TfidfTransformer()),
        ('vot_clf', vot_clf),
    ])

    # Define parameters for grid search
    parameters = {
        'vect__ngram_range': [(1, 2)],
        'tfidf__use_idf': (True, False),
    }

    # Grid search across our parameters, scoring by accuracy.
    # `iid` is no longer passed: the argument was removed from GridSearchCV in
    # scikit-learn 0.24, and with a single split it cannot affect the score.
    gs_clf = GridSearchCV(
        estimator=text_clf,
        param_grid=parameters,
        verbose=100,
        cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=seed),
        n_jobs=2,
        pre_dispatch='2*n_jobs',
        scoring='accuracy',
    )

    return gs_clf

In [10]:
# Load the pre-processed training set of each product category.
df_beauty_train, df_fashion_train, df_mobile_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'beauty_train_processed.csv',
        'fashion_train_processed.csv',
        'mobile_train_processed.csv',
    )
)


In [12]:
# Values of the `language_processed` column for which separate models are trained.
language_list = ['en', 'id']

# Train Beauty 

In [None]:

# Add language to title
#df_beauty_train['title'] = df_beauty_train[['title_processed', 'language_processed']].apply(lambda x: ' '.join(x), axis=1)



for language in language_list:
    # Keep only the beauty rows whose `language_processed` equals the current language.
    df_beauty_train_lan = df_beauty_train.loc[df_beauty_train['language_processed'] == language]
    for column in category_feature_columns['beauty']:
        print("Now processing for column:", column, 'with language:', language)
        # Hold out 0.1% of the labelled rows. X_test / y_test must keep these
        # names at notebook scope because the evaluation step reads them there.
        X_train, X_test, y_train, y_test = train_test_data_title(
            df_beauty_train_lan, column, test_size=0.001)
        # Train, report and pickle one model per (language, column) pair.
        voting_model = train_test_voting_model(language, 'beauty', column, X_train, y_train)
    
    

Now processing for column: Brand
Fitting 1 folds for each of 2 candidates, totalling 2 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8526237029/942-5195785160-ef9f7169db6344d68a183dc24ba33601.pkl
Pickling array (shape=(235746,), dtype=object).
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8526237029/942-5195785160-a2d8dae3d2c54961a803b2cbac86b382.pkl
Memmapping (shape=(235746,), dtype=int64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8526237029/942-5195785160-ace151c7ee0742529561c24e53053ddb.pkl
Memmapping (shape=(235746,), dtype=float64) to new file /var/folders/z3/gv1sb1m56275_y6ws_wx4v240000gn/T/joblib_memmapping_folder_942_8526237029/942-5195785160-bae258b9636d40e48ecdbca85990de37.pkl
Mem

# Train Fashion

In [None]:
for language in language_list:
    # Keep only the fashion rows whose `language_processed` equals the current language.
    df_fashion_train_lan = df_fashion_train.loc[df_fashion_train['language_processed'] == language]
    for column in category_feature_columns['fashion']:
        print("Now processing for column:", column, 'with language:', language)
        # Hold out 0.1% of the labelled rows. X_test / y_test must keep these
        # names at notebook scope because the evaluation step reads them there.
        X_train, X_test, y_train, y_test = train_test_data_title(
            df_fashion_train_lan, column, test_size=0.001)
        # Train, report and pickle one model per (language, column) pair.
        voting_model = train_test_voting_model(language, 'fashion', column, X_train, y_train)

Now processing for column: Collar Type with language: en
Fitting 1 folds for each of 2 candidates, totalling 2 fits
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=object).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=float64).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41155,), dtype=int64).
Pickling array (shape=(416,), dtype=int64).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=object).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41571,), dtype=float64).
Pickling array (shape=(41571,), dtype=int64).
Pickling array (shape=(41155,), dtype=int64).
Pickling array (shape=(416,), dtype=int64).


# Train Mobile

In [None]:
for language in language_list:
    # Keep only the mobile rows whose `language_processed` equals the current language.
    df_mobile_train_lan = df_mobile_train.loc[df_mobile_train['language_processed'] == language]
    for column in category_feature_columns['mobile']:
        print("Now processing for column:", column, 'with language:', language)
        # Hold out 0.1% of the labelled rows. X_test / y_test must keep these
        # names at notebook scope because the evaluation step reads them there.
        X_train, X_test, y_train, y_test = train_test_data_title(
            df_mobile_train_lan, column, test_size=0.001)
        # Train, report and pickle one model per (language, column) pair.
        voting_model = train_test_voting_model(language, 'mobile', column, X_train, y_train)