In [4]:
# Import modules
import pandas as pd
import numpy as np
import pickle
import json

In [5]:
# Import sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier

In [31]:
# Define categories and category feature columns
categories = ['beauty', 'fashion', 'mobile']
category_feature_columns = {'beauty':[ 'Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type'], 
                   'fashion': ['Collar Type', 'Sleeves', 'Pattern', 'Fashion Trend', 'Clothing Material'],
                   'mobile': ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']}

In [6]:
def array_to_class_string(model, result_array):
    """This function takes output from load_and_predict and find out the class from SGD.classes_ method
    joint with space to fulfill the competition format"""
    class_string_1 = str(int(model.classes_[result_array[0]]))
    class_string_2 = str(int(model.classes_[result_array[1]]))
    result_string = ''
    result_string = result_string + class_string_1 + " " + class_string_2
    return result_string

In [7]:
def load_and_predict_beauty(df_beauty_val, n, column,X_test):
    """This is to load the pkl module and output the 2 classes"""
    # Load the model
    with open('SGD_clf_02032019_{}.pkl'.format(column), 'rb') as f:
        SGD_clf = pickle.load(f)
    # For 1 class only
    predicted = SGD_clf.predict(X_test)
    df_beauty_val[column+'1_class'] = predicted
    
    # Predict the probability for 2 classes
    result_prob = SGD_clf.predict_proba(X_test)
    result_prob_array = np.array([best_n_classes(2, i) for i in result_prob])
    result_prob_2_class = [array_to_class_string(SGD_clf, i) for i in result_prob_array]
    df_beauty_val[column] = pd.Series(result_prob_2_class)
#     for index, row in df_beauty_val.iterrows():
#         row[column] = array_to_class_string(row[column])
    
    return df_beauty_val
    
    
    

In [8]:
def load_and_predict(cat, df, n, column,X_test):
    """This is to load the pkl module and output the 2 classes"""
    # Load the model
    with open('SGD_clf_02032019_{0}_{1}.pkl'.format(cat, column), 'rb') as f:
        SGD_clf = pickle.load(f)
    # For 1 class only
    predicted = SGD_clf.predict(X_test)
    df[column+'1_class'] = predicted
    
    # Predict the probability for 2 classes
    result_prob = SGD_clf.predict_proba(X_test)
    result_prob_array = np.array([best_n_classes(2, i) for i in result_prob])
    result_prob_2_class = [array_to_class_string(SGD_clf, i) for i in result_prob_array]
    df[column] = pd.Series(result_prob_2_class)
#     for index, row in df_beauty_val.iterrows():
#         row[column] = array_to_class_string(row[column])
    
    return df

In [9]:
def best_n_classes(n, full_array):
    return np.flip(np.argpartition(full_array, -n)[-n:])

# Predict Label

This section is the normal label prediction for DEBUG without compliance to competition

In [12]:
df_beauty_train = pd.read_csv('../data/beauty_data_info_train_competition.csv')
df_fashion_train = pd.read_csv('../data/fashion_data_info_train_competition.csv')
df_mobile_train = pd.read_csv('../data/mobile_data_info_train_competition.csv')


In [9]:
def train_test_data(df, label, test_size):
    '''Prepare training and test data'''
    df = df[['title', label]]
    df = df.dropna()
    X = df['title']
    y = df[label]
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=test_size)
    return X_train,X_test,y_train,y_test

In [18]:
# Randomly sample some data
df_beauty_compare = df_beauty_train.sample(frac=0.1) 

X_compare_test = df_beauty_compare['title']
for column in beauty_feature_columns:
    print ("Now comparing for column:", column)
    with open('SGD_clf_02032019_{}.pkl'.format(column), 'rb') as f:
        SGD_clf = pickle.load(f)
    predicted = SGD_clf.predict(X_compare_test)
    df_beauty_compare[column+'_predicted'] = predicted

Now comparing for column: Brand
Now comparing for column: Colour_group
Now comparing for column: Benefits
Now comparing for column: Product_texture
Now comparing for column: Skin_type


In [19]:
def check_column(df, column):
    df_check_result = df[['title', column, column+'_predicted']]
    df_check_result.dropna(subset = [column], inplace = True)
    df_check_result = df_check_result.loc[~(df_check_result[column] == df_check_result[column+'_predicted'])]
    return df_check_result

In [20]:
check_column(df_beauty_compare, 'Brand')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,title,Brand,Brand_predicted
209737,cantik dengan haple nature oil pure rose water...,292.0,383.0
42041,bayar di tempat cq concealer menutupi noda lin...,321.0,237.0
282340,1pc pensil lip liner kosmetik makeup tahan lam...,151.0,390.0
149239,new tull jye day cream super hijau,4.0,138.0
179287,bedak tabur acne kn,30.0,26.0
234287,beauty april skin magic snow cushion black,4.0,240.0
180983,optunal white radiance spf,220.0,41.0
79112,lacoco hydrating divine essence,237.0,399.0
54631,baru set tull jye day night cream 2x20g krim w...,230.0,354.0
286015,lipstick cair warna matte,62.0,237.0


In [3]:
from NLP.util import column_class_to_text

# Competition predict

Comply to competition standard

In [126]:
# For beauty

df_beauty_compare = pd.DataFrame()

for column in beauty_feature_columns:
    
    print("Now processing for column:", column)
    df_beauty_val = load_and_predict(df_beauty_val, 2, column, X_test)

Now processing for column: Brand
Now processing for column: Colour_group
Now processing for column: Benefits
Now processing for column: Product_texture
Now processing for column: Skin_type


In [127]:
df_beauty_val

Unnamed: 0,itemid,title,image_path,Brand1_class,Brand,Colour_group1_class,Colour_group,Benefits1_class,Benefits,Product_texture1_class,Product_texture,Skin_type1_class,Skin_type
0,370855998,flormar 7 white cream bb spf 30 40ml,beauty_image/1588591395c5a254bab84042005f2a9f.jpg,208.0,208 289,6.0,6 1,22.0,22 42,8.0,8 2,4.0,4 0
1,637234604,maybelline clear smooth all in one bb cream sp...,beauty_image/920985ed9587ea20f58686ea74e20f93.jpg,282.0,282 91,6.0,6 1,24.0,24 9,8.0,8 2,1.0,1 6
2,690282890,murah innisfree eco natural green tea bb cream...,beauty_image/90b40e5710f54352b243fcfb0f5d1d7f.jpg,205.0,205 4,1.0,1 6,7.0,7 24,8.0,8 2,1.0,1 6
3,930913462,loreal white perfect day cream spf 17 pa white...,beauty_image/289c668ef3d70e1d929d602d52d5d78a.jpg,104.0,104 220,6.0,6 1,22.0,22 25,8.0,8 2,6.0,6 3
4,1039280071,hada labo cc cream ultimate anti aging spf 35 ...,beauty_image/d5b3e652c5822d2306f4560488ec30c6.jpg,4.0,4 13,6.0,6 3,9.0,9 11,8.0,8 2,7.0,7 4
5,1327710392,cathy doll cc speed white powder pact spf 40 o...,beauty_image/e1e50828d5594721a7d5d5c1ff78afbd.jpg,37.0,37 147,6.0,6 4,22.0,22 11,6.0,6 8,6.0,6 3
6,1328802799,safi white natural brightening cream 45g,beauty_image/97ec852d5afc5d82ac02b80083cf292f.jpg,222.0,222 289,1.0,1 6,22.0,22 13,8.0,8 2,6.0,6 0
7,1330468145,light beige 03 bioaqua bb cushion exquisite de...,beauty_image/8ce1a5fe546f0cc795329bad599a8d5a.jpg,199.0,199 146,3.0,3 1,11.0,11 20,7.0,7 8,1.0,1 0
8,1677309730,new produk missha m perfect bb cream share in ...,beauty_image/755fcc85c687e8cb53d2a8d43ebfe251.jpg,91.0,91 1,1.0,1 4,11.0,11 13,8.0,8 2,6.0,6 7
9,1683142205,ready laneige bb cushion anti aging spf 50 pa,beauty_image/34b56398c099505c650cf2447dc9f21f.jpg,375.0,375 147,6.0,6 1,9.0,9 11,7.0,7 8,7.0,7 1


In [129]:
# For fashion

X_test = df_fashion_val['title']

for column in fashion_feature_columns:
    print("Now processing for column:", column)
    df_fashion_val = load_and_predict('fashion', df_fashion_val, 2, column, X_test)

Now processing for column: Collar Type
Now processing for column: Sleeves
Now processing for column: Pattern
Now processing for column: Fashion Trend
Now processing for column: Clothing Material


In [131]:
df_fashion_val

Unnamed: 0,itemid,title,image_path,Collar Type1_class,Collar Type,Sleeves1_class,Sleeves,Pattern1_class,Pattern,Fashion Trend1_class,Fashion Trend,Clothing Material1_class,Clothing Material
0,381034175,fashion wanita cardigan drape terbuka lengan p...,new_fashion_image/9ba6bf09ae89c2b9069faf569e7a...,3.0,3 8,3.0,3 2,5.0,5 18,7.0,7 8,3.0,3 7
1,396355150,bayar di tempat fashion wanita fg sweater hood...,new_fashion_image/f465cd2e55352e3ed9ab49b16257...,1.0,1 6,3.0,3 2,6.0,6 7,7.0,7 4,7.0,7 18
2,592583745,bayar di tempat cardigan sweater model lengan ...,new_fashion_image/18d709b8be0d8385689bbf5e0ab2...,1.0,1 8,3.0,3 2,6.0,6 7,10.0,10 7,1.0,1 18
3,721929368,blazer lengan panjang slim warna polos dengan ...,new_fashion_image/5c861844f6e497f0a56b8e37138c...,0.0,0 8,3.0,3 2,6.0,6 7,7.0,7 10,7.0,7 18
4,800794259,sweater rajut longgar lengan panjang warna polos,new_fashion_image/13fbe2dcbaa567939acf708d6d8e...,8.0,8 1,3.0,3 2,6.0,6 7,10.0,10 7,18.0,18 1
5,959172548,blazer kerah lapel lengan panjang dengan saku ...,new_fashion_image/3ca9d6f525ae21e0a966b63fae5c...,0.0,0 6,3.0,3 2,18.0,18 6,1.0,1 10,7.0,7 18
6,962038495,jaket lace lengan panjang slim motif bunga unt...,new_fashion_image/ae812b232c3c7fdde11505cf534a...,8.0,8 1,3.0,3 2,2.0,2 6,10.0,10 1,3.0,3 7
7,1029774483,eileen elisa sexy tombol sweater wanita pullov...,new_fashion_image/b4fd3bc71c0ebd90c977100715f5...,8.0,8 1,3.0,3 2,18.0,18 16,7.0,7 10,7.0,7 18
8,1067777027,bayar di tempat kaos t shirt dengan hoodie dan...,new_fashion_image/275e82c90f97aee1912cf64a597c...,1.0,1 10,0.0,0 3,14.0,14 7,6.0,6 1,7.0,7 18
9,1270302483,sweater kaos pullover crew neck lengan panjang...,new_fashion_image/2d8de1e482ac11ba44a3f3818bf9...,3.0,3 15,3.0,3 2,18.0,18 14,6.0,6 4,7.0,7 18


In [150]:
# For mobile

X_test = df_mobile_val['title']

for column in mobile_feature_columns:
    print("Now processing for column:", column)
    df_mobile_val = load_and_predict('mobile', df_mobile_val, 2, column, X_test)

Now processing for column: Operating System
Now processing for column: Features
Now processing for column: Network Connections
Now processing for column: Memory RAM
Now processing for column: Brand
Now processing for column: Warranty Period
Now processing for column: Storage Capacity
Now processing for column: Color Family
Now processing for column: Phone Model
Now processing for column: Camera
Now processing for column: Phone Screen Size


In [132]:
df_mobile_val

Unnamed: 0,itemid,title,image_path,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,...,Features1_class,Network Connections1_class,Memory RAM1_class,Brand1_class,Warranty Period1_class,Storage Capacity1_class,Color Family1_class,Phone Model1_class,Camera1_class,Phone Screen Size1_class
0,2346660,apple iphone 4s back glass spare part original...,mobile_image/a9c8f0fdd6587deed197634066cf7eee.jpg,1 6,0 2,0 2,9 3,2 18,13 11,12 3,...,0.0,0.0,9.0,2.0,13.0,12.0,12.0,1526.0,5.0,4.0
1,2816338,iphone 4s 64gb white,mobile_image/3b9a11608551b11b9330268e0d055e01.jpg,1 6,2 0,2 0,8 9,2 18,3 13,3 16,...,2.0,2.0,8.0,2.0,3.0,3.0,12.0,1526.0,2.0,4.0
2,2847602,samsung sm b310e piton dual sim,mobile_image/1d719e936841a83c165da620f927de68.jpg,6 1,5 2,2 1,9 2,43 18,13 5,12 4,...,5.0,2.0,9.0,43.0,13.0,12.0,12.0,402.0,6.0,4.0
3,3116949,samsung caramel gt e1272 dual sim 32 mb putih,mobile_image/1d35a74d90df6cf4a02e6a5df9e9ff29.jpg,6 1,5 2,1 3,3 8,43 18,13 5,10 12,...,5.0,1.0,3.0,43.0,13.0,10.0,12.0,1480.0,1.0,4.0
4,3794648,garskin sony experia z z1 z2 ultra,mobile_image/5556577b09539a9c0db0d00e0f171e2d.jpg,6 1,0 6,0 3,6 9,38 18,13 11,12 1,...,0.0,0.0,6.0,38.0,13.0,12.0,10.0,601.0,6.0,5.0
5,4980072,lcd xiaomi redmi 4+touchscreen,mobile_image/504bbab21ede157e6e3f1b93e6b6484c.jpg,6 1,0 1,0 3,5 9,33 19,13 3,12 3,...,0.0,0.0,5.0,33.0,13.0,12.0,10.0,376.0,1.0,0.0
6,5769008,samsung caramel gt e1272 dual sim 32mb black,mobile_image/e088ca5ebb1ab5ba90a8cff8f9c4f791.jpg,6 1,5 2,1 3,3 8,43 18,13 5,10 12,...,5.0,1.0,3.0,43.0,13.0,10.0,10.0,949.0,1.0,4.0
7,9503620,iphone 4g 8gb,mobile_image/a23f0381039e5595559be27db3271d2f.jpg,1 6,0 2,0 3,3 2,2 18,13 5,12 16,...,0.0,0.0,3.0,2.0,13.0,12.0,10.0,177.0,1.0,2.0
8,17937158,blackberry torch 1 9800 gsm garansi distributo...,mobile_image/7803e0e63b5972e14b6ff564679f941c.jpg,6 1,0 5,1 2,5 6,51 2,2 13,12 3,...,0.0,1.0,5.0,51.0,2.0,12.0,12.0,1753.0,2.0,4.0
9,21715801,samsung keystone 3 sm b109e,mobile_image/a5360d928a586de4b7dc5a8463f9fc26.jpg,4 6,5 2,1 3,6 8,43 18,13 5,10 1,...,5.0,1.0,6.0,43.0,13.0,10.0,12.0,559.0,6.0,4.0


In [143]:
# Make submission file

fashion_submit_dict = {}
fashion_submit_dict['id'] = []
fashion_submit_dict['tagging'] = []

beauty_submit_dict = {}
beauty_submit_dict['id'] = []
beauty_submit_dict['tagging'] = []

mobile_submit_dict = {}
mobile_submit_dict['id'] = []
mobile_submit_dict['tagging'] = []

for column in fashion_feature_columns:
    for index, row in df_fashion_val.iterrows():
        fashion_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        fashion_submit_dict['tagging'].append(row[column])

for column in beauty_feature_columns:
    for index, row in df_beauty_val.iterrows():
        beauty_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        beauty_submit_dict['tagging'].append(row[column])

for column in mobile_feature_columns:
    for index, row in df_mobile_val.iterrows():
        mobile_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        mobile_submit_dict['tagging'].append(row[column])

In [152]:
df_fashion_submit = pd.DataFrame(fashion_submit_dict)
df_beauty_submit = pd.DataFrame(beauty_submit_dict)
df_mobile_submit = pd.DataFrame(mobile_submit_dict)

final_submit = df_fashion_submit.append([df_mobile_submit, df_beauty_submit])
final_submit.to_csv('submission_0503_1.csv', index = False)


In [2]:
pwd

'/Users/platinumanalytics/Desktop/ndsc2019/NLP'

In [21]:
df_na = pd.read_csv('submission_0503_1.csv')

In [22]:
df_na['tagging'] = np.nan

In [26]:
df_na.to_csv('testing.csv', index = False)

In [29]:
for cat in categories:
    for column in '{}_feature_columns'.format(cat):
        print (column)

b
e
a
u
t
y
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s
f
a
s
h
i
o
n
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s
m
o
b
i
l
e
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s


In [1]:
import pandas as pd