In [1]:
# Import modules
import pandas as pd
import numpy as np
import pickle
import json

In [2]:
# Import sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier

In [3]:
from NLP.util import category_json, category_feature_columns
from NLP.util import train_test_data
from NLP.util import column_class_to_text, column_text_to_class, column_class_to_text_debug

In [4]:
# Pandas printing setting
desired_width=320
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns',10)
pd.set_option('display.max_colwidth', 0)

In [5]:
def array_to_class_list(model, result_array):
    """Return a tuple of class_1,class_2"""
    class_string_1 = str(int(model.classes_[result_array[0]]))
    class_string_2 = str(int(model.classes_[result_array[1]]))
    return class_string_1,class_string_2

In [6]:
def array_to_class_string(model, result_array):
    """This function takes output from load_and_predict and find out the class from SGD.classes_ method
    joint with space to fulfill the competition format"""
    class_string_1 = str(int(model.classes_[result_array[0]]))
    class_string_2 = str(int(model.classes_[result_array[1]]))
    result_string = ''
    result_string = result_string + class_string_1 + " " + class_string_2
    return result_string

In [7]:
def load_and_predict_beauty(df_beauty_val, n, column,X_test):
    """This is to load the pkl module and output the 2 classes"""
    # Load the model
    with open('SGD_clf_02032019_{}.pkl'.format(column), 'rb') as f:
        SGD_clf = pickle.load(f)
    # For 1 class only
    predicted = SGD_clf.predict(X_test)
    df_beauty_val[column+'1_class'] = predicted
    
    # Predict the probability for 2 classes
    result_prob = SGD_clf.predict_proba(X_test)
    result_prob_array = np.array([best_n_classes(2, i) for i in result_prob])
    result_prob_2_class = [array_to_class_string(SGD_clf, i) for i in result_prob_array]
    df_beauty_val[column] = pd.Series(result_prob_2_class)
#     for index, row in df_beauty_val.iterrows():
#         row[column] = array_to_class_string(row[column])
    
    return df_beauty_val
    
    
    

In [8]:
def load_and_predict(cat, df, n, column,X_test):
    """This is to load the pkl module and output the 2 classes"""
    # Load the model
    with open('./NLP/SGD_clf_21032019_{0}_{1}.pkl'.format(cat, column), 'rb') as f:
        SGD_clf = pickle.load(f)
    # For 1 class only
    predicted = SGD_clf.predict(X_test)
    df[column+'1_class'] = predicted
    
    # Predict the probability for 2 classes
    result_prob = SGD_clf.predict_proba(X_test)
    result_prob_array = np.array([best_n_classes(2, i) for i in result_prob])
    result_prob_2_class = [array_to_class_string(SGD_clf, i) for i in result_prob_array]
    df[column] = pd.Series(result_prob_2_class)
#     for index, row in df_beauty_val.iterrows():
#         row[column] = array_to_class_string(row[column])
    
    return df

In [9]:
def load_and_predict_debug(cat, df, n, column,X_test):
    """This is to load the pkl module and output the 2 classes
    This function is used for debug mode
    """
    # Load the model
    with open('./NLP/SGD_clf_21032019_{0}_{1}.pkl'.format(cat, column), 'rb') as f:
        SGD_clf = pickle.load(f)
    
    # Predict the probability for 2 classes
    result_prob = SGD_clf.predict_proba(X_test)
    result_prob_2_array = np.array([best_n_prob(2, i) for i in result_prob])
    result_class = np.array([best_n_classes(2, i) for i in result_prob])
    result_class_2_array = [array_to_class_list(SGD_clf, i) for i in result_class]
    
    class_1 = np.array([item[0] for item in result_class_2_array])
    class_2 = np.array([item[1] for item in result_class_2_array])
    
    prob_1 = np.array([item[0] for item in result_prob_2_array])
    prob_2 = np.array([item[1] for item in result_prob_2_array])
    
    df[column+'_predicted_1'] = class_1
    df[column+'_predicted_1_prob'] = prob_1
    df[column+'_predicted_2'] = class_2
    df[column+'_predicted_2_prob'] = prob_2
#     for index, row in df_beauty_val.iterrows():
#         row[column] = array_to_class_string(row[column])
    
    return df

In [10]:
def best_n_classes(n, full_array):
    return np.flip(np.argpartition(full_array, -n)[-n:])

In [11]:
def best_n_prob(n, full_array):
    return np.flip(full_array[np.argpartition(full_array, -n)[-n:]])

In [12]:
def check_column(df, column):
    df_check_result = df[['title', column, column+'_predicted_1']]
    df_check_result.dropna(subset = [column], inplace = True)
    df_check_result = df_check_result.loc[~(df_check_result[column] == df_check_result[column+'_predicted_1'])]
    return df_check_result

In [13]:
def check_column_debug(df, column):
    df_check_result = df[['title', column, column+'_predicted_1', column+'_predicted_1_prob', column+'_predicted_2', column+'_predicted_2_prob']]
    df_check_result.dropna(subset = [column], inplace = True)
    df_check_result[column] = df_check_result[column].astype(int).astype(str)
    df_check_result = df_check_result.loc[~(df_check_result[column] == df_check_result[column+'_predicted_1'])]
    return df_check_result

# Predict Label

This section is the normal label prediction for DEBUG without compliance to competition

In [14]:
df_beauty_competition = pd.read_csv('./data/beauty_data_info_val_competition.csv')
df_fashion_competition = pd.read_csv('./data/fashion_data_info_val_competition 2.csv')
df_mobile_competition = pd.read_csv('./data/mobile_data_info_val_competition.csv')


In [15]:
df_beauty_debug = pd.read_csv('./data/beauty_data_info_train_competition.csv')
df_fashion_debug = pd.read_csv('./data/fashion_data_info_train_competition_new.csv')
df_mobile_debug = pd.read_csv('./data/mobile_data_info_train_competition.csv')

In [16]:
df_fashion_compare = df_fashion_debug.sample(frac=0.1)

X_compare_test = df_fashion_compare['title']
for column in category_feature_columns['fashion']:
    print ("Now comparing for column:", column)
    df_fashion_compare = load_and_predict_debug('fashion', df_fashion_compare, 2, column,X_compare_test)

Now comparing for column: Collar Type
Now comparing for column: Sleeves
Now comparing for column: Pattern
Now comparing for column: Fashion Trend
Now comparing for column: Clothing Material


In [49]:
df_pattern_check = check_column_debug(df_fashion_compare, 'Pattern')
# df_pattern_check = column_class_to_text_debug(df_pattern_check, category='fashion', column = 'Pattern')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [50]:
def column_class_to_text_debug(df, category, column):
    """This function is to convert the dataframe with only one single feature column into text
    This is used in the debugging mode
    """
    map_json = category_json[category]
    column_map = {}
    column_map[column] = {v: k for k, v in map_json[column].items()}
    df[column] = df[column].astype(int)
    df[column+'_predicted_1'] = df[column+'_predicted_1'].astype(int)
    df[column+'_predicted_2'] = df[column+'_predicted_2'].astype(int)
    df.loc[:, column] = df[column].map(column_map[column])
    df.loc[:, column+'_predicted_1'] = df[column+'_predicted_1'].map(column_map[column])
    df.loc[:, column + '_predicted_2'] = df[column + '_predicted_2'].map(column_map[column])
    return df

In [68]:
def check_result(df, category, column):
    df_check = check_column_debug(df, column)
    df_check = column_class_to_text_debug(df_check, category=category, column = column)
    return df_check

In [60]:
df_collar_check = check_result('Collar Type')
df_pattern_check = check_result('Pattern')
df_sleeves_check = check_result('Sleeves')
df_fashion_check = check_result('Fashion Trend')
df_clothing_check = check_result('Clothing Material')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [61]:
df_clothing_check

Unnamed: 0,title,Clothing Material,Clothing Material_predicted_1,Clothing Material_predicted_1_prob,Clothing Material_predicted_2,Clothing Material_predicted_2_prob
270613,import atasan kemeja wanita lengan 3 4 putih hitam garis stripe salur,polyester,cotton,0.644074,rayon,0.355103
49168,blus lengan panjang casual warna polos,cotton,denim,0.328076,polyester,0.315149
171478,sm long dress casual v neck tanpa lengan tali spaghetti warna polos untuk wanita musim panas,cotton,polyester,0.598429,cotton,0.333044
42337,kemeja wanita casual lengan panjang model longgar motif print garis untuk musim gugur,polyester,cotton,0.509926,polyester,0.489679
107134,dress o neck lengan pendek sexy casual motif print,cotton,polyester,0.565620,cotton,0.222376
32321,kaos t shirt o neck lengan pendek untuk musim panas,cotton,rayon,0.372534,polyester,0.350602
248217,blus casual wanita model batwing dengan kerah crew neck dan potongan longgar warna polos,cotton,polyester,0.411168,denim,0.316252
148801,print tied v back sleeveless sheath dress,chiffon,polyester,0.788914,rayon,0.142541
217479,blus wanita model off shoulder lengan panjang dengan potongan slim dan gambar motif ular,cotton,polyester,0.409284,cotton,0.379574
137332,dress evelyn baju pesta mini murah lengan panjang pita sale ll,lace,cotton,0.346899,rayon,0.258794


In [75]:
# Randomly sample some data

df_beauty_compare = df_beauty_debug.sample(frac=0.1) 

X_compare_test = df_beauty_compare['title']
for column in category_feature_columns['beauty']:
    print ("Now comparing for column:", column)
    df_beauty_compare = load_and_predict_debug('beauty', df_beauty_compare, 2, column,X_compare_test)

# df_beauty_compare = df_beauty_train.sample(frac=0.1) 

# X_compare_test = df_beauty_compare['title']
# for column in category_feature_columns['beauty']:
#     print ("Now comparing for column:", column)
#     with open('./NLP/model/SGD_clf_21032019_{}.pkl'.format(column), 'rb') as f:
#         SGD_clf = pickle.load(f)
#     predicted = SGD_clf.predict(X_compare_test)
#     df_beauty_compare[column+'_predicted'] = predicted

Now comparing for column: Brand
Now comparing for column: Colour_group
Now comparing for column: Benefits
Now comparing for column: Product_texture
Now comparing for column: Skin_type


In [76]:
df_beauty_competition

Unnamed: 0,itemid,title,image_path
0,370855998,flormar 7 white cream bb spf 30 40ml,beauty_image/1588591395c5a254bab84042005f2a9f.jpg
1,637234604,maybelline clear smooth all in one bb cream spf 21,beauty_image/920985ed9587ea20f58686ea74e20f93.jpg
2,690282890,murah innisfree eco natural green tea bb cream spf25 pa eksklusif,beauty_image/90b40e5710f54352b243fcfb0f5d1d7f.jpg
3,930913462,loreal white perfect day cream spf 17 pa whitening+even tone murah,beauty_image/289c668ef3d70e1d929d602d52d5d78a.jpg
4,1039280071,hada labo cc cream ultimate anti aging spf 35 pa 25gr pilih warna,beauty_image/d5b3e652c5822d2306f4560488ec30c6.jpg
5,1327710392,cathy doll cc speed white powder pact spf 40 original 100,beauty_image/e1e50828d5594721a7d5d5c1ff78afbd.jpg
6,1328802799,safi white natural brightening cream 45g,beauty_image/97ec852d5afc5d82ac02b80083cf292f.jpg
7,1330468145,light beige 03 bioaqua bb cushion exquisite delicate plus refill,beauty_image/8ce1a5fe546f0cc795329bad599a8d5a.jpg
8,1677309730,new produk missha m perfect bb cream share in jar 5 gr natural beige no.23 barang terjamin,beauty_image/755fcc85c687e8cb53d2a8d43ebfe251.jpg
9,1683142205,ready laneige bb cushion anti aging spf 50 pa,beauty_image/34b56398c099505c650cf2447dc9f21f.jpg


In [77]:
df_brand_check = check_result(df_beauty_compare, 'beauty','Brand')
df_colour_check = check_result(df_beauty_compare,'beauty','Colour_group')
df_benefits_check = check_result(df_beauty_compare,'beauty','Benefits')
df_product_check = check_result(df_beauty_compare,'beauty','Product_texture')
df_skin_check = check_result(df_beauty_compare,'beauty','Skin_type')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [80]:
df_colour_check

Unnamed: 0,title,Colour_group,Colour_group_predicted_1,Colour_group_predicted_1_prob,Colour_group_predicted_2,Colour_group_predicted_2_prob
66583,wardah instaperfect bb cushion 13 beige,emas,8 color,0.452947,1 warna,0.334817
155433,wardah white secret night cream,1 warna,9 color,0.833938,1 warna,0.166062
68555,revlon new complexion two way foundation medium beige,maroon,emas,0.347076,maroon,0.346553
99554,bedak ertos,maroon,emas,0.583653,maroon,0.416347
182915,etude house zero sebum drying powder,maroon,1 warna,0.896801,maroon,0.052956
48081,viva compact powder kuning langsat,emas,1 warna,0.665024,emas,0.298546
192179,innisfree water fit cushion with case preloved,emas,8 color,0.510781,emas,0.405857
242738,etude house baking powder crunch pore scrub,emas,1 warna,0.507306,emas,0.492694
82702,etude house precious mineral bb cream,1 warna,8 color,0.573909,1 warna,0.364513
112246,innisfree no sebum mineral powder,emas,1 warna,0.420462,warna merah cabai,0.335202


In [25]:
df_brand_check

Unnamed: 0,title,Brand,Brand_predicted
104,krim kecantikan untuk muka mata bentuk cair mencerahkan dengan kombinasi kontur xiu yan,etude house,benefit
144349,ready elf e.l.f cosmetics blush palette light powder 0.56 oz 16 gr g 16gr,e.l.f,random
263349,lipstick keren tahan lama dengan pelembab,benefit,selection
152498,nortshow lipgloss cair warna matte tahan lama,the balm,benefit
221184,acne glowing htcdh cream malam,theraskin,random
273149,bayar di tempat lip liquid lipstick matte pensil tahan lama anti air makeup kacantikan,the balm,benefit
162574,precious flower bb cream spf 30 pa,benefit,crystal
73418,erha acne face powder,wardah,erto s
120627,animal face mask deep moisturizing sheet oil control brighten skin for woman panda tiger,bioaqua,ultra jaya
42310,promo smashbox hydrating under eye primer 5ml mini size,benefit,ultra jaya


In [17]:
def column_class_to_text(df, category, column):
    """This function is to convert the dataframe with only one single feature column into text
    This is used in the debugging mode
    """
    print("This function is god damn called")
    map_json = category_json[category]
    column_map = {}
    column_map[column] = {v: k for k, v in map_json[column].items()}
    df.loc[:, column] = df[column].map(column_map[column])
    df.loc[:, column+'_predicted'] = df[column+'_predicted'].map(column_map[column])
    return df

# Competition predict

Comply to competition standard

In [26]:
# For beauty
beauty_feature_columns = ['Brand', 'Colour_group', 'Benefits', 'Product_texture', 'Skin_type']

#df_beauty_compare = pd.DataFrame()
X_test = df_beauty_competition['title']

for column in beauty_feature_columns:
    
    print("Now processing for column:", column)
    df_beauty_val = load_and_predict('beauty',df_beauty_competition, 2, column, X_test)

Now processing for column: Brand
Now processing for column: Colour_group
Now processing for column: Benefits
Now processing for column: Product_texture
Now processing for column: Skin_type


In [30]:
def get_title(df):
    return df['title']

def get_img_column(df):
    return df.loc[:, df.columns.str.contains('^img')]

def get_non_title(df):
    '''Select all other feature columns'''
    return df.loc[:, ~df.columns.str.contains('^img')].drop(columns = ['title'])

In [50]:
# For fashion

fashion_feature_columns = ['Collar Type', 'Sleeves', 'Pattern', 'Fashion Trend', 'Clothing Material']

X_test = df_fashion_competition['title']

for column in fashion_feature_columns:
    print("Now processing for column:", column)
    df_fashion_val = load_and_predict('fashion', df_fashion_competition, 2, column, X_test)

Now processing for column: Collar Type
Now processing for column: Sleeves
Now processing for column: Pattern
Now processing for column: Fashion Trend
Now processing for column: Clothing Material


In [35]:
df_fashion_competition

Unnamed: 0,itemid,title,Pattern,Collar Type,Sleeves,Fashion Trend,Clothing Material,image_path
0,2282553,retro floral dress,2.0,9999.0,9999.0,3.0,9999.0,fashion_image/78d17fdb159bba51a4250dc3d583245e
1,13822218,dress floral sifon,2.0,9999.0,9999.0,9999.0,4.0,fashion_image/2f77dac9965bbfdb03cbd3724b3552c5
2,33555935,korean white chiffon collar dress,9999.0,13.0,9999.0,10.0,4.0,fashion_image/6dbe2e7cba5ddbb750d2144d8f248f11
3,65755120,women s trendy apricot o neck solid chiffon blouse lbed,9999.0,3.0,9999.0,9999.0,4.0,fashion_image/dc9b21429604148fc0342d12694f3294
4,65857438,big sale baju gamis pesta india aysilla pancar maxidress full bordir jersey real pict,9999.0,9999.0,9999.0,6.0,17.0,fashion_image/6c25c578dd8edce742a805f891f1a51f
5,69593037,bayar di tempat bodycon dress tanpa lengan dengan hiasan renda untuk wanita,9999.0,9999.0,0.0,9999.0,3.0,fashion_image/17d11c0e9d7f915dc8ca9b443a336701
6,79488636,gaun mini pesta malam patchwork tanpa punggung lengan panjang leher v ramping seksi fashion wanita,5.0,8.0,3.0,6.0,9999.0,fashion_image/e25977681e7ebfa815efef8087ea8fd7
7,81622997,gaun pesta kasual wanita bergaya tanpa lengan baju siang kotak bodycon paket hip,1.0,3.0,0.0,6.0,7.0,fashion_image/492f80d9bdfd1e412aab0873cf172cec
8,81743165,gaun mini diatas lutut wanita bergaya leher o lengan pendek,9999.0,3.0,2.0,9999.0,7.0,fashion_image/d74f56b21601a41710d72f887d6439b4
9,82512933,gaun pesta cardi velvet emboss payet,9999.0,9999.0,9999.0,6.0,2.0,fashion_image/ee05d8051e800cc4845b5cbfae86474e


In [23]:
# For mobile

mobile_feature_columns = ['Operating System', 'Features',
       'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period',
       'Storage Capacity', 'Color Family', 'Phone Model', 'Camera',
       'Phone Screen Size']

X_test = df_mobile_competition['title']

for column in mobile_feature_columns:
    print("Now processing for column:", column)
    df_mobile_val = load_and_predict('mobile', df_mobile_competition, 2, column, X_test)

Now processing for column: Operating System
Now processing for column: Features
Now processing for column: Network Connections
Now processing for column: Memory RAM
Now processing for column: Brand
Now processing for column: Warranty Period
Now processing for column: Storage Capacity
Now processing for column: Color Family
Now processing for column: Phone Model
Now processing for column: Camera
Now processing for column: Phone Screen Size


In [132]:
df_mobile_val

Unnamed: 0,itemid,title,image_path,Operating System,Features,Network Connections,Memory RAM,Brand,Warranty Period,Storage Capacity,...,Features1_class,Network Connections1_class,Memory RAM1_class,Brand1_class,Warranty Period1_class,Storage Capacity1_class,Color Family1_class,Phone Model1_class,Camera1_class,Phone Screen Size1_class
0,2346660,apple iphone 4s back glass spare part original...,mobile_image/a9c8f0fdd6587deed197634066cf7eee.jpg,1 6,0 2,0 2,9 3,2 18,13 11,12 3,...,0.0,0.0,9.0,2.0,13.0,12.0,12.0,1526.0,5.0,4.0
1,2816338,iphone 4s 64gb white,mobile_image/3b9a11608551b11b9330268e0d055e01.jpg,1 6,2 0,2 0,8 9,2 18,3 13,3 16,...,2.0,2.0,8.0,2.0,3.0,3.0,12.0,1526.0,2.0,4.0
2,2847602,samsung sm b310e piton dual sim,mobile_image/1d719e936841a83c165da620f927de68.jpg,6 1,5 2,2 1,9 2,43 18,13 5,12 4,...,5.0,2.0,9.0,43.0,13.0,12.0,12.0,402.0,6.0,4.0
3,3116949,samsung caramel gt e1272 dual sim 32 mb putih,mobile_image/1d35a74d90df6cf4a02e6a5df9e9ff29.jpg,6 1,5 2,1 3,3 8,43 18,13 5,10 12,...,5.0,1.0,3.0,43.0,13.0,10.0,12.0,1480.0,1.0,4.0
4,3794648,garskin sony experia z z1 z2 ultra,mobile_image/5556577b09539a9c0db0d00e0f171e2d.jpg,6 1,0 6,0 3,6 9,38 18,13 11,12 1,...,0.0,0.0,6.0,38.0,13.0,12.0,10.0,601.0,6.0,5.0
5,4980072,lcd xiaomi redmi 4+touchscreen,mobile_image/504bbab21ede157e6e3f1b93e6b6484c.jpg,6 1,0 1,0 3,5 9,33 19,13 3,12 3,...,0.0,0.0,5.0,33.0,13.0,12.0,10.0,376.0,1.0,0.0
6,5769008,samsung caramel gt e1272 dual sim 32mb black,mobile_image/e088ca5ebb1ab5ba90a8cff8f9c4f791.jpg,6 1,5 2,1 3,3 8,43 18,13 5,10 12,...,5.0,1.0,3.0,43.0,13.0,10.0,10.0,949.0,1.0,4.0
7,9503620,iphone 4g 8gb,mobile_image/a23f0381039e5595559be27db3271d2f.jpg,1 6,0 2,0 3,3 2,2 18,13 5,12 16,...,0.0,0.0,3.0,2.0,13.0,12.0,10.0,177.0,1.0,2.0
8,17937158,blackberry torch 1 9800 gsm garansi distributo...,mobile_image/7803e0e63b5972e14b6ff564679f941c.jpg,6 1,0 5,1 2,5 6,51 2,2 13,12 3,...,0.0,1.0,5.0,51.0,2.0,12.0,12.0,1753.0,2.0,4.0
9,21715801,samsung keystone 3 sm b109e,mobile_image/a5360d928a586de4b7dc5a8463f9fc26.jpg,4 6,5 2,1 3,6 8,43 18,13 5,10 1,...,5.0,1.0,6.0,43.0,13.0,10.0,12.0,559.0,6.0,4.0


In [51]:
# Make submission file

fashion_submit_dict = {}
fashion_submit_dict['id'] = []
fashion_submit_dict['tagging'] = []

beauty_submit_dict = {}
beauty_submit_dict['id'] = []
beauty_submit_dict['tagging'] = []

mobile_submit_dict = {}
mobile_submit_dict['id'] = []
mobile_submit_dict['tagging'] = []

for column in fashion_feature_columns:
    for index, row in df_fashion_val.iterrows():
        fashion_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        fashion_submit_dict['tagging'].append(row[column])

for column in beauty_feature_columns:
    for index, row in df_beauty_val.iterrows():
        beauty_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        beauty_submit_dict['tagging'].append(row[column])

for column in mobile_feature_columns:
    for index, row in df_mobile_val.iterrows():
        mobile_submit_dict['id'].append(str(row['itemid']) + "_" + column)
        mobile_submit_dict['tagging'].append(row[column])

In [52]:
df_fashion_submit = pd.DataFrame(fashion_submit_dict)
df_beauty_submit = pd.DataFrame(beauty_submit_dict)
df_mobile_submit = pd.DataFrame(mobile_submit_dict)

final_submit = df_fashion_submit.append([df_mobile_submit, df_beauty_submit])
final_submit.to_csv('submission_2103_1.csv', index = False)


In [41]:
final_submit.to_csv('./submission_2103.csv', index = False)

In [48]:
df_f 


Unnamed: 0,id,tagging
0,381034175_Collar Type,3 8
1,396355150_Collar Type,1 8
2,592583745_Collar Type,1 4
3,721929368_Collar Type,0 3
4,800794259_Collar Type,3 1
5,959172548_Collar Type,0 6
6,962038495_Collar Type,3 1
7,1029774483_Collar Type,1 8
8,1067777027_Collar Type,1 3
9,1270302483_Collar Type,3 15


In [22]:
df_na['tagging'] = np.nan

In [26]:
df_na.to_csv('testing.csv', index = False)

In [29]:
for cat in categories:
    for column in '{}_feature_columns'.format(cat):
        print (column)

b
e
a
u
t
y
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s
f
a
s
h
i
o
n
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s
m
o
b
i
l
e
_
f
e
a
t
u
r
e
_
c
o
l
u
m
n
s


In [1]:
import pandas as pd