In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
import pandas as pd
# Read the tab-separated training and test files, then report both shapes.
train_df, test_df = (
    pd.read_table(tsv_path)
    for tsv_path in ('../input/mercari/train.tsv', '../input/mercari/test.tsv')
)
print(train_df.shape, test_df.shape)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

In [None]:
# Remove every training row priced below 3.0, then report the remaining
# shape together with the largest and smallest price.
low_price_rows = train_df.index[(train_df['price'] < 3.0).to_numpy()]
train_df = train_df.drop(index=low_price_rows)
print(train_df.shape)
print(train_df['price'].max())
print(train_df['price'].min())

In [None]:
import matplotlib.pyplot as plt
# Histogram of all training prices.
train_df['price'].hist()

In [None]:
# Histogram restricted to prices between 0 and 100.
train_df['price'].hist(range=(0, 100))

In [None]:
import numpy as np

# Store log(1 + price) of the training prices in a new 'target' column.
train_df["target"] = np.log1p(train_df["price"])
# Histogram of the transformed target.
train_df.target.hist()


In [None]:
def split_cat(text):
    """
    Split a '/'-separated category path into exactly three levels.

    Parameters:
        text: category string such as 'Men/Tops/T-shirts'. Any non-string
            value (e.g. the float NaN pandas uses for missing data, or None)
            is treated as missing.

    Returns:
        tuple of three str: the first three levels of the path. Paths with
        fewer than three levels are padded with 'No Label', deeper paths are
        truncated to their first three levels, and missing input yields
        three 'No Label' entries.
    """
    no_label = 'No Label'
    # An explicit type check replaces the former bare `except`, which would
    # also have hidden unrelated errors (and KeyboardInterrupt).
    if not isinstance(text, str):
        return (no_label, no_label, no_label)
    levels = text.split('/')[:3]
    # Always hand back three values so that unpacking the results into three
    # columns cannot fail on a shallow category path.
    levels += [no_label] * (3 - len(levels))
    return tuple(levels)

# Store the three category levels produced by split_cat in the columns
# 'subcat_0', 'subcat_1' and 'subcat_2' of both the training and test frame.
for _frame in (train_df, test_df):
    _frame['subcat_0'], _frame['subcat_1'], _frame['subcat_2'] = zip(
        *_frame['category_name'].apply(split_cat))
del _frame


In [None]:
# Check the training frame, which now carries the subcat_* columns.
train_df.head()

In [None]:
# Check the test frame, which now carries the subcat_* columns.
test_df.head()

In [None]:
# Combine the training and test frames so brand names from both are seen.
full_set = pd.concat([train_df, test_df])
# Collect every distinct brand name. Rows without a brand (NaN) are skipped
# so the lookup set holds only real names.
all_brands = set(full_set['brand_name'].dropna())

# Replace missing brand names (NaN) with the placeholder 'missing'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df[col]` is a chained in-place update, which pandas warns about and which
# no longer reaches the parent frame once Copy-on-Write is active.
train_df['brand_name'] = train_df['brand_name'].fillna('missing')
test_df['brand_name'] = test_df['brand_name'].fillna('missing')

# Number of training rows whose brand is 'missing' at this point
train_premissing = int((train_df['brand_name'] == 'missing').sum())
# Number of test rows whose brand is 'missing' at this point
test_premissing = int((test_df['brand_name'] == 'missing').sum())

def brandfinder(line, brands=None):
    """
    Try to recover a brand name from the product name.

    Parameters:
        line: two-element row (brand name, product name), e.g. a row of
            df[['brand_name', 'name']] handed over by
            DataFrame.apply(..., axis=1), or a plain tuple/list.
        brands: optional collection of known brand names to search. When
            omitted, the module-level `all_brands` set is used.

    Returns:
        - the first word of the product name that is a known brand, when the
          brand is 'missing';
        - the product name itself, when the whole name is a known brand;
        - otherwise the brand unchanged.
    """
    if brands is None:
        brands = all_brands

    # Unpack by iteration instead of line[0] / line[1]: integer keys on a
    # label-indexed row rely on a positional fallback that pandas deprecated.
    brand, name = list(line)[:2]

    # A non-string product name (e.g. NaN) cannot be searched for brands.
    if not isinstance(name, str):
        return brand

    if brand == 'missing':
        # Words are separated by single spaces, as in the original lookup.
        for word in name.split(' '):
            if word in brands:
                # Return the matching word rather than the whole product
                # name, so that the result is an actual brand.
                return word
    if name in brands:
        # The complete product name is itself a known brand.
        return name

    # Nothing matched: keep the brand as it was.
    return brand

# Rewrite the brand column of both frames by running brandfinder on each
# (brand_name, name) row.
train_df['brand_name'] = train_df[['brand_name', 'name']].apply(brandfinder, axis=1)
test_df['brand_name'] = test_df[['brand_name', 'name']].apply(brandfinder, axis=1)

# Count the rows still labelled 'missing' after the rewrite and derive how
# many brands were filled in for each frame.
train_len = int((train_df['brand_name'] == 'missing').sum())
test_len = int((test_df['brand_name'] == 'missing').sum())
train_found = train_premissing - train_len
test_found = test_premissing - test_len
print(train_premissing) # training rows with a missing brand before the rewrite
print(train_found)      # training rows whose brand was filled in
print(test_premissing)  # test rows with a missing brand before the rewrite
print(test_found)       # test rows whose brand was filled in


In [None]:
# Check the training frame after the brand names were rewritten.
train_df.head()

In [None]:
# Stack the training and test frames into one frame so that the following
# preprocessing steps run on both at once. sort=False leaves the column
# order unsorted.
# NOTE(review): no ignore_index is passed, so each frame keeps its own row
# labels; labels are presumably duplicated in full_df — confirm before
# relying on label-based (.loc) access.
full_df = pd.concat([train_df, test_df], sort=False)

In [None]:
def fill_missing_values(df):
    """
    Replace missing text fields with the placeholder 'missing'.

    Parameters:
        df (pandas.DataFrame): frame holding the columns 'category_name',
            'brand_name' and 'item_description'. It is modified in place.

    Returns:
        pandas.DataFrame: the same frame object that was passed in.
    """
    # Assign each result back to its column. The previous
    # `df.col.fillna(..., inplace=True)` form is a chained in-place update:
    # pandas warns about it and, with Copy-on-Write, it leaves `df` untouched.
    for column in ('category_name', 'brand_name', 'item_description'):
        df[column] = df[column].fillna('missing')
    # Treat the literal text 'No description yet' as missing as well.
    df['item_description'] = df['item_description'].replace(
        'No description yet', 'missing')
    return df

# Apply the 'missing' placeholders to the combined frame.
full_df = fill_missing_values(full_df)

In [None]:
from sklearn.preprocessing import LabelEncoder

print("Processing categorical data...")

# A single encoder instance is re-fitted for every column in turn.
le = LabelEncoder()
# Encode 'category_name' into a new integer column 'category'
# (the text column itself is left in place).
full_df['category'] = le.fit_transform(full_df['category_name'])
# Replace the brand and the three sub-category columns by their integer codes.
for _column in ('brand_name', 'subcat_0', 'subcat_1', 'subcat_2'):
    full_df[_column] = le.fit_transform(full_df[_column])
del le, _column

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Build one flat array holding every text the tokenizer is fitted on:
# [description 1, description 2, ..., name 1, name 2, ..., category 1, ...]
print("Transforming text data to sequences...")
_descriptions = full_df['item_description'].str.lower()  # item descriptions
_names = full_df['name'].str.lower()                      # product names
_categories = full_df['category_name'].str.lower()        # category paths
raw_text = np.hstack([_descriptions, _names, _categories])
print('sequences shape', raw_text.shape)

# Fit a single Tokenizer on all of the text gathered above.
print(" Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

# Convert the descriptions and the names into integer sequences
# using the fitted tokenizer.
print(" Transforming text to sequences...")
full_df['seq_item_description'] = tok_raw.texts_to_sequences(_descriptions)
full_df['seq_name'] = tok_raw.texts_to_sequences(_names)

# Release the tokenizer and the temporary lower-cased copies.
del tok_raw, _descriptions, _names, _categories

print(full_df.seq_item_description.head())
print(full_df.seq_name.head())


In [None]:
from keras.preprocessing.sequence import pad_sequences
# Preview the sequences padded to a fixed length via maxlen (80 for item
# descriptions, 10 for product names). The padded arrays are only printed
# here, not stored.
print(pad_sequences(full_df.seq_item_description, maxlen=80),'\n') # item descriptions
print(pad_sequences(full_df.seq_name, maxlen=10))                  # product names
