In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import math
from subprocess import check_output
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.cross_validation import train_test_split



In [2]:
# Load the tab-separated training and test sets, then report their shapes
train, test = (pd.read_table(path) for path in ("./data/train.tsv", "./data/test.tsv"))
print(train.shape, test.shape, sep="\n")

(1482535, 8)
(693359, 7)


In [3]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...


In [4]:
# Preview the first two rows of the test frame.
test.head(2)

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."


In [5]:

# Load the sample submission file and preview its first three rows.
submission = pd.read_csv("./data/sample_submission.csv")
submission.head(3)

Unnamed: 0,test_id,price
0,0,26.738
1,1,26.738
2,2,26.738


In [6]:
# Data imputation
def dat_impute(dat, fill_value="missing"):
    """Replace every missing value in ``dat`` with ``fill_value``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame whose null entries are overwritten.
    fill_value : scalar, default "missing"
        Placeholder written wherever a value is null.
    """
    # Fill on the frame itself. The per-column form
    # ``dat[column].fillna(..., inplace=True)`` operates on a temporary
    # selection and does not update ``dat`` when pandas Copy-on-Write is active.
    dat.fillna(value=fill_value, inplace=True)
        
# Fill the gaps in both frames, then preview the training rows
for frame in (train, test):
    dat_impute(frame)
train.head(2)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...


In [7]:
# List the dtype of each column in the test frame.
test.dtypes

test_id               int64
name                 object
item_condition_id     int64
category_name        object
brand_name           object
shipping              int64
item_description     object
dtype: object

In [8]:
# Show the first two rows of the training frame.
train.head(2)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,missing,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...


In [10]:
# Categorical data encoding
def dat_encoder(train, test, columns=("category_name", "brand_name")):
    """Integer-encode categorical columns of ``train`` and ``test`` in place.

    For each column a ``LabelEncoder`` is fitted on the values of both frames
    together, so a given value receives the same code in ``train`` and ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column named in ``columns``; those
        columns are overwritten with the encoder's output.
    columns : iterable of str, default ("category_name", "brand_name")
        Names of the columns to encode.
    """
    for column in columns:
        encoder = LabelEncoder()
        # Fit on the union of both frames before transforming either one
        encoder.fit(np.hstack([train[column], test[column]]))
        train[column] = encoder.transform(train[column])
        test[column] = encoder.transform(test[column])

# Encode the categorical columns of both frames in place.
dat_encoder(train,test)

In [11]:
# Inspect the first two training rows.
train.head(2)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,829,5265,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,86,3889,52.0,0,This keyboard is in great condition and works ...


In [12]:
# Text mining - raw features
from keras.preprocessing.text import Tokenizer

# Lower-cased item descriptions followed by lower-cased names, stacked into one flat array
raw_text = np.hstack([train[col].str.lower() for col in ("item_description", "name")])
raw_text

Using TensorFlow backend.


array(['no description yet',
       'this keyboard is in great condition and works like it came out of the box. all of the ports are tested and work perfectly. the lights are customizable via the razer synapse app on your pc.',
       'adorable top with a hint of lace and a key hole in the back! the pale pink is a 1x, and i also have a 3x available in white!',
       ..., '21 day fix containers and eating plan',
       'world markets lanterns', 'brand new lux de ville wallet'], dtype=object)

In [14]:
# Create a Keras Tokenizer and fit it on the lower-cased text in raw_text
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
# The trailing expression only echoes the tokenizer object's repr
tok_raw

<keras.preprocessing.text.Tokenizer at 0x1b1fbb49ba8>