In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import copy
import util

# Transform data
parser = lambda date: pd.to_datetime(date)
df = dataset = pd.read_excel('dataset.xlsx')

# Parse date time
df['Date'] = df['Date'].apply(parser)
df['transaction_date'] = df['Date'].dt.date
df['is_weekend'] = df['Date'].dt.dayofweek.apply(util.is_weekend)
df['section_time'] = df['Date'].dt.hour.apply(util.quarter_time)

# Parse continous value
df['bin_price'] = util.discretize(df['Price'], np.array([1000, 2000, 3000, 4000, 5000, 6000]))
df['bin_discount'] = util.discretize(df['Discount'], np.array([100, 200, 300, 500]))
df['bin_net'] = util.discretize(df['NMV'], np.array([900, 1800, 2700, 3500, 5000, 6000]))

# Parse categories
df = df[pd.notnull(df['StuffCategories'])] #  clear null StuffCategories, cause it has only 8 rows
data_without_null = copy.copy(df)
df['bought_categories'] = df['StuffCategories'].str.split(',')
df['bought_categories'] = df['bought_categories'].apply(util.unique_cat)

parent_categories = ['lifestyle', 'men_fashion', 'women_fashion']
bought_categories = [
    'all', 'lifestyle_crafts', 'lifestyle_for_home', 'lifestyle_gadget_tech', 
    'lifestyle_gift_ideas', 'lifestyle_other', 'lifestyle_phone_accessories', 'lifestyle_sports', 
    'lifestyle_stationeries', 'men_bags_wallets', 'men_glasses', 
    'men_hats', 'men_jewelry', 'men_pants', 'men_shoes', 'men_shorts', 'men_tops', 
    'men_underwear', 'men_watches', 'women_bags_wallets', 'women_dresses', 
    'women_glasses', 'women_hats', 'women_jackets_blazers', 
    'women_jewelry', 'women_other', 'women_pants_leggings', 'women_shoes', 
    'women_shorts', 'women_skirts', 'women_sports', 'women_swimwear', 'women_tops', 
    'women_watches', 'lifestyle', 'men_fashion', 'women_fashion'
]
df_cat = pd.DataFrame(index=None, columns=bought_categories)
df = pd.concat([df, df_cat]).fillna(0)
df = df.apply(util.word_matrix, axis=1)

# Drop all unused columns
data_without_null = data_without_null.drop('bought_categories', axis=1)
df = df.drop('Date', axis=1)
df = df.drop('StuffCategories', axis=1)
df = df.drop('bought_categories', axis=1)
df = df.drop('transaction_date', axis=1)
df = df.drop('Price', axis=1)
df = df.drop('Discount', axis=1)
df = df.drop('NMV', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [28]:
# Predict

# cross-validate 10 time, 90/10
# to prevent bias in model
result_set = []
for step in range(0, 10):
    
    # Split train test each round
    expected_test_size = 10
    test_percent = ((step + 1) * expected_test_size) / 100
    test_data_num = round(len(df) * test_percent)
    train_data_num = round(len(df) - test_data_num)
    train_data = np.array(df[:test_data_num])
    test_data = np.array(df[train_data_num:len(df)])
    
    # Predict
    kmeans = KMeans(n_clusters=len(bought_categories)).fit(train_data)
    kmeans.predict(test_data)
    
    # Map result to labels
    result = []
    labels = kmeans.labels_
    for label in labels:
        result = np.append(result, bought_categories[label])
    df_result = pd.DataFrame(result)
    df_predicted = data_without_null[train_data_num:len(data_without_null)]
    dataset_suggested = pd.concat([df_predicted.reset_index(drop=True), df_result], axis=1)
    np.append(result_set, dataset_suggested)

# Write to output file
dataset_suggested.to_csv('output.csv')
