In [None]:
import numpy as np
import pandas as pd 
pd.options.plotting.backend = "matplotlib"
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import gc
import cudf
from fastai.tabular.core import add_datepart #fails because of some issue with weeks in cudf?
import cupy as cp
from cuml.cluster import KMeans
from cuml.datasets import make_blobs


In [None]:
# Read the transaction log (parsing t_dat as a date) and the customer table
# with cuDF.
transactions = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    parse_dates=['t_dat'],
)
customers = cudf.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/customers.csv'
)

# Surrogate key: each customer's row position becomes its integer user_id.
customers['user_id'] = range(len(customers))

# Bring user_id onto every transaction by joining on customer_id.
transactions = transactions.merge(
    customers[['customer_id', 'user_id']],
    on='customer_id',
)
transactions['article_id'] = transactions['article_id'].astype('int32')
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

# Keep only the date, user and article columns.
transactions = transactions[['t_dat', 'user_id', 'article_id']]
print(transactions.shape)
transactions.head()

In [None]:
# Number of non-null t_dat rows for every (user_id, article_id) pair,
# flattened back into ordinary columns.
tmp = (
    transactions
    .groupby(['user_id', 'article_id'])['t_dat']
    .agg('count')
    .reset_index()
)
tmp.columns = ['user_id', 'article_id', 'ct']
tmp.tail()

In [None]:
# Attach each (user_id, article_id) pair's row count `ct` to its transactions.
transactions = transactions.merge(tmp, how='left', on=['user_id', 'article_id'])

# Collapse to one row per (user_id, article_id): order by ct then t_dat (both
# descending), drop repeated pairs, then re-apply the same ordering.
# NOTE(review): with the default keep='first' this presumably retains the most
# recent row of each pair -- verify against cuDF's drop_duplicates ordering.
transactions = (
    transactions
    .sort_values(['ct', 't_dat'], ascending=False)
    .drop_duplicates(['user_id', 'article_id'])
    .sort_values(['ct', 't_dat'], ascending=False)
)
transactions.tail()

In [None]:
# Expand the transaction date into calendar features (one column per .dt
# attribute, in this order), then discard the raw timestamp column.
for part in ('year', 'month', 'day', 'dayofweek', 'dayofyear',
             'is_month_end', 'is_month_start'):
    transactions[part] = getattr(transactions['t_dat'].dt, part)
transactions.drop(columns=['t_dat'], inplace=True)

transactions.tail()

In [None]:
# Encode user_id as a categorical column and record its integer category codes.
transactions['cust_cat'] = transactions.user_id.astype('category')
transactions['cat_codes'] = transactions.cust_cat.cat.codes

# Snapshot of the id / category / code columns so they can be put back
# together later.
cust_cat_df = transactions[['user_id', 'cust_cat', 'cat_codes']]
print(cust_cat_df.dtypes)
cust_cat_df.head()

In [None]:
# The categorical helper columns were snapshotted into cust_cat_df; remove
# them from the working frame and show the remaining dtypes.
transactions.drop(['cust_cat', 'cat_codes'], axis=1, inplace=True)
transactions.dtypes

In [None]:
# Keep the customer_id <-> user_id mapping aside: customer_id is dropped from
# `customers` further down and merged back in just before the export.
customers_id_user_id = customers[['customer_id', 'user_id']]

# Missing values in these two string columns become the literal value 'None'.
for col in ('fashion_news_frequency', 'club_member_status'):
    customers[col] = customers[col].fillna('None')

# A bare expression that is not the last line of a cell is never displayed,
# so print the shape explicitly (the following cell shows the dtypes) and
# preview a few rows instead of dumping the whole frame.
print(customers.shape)
customers.head()

In [None]:
# Column dtypes of the customer table after the missing-value fill above.
customers.dtypes

In [None]:
# Display the customer_id <-> user_id lookup table.
customers_id_user_id

In [None]:
# customer_id is no longer needed on the feature frame (the mapping lives in
# customers_id_user_id).
customers.drop(columns='customer_id', inplace=True)

# Column groups by dtype. cat_names and obj_names both select the object
# (string) columns; cont_names selects the int64 columns.
cat_names = customers.select_dtypes(include=['object']).columns
cont_names = customers.select_dtypes(include=['int64']).columns
obj_names = customers.select_dtypes(include=['object']).columns

# Add a categorical twin '<name>_cat' for every string column ...
for col in cat_names:
    customers[col + '_cat'] = customers[col].astype('category')

# ... then remove the original string columns in one go.
customers.drop(columns=list(obj_names), inplace=True)

customers.dtypes

In [None]:
# Cast the lookup key to float64. Every int64 column of `customers` (user_id
# included) is cast to float further down, and the two frames are later merged
# on user_id -- presumably this keeps the merge keys on the same dtype.
customers_id_user_id['user_id'] = customers_id_user_id.user_id.astype('float64')
customers_id_user_id.dtypes

In [None]:
# Sum the per-pair counts `ct` for each user, keeping user_id as a regular
# column rather than the index.
times_bought = (
    transactions[['user_id', 'ct']]
    .groupby('user_id', as_index=False)
    .sum()
)
times_bought.head()

In [None]:
# Left-join the per-user totals so customers with no rows in times_bought are
# kept (their ct is null until filled below).
customers = customers.merge(times_bought, on='user_id', how='left')

# Replace nulls with 0 in the count and the numeric profile columns.
# NOTE(review): filling a missing age with 0 makes those customers look like
# age-0 records to any downstream model -- confirm this is intended.
for col in ('ct', 'FN', 'Active', 'age'):
    customers[col] = customers[col].fillna(0)

customers.head()

In [None]:
# Replace every categorical column by its integer category code, and keep a
# side table pairing each category column with its code so the codes can be
# mapped back to their labels later.
cat_names = customers.select_dtypes(include=['category']).columns
customers_cat_df = cudf.DataFrame()

for i in cat_names:
    code_col = i + '_cat_code'
    customers[code_col] = customers[i].cat.codes

    # Save them to put them back together later. The code column uses the
    # same '<name>_cat_code' name as on `customers` (the original omitted the
    # underscore here, so the two frames disagreed on the column name).
    customers_cat_df[i] = customers[i]
    customers_cat_df[code_col] = customers[code_col]
    


In [None]:
# Display the saved category / category-code lookup table.
customers_cat_df

In [None]:
# The categorical columns are now represented by their '*_cat_code' columns,
# so drop all of them from the feature frame in a single call.
customers.drop(columns=list(cat_names), inplace=True)
customers.dtypes

In [None]:
# Cast every int64 column to float, one column at a time.
int64s = customers.select_dtypes(include=['int64']).columns
for int_col in int64s:
    customers[int_col] = customers[int_col].astype(float)
customers.dtypes

In [None]:
# Feature column names: every column of `customers` except the target 'ct'.
# NOTE(review): user_id is still in this list, so it is fed to the model as a
# feature -- confirm that is intended.
cols_list = customers.columns.to_list()
cols_list.remove('ct')

In [None]:
# Copy the feature matrix and the target from cuDF into pandas objects for
# scikit-learn.
X, y = customers[cols_list].to_pandas(), customers['ct'].to_pandas()

In [None]:
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot

# Random forest regressing the purchase count `ct` on the customer features;
# it is used below only for its feature importances.
# random_state pins the bootstrap / feature sampling so the importance ranking
# is reproducible across runs; n_jobs=-1 builds the trees on all cores and
# does not change the fitted model.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X, y)

In [None]:
# Pair each feature name with its importance, largest first, and draw the
# result as a horizontal bar chart.
fi_plot_df = (
    pd.DataFrame({'cols': X.columns, 'imp': model.feature_importances_})
    .sort_values('imp', ascending=False)
)
fi_plot_df.plot(kind="barh", x='cols')

In [None]:
# Narrow the frame to the join key (user_id) plus the columns kept for
# clustering.
customers = customers[[
    'user_id',
    'postal_code_cat_cat_code',
    'age',
    'club_member_status_cat_cat_code',
    'Active',
    'ct',
]]

In [None]:
# Elbow method: fit KMeans for k = 1..9 and plot the inertia of each fit.
#
# user_id is only a row index assigned at load time (range(len(customers))),
# not a customer attribute. Left in, it would take part in every distance
# KMeans computes, so it is excluded from the clustering features here.
elbow_features = customers.drop(columns=['user_id'])

Sum_of_squared_distances = []
K = range(1, 10)
for num_clusters in K:
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(elbow_features)
    Sum_of_squared_distances.append(kmeans.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Sum of squared distances/Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Three-cluster KMeans model.
# user_id is an arbitrary row index (range(len(customers))), not a customer
# attribute, so it is excluded from the columns the model is fitted on; it
# stays on `customers` itself as the join key.
kmeans_float = KMeans(n_clusters=3)
kmeans_fit = kmeans_float.fit(customers.drop(columns=['user_id']))

In [None]:
# Print the per-row cluster labels and the fitted centroids, each under its
# own heading line.
print("labels:", kmeans_float.labels_, sep="\n")
print("cluster_centers:", kmeans_float.cluster_centers_, sep="\n")

In [None]:
# fit_predict refits the model, so this is the fit whose labels_ are read
# afterwards. user_id is an arbitrary row index (range(len(customers))), not
# a customer attribute, so it is excluded from the columns clustered on.
kmeans_float.fit_predict(customers.drop(columns=['user_id']))

In [None]:
# Store the cluster label of the most recent KMeans fit as a new 'clusters'
# column on the customer frame.
labels = kmeans_float.labels_
customers['clusters'] = labels

In [None]:
# Number of customers assigned to each cluster.
customers['clusters'].value_counts()

In [None]:
# Preview the last rows of the clustered customer frame.
customers.tail()

In [None]:
# Bring the original customer_id back onto the clustered frame via user_id
# (default inner join). The bare `customers_id_user_id['user_id']` expression
# that used to open this cell was never displayed and has been removed.
customers = customers.merge(customers_id_user_id, on='user_id')
customers.head()

In [None]:
# customer_id is back on the frame, so the surrogate user_id key can go.
customers.drop('user_id', axis=1, inplace=True)
customers.head()

In [None]:
# Write the clustered customer table to CSV in the working directory, without
# the index column.
customers.to_csv('customers_clustered.csv', index=False)