# Joshua Kobuskie

In [1]:
#INSTALL JAX AND OTHER LIBRARIES
!pip install jax
import jax
import pandas as pd
import numpy as np
import gc
import sys

In [2]:
def obj_size_fmt(num):
    """Format a byte count as a short human-readable string.

    The value is scaled in steps of 1024 and labelled ``B``, ``KB``, ``MB``
    or ``GB``; anything of 1024**3 bytes or more is reported in ``GB``.
    The same factor is used for both the unit boundary and the divisor, so a
    value is never printed as a fraction of the next unit up
    (e.g. 1000 bytes is ``"1000.00B"``, not ``"0.98KB"``).

    Parameters
    ----------
    num : int or float
        Size in bytes.

    Returns
    -------
    str
        The scaled value with two decimals, immediately followed by its unit.
    """
    step = 1024
    units = ("B", "KB", "MB", "GB")
    value = float(num)
    # Walk up the units until the value fits; GB is the catch-all at the end.
    for unit in units[:-1]:
        if value < step:
            return "{:.2f}{}".format(value, unit)
        value /= step
    return "{:.2f}{}".format(value, units[-1])


def memory_usage():
    """Return the ten largest module-level names by ``sys.getsizeof``.

    Returns
    -------
    pandas.DataFrame
        Indexed by variable name, with a single ``Size`` column holding the
        size formatted by ``obj_size_fmt``; rows are ordered largest first.
    """
    sizes_by_name = {name: sys.getsizeof(obj) for name, obj in globals().items()}
    # One row per variable name, one numeric 'Size' column
    report = pd.DataFrame(sizes_by_name, index=['Size']).T
    # Rank on the raw byte counts before they are turned into text
    report = report.sort_values(by='Size', ascending=False).head(10)
    report['Size'] = report['Size'].apply(obj_size_fmt)
    return report


In [3]:
# Load the customer table
customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv')

# Flag columns: a missing value is stored as 0.0
for flag_column in ('FN', 'Active'):
    customers.loc[customers[flag_column].isnull(), flag_column] = float(0.0)

# Text columns turned into numeric weights; a missing value becomes 0.0
category_weights = {
    'fashion_news_frequency': {'NONE': 0.0, 'None': 0.0, 'Regularly': 0.5, 'Monthly': 1.0},
    'club_member_status': {'LEFT CLUB': 0.0, 'PRE-CREATE': 0.5, 'ACTIVE': 1.0},
}
for column_name, weights in category_weights.items():
    for label, weight in weights.items():
        customers.loc[customers[column_name] == label, column_name] = float(weight)
    customers.loc[customers[column_name].isnull(), column_name] = float(0.0)
    # Raises here if a label outside the table above is still present
    customers[column_name] = customers[column_name].astype('float64')

# Customers without an age are given the mean of the known ages
average_age = customers['age'].mean()
customers['age'] = customers['age'].fillna(average_age)

# Postal code is not used as a feature
customers = customers.drop(columns='postal_code')

customers['customer_id'] = customers['customer_id'].astype('string')

# Show the resulting columns and dtypes
customers.info()

In [4]:
# Read in transactions and derive calendar year and month from the purchase date
transactions = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv')

# Parse t_dat once and reuse the parsed dates for both derived columns
purchase_dates = pd.DatetimeIndex(transactions['t_dat'])
transactions['year'] = purchase_dates.year
transactions['month'] = purchase_dates.month
del purchase_dates

# Keep 2020 onwards. The explicit copy makes the filtered frame independent,
# so the column assignments below do not write into a slice of the full frame
# (which triggers pandas' SettingWithCopyWarning).
transactions = transactions[transactions['year'] >= 2020].copy()
transactions['t_dat'] = transactions['t_dat'].astype('string')
transactions['customer_id'] = transactions['customer_id'].astype('string')
transactions.info()
transactions.head()

In [5]:
# Load the article catalogue
from sklearn.preprocessing import LabelEncoder

articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv')

# Replace the text labels of these two columns with integer codes.
# The same encoder object is refitted for each column.
labelencoder = LabelEncoder()
for text_column in ('product_group_name', 'index_code'):
    articles[text_column] = labelencoder.fit_transform(articles[text_column])

# Keep only the article id and the numeric descriptor columns
feature_columns = [
    'article_id',
    'product_code',
    'product_type_no',
    'product_group_name',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]
articles = articles[feature_columns]
articles.info()
articles.head(10)

In [6]:
# Join transactions to their customer and article rows (default inner joins).
# Earlier attempts started from `customers` with left joins instead.
data = pd.merge(
    pd.merge(transactions, customers, on='customer_id'),
    articles,
    on='article_id',
)
# Row identifier used later to re-attach the columns set aside before modelling
data['ID'] = data.index

# `customers` is deliberately kept: it is merged onto the predictions later
del transactions, articles
gc.collect()
memory_usage()

In [7]:
# Column names, dtypes and non-null counts of the merged frame
data.info()

In [8]:
# Preview the first ten merged rows
data.head(10)

In [9]:
# Eliminate null values while maintaining customer count -- abandoned experiment.
# Nulls were overwritten with a placeholder ('00-00-0000' for t_dat, 0.0 for the rest) in:
#   t_dat, article_id, price, sales_channel_id, year, month, product_code,
#   product_type_no, product_group_name, graphical_appearance_no,
#   colour_group_code, perceived_colour_value_id, perceived_colour_master_id,
#   department_no, index_code, index_group_no, section_no, garment_group_no
# Turns out, this was a bad idea. The model predicts 0 all the time and becomes horribly inaccurate.

# Count the nulls per column of the merged frame
data.isnull().sum()

In [10]:
# JAX for random sampling
import jax.numpy as jnp
from jax import random
# jax sampling helpers were tried first and caused many issues; plain random
# integer generation is used instead.

# Draw random row positions with JAX and use them to take the modelling sample
seed = 1234
key = jax.random.PRNGKey(seed)
# Sample size, chosen as a trade-off between accuracy and run time
shape = (25000,)
# maxval is exclusive in jax.random.randint, so len(data) is the correct upper
# bound: valid positions are 0 .. len(data)-1 and the last row stays eligible.
# NOTE(review): positions are drawn independently, so the same row can be
# selected more than once.
index_rand = jax.random.randint(key, shape, 0, len(data))
index_rand = np.array(index_rand)

# Sampled rows; the target is the purchased article
x = data.iloc[index_rand, :].copy()
y = x['article_id']

# The target and the non-numeric columns are not model features. They are set
# aside in `indexes` (keyed by ID) so they can be re-joined after prediction.
# NOTE(review): ID itself remains in x and data, so it is used as a feature.
non_feature_columns = ['article_id', 't_dat', 'customer_id']
x.drop(columns=non_feature_columns, inplace = True)
indexes = data[non_feature_columns + ['ID']]
data.drop(columns=non_feature_columns, inplace = True)

In [11]:
# Hold out 20% of the sample for testing; the split is seeded so it is repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# The unsplit sample is no longer needed
del x, y
gc.collect()
memory_usage()

In [12]:
from sklearn.neighbors import KNeighborsClassifier
# Columns and dtypes of the training features
x_train.info()

In [13]:
# Number of neighbours the classifier consults for each prediction
k = 1
# NOTE(review): this return value is discarded -- the call is not the cell's
# last expression, so nothing is displayed.
memory_usage()
# Fit the k-nearest-neighbours classifier on the training split
neighbors = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)

In [14]:
# Predict an article_id for each held-out row, then show the first five predictions
yhat = neighbors.predict(x_test)
yhat[0:5]

In [15]:
from sklearn import metrics

# Accuracy on the rows the classifier was fitted on
train_accuracy = metrics.accuracy_score(y_train, neighbors.predict(x_train))
print("Train set Accuracy: ", train_accuracy)

# Accuracy on the held-out rows, reusing the predictions already stored in yhat
test_accuracy = metrics.accuracy_score(y_test, yhat)
print("Test set Accuracy: ", test_accuracy)

In [16]:
# Print the columns of the training features and of the full frame,
# so the two can be compared before predicting on `data`
x_train.info()
data.info()

In [17]:
# Predict an article for every row of the full frame, not only the sampled rows.
# Note: the name `result_data` is rebound to a DataFrame in a later cell.
result_data = neighbors.predict(data)

In [18]:
# There must be exactly one prediction per row of `data`, because the
# predictions are attached to `data` by position afterwards.
assert len(result_data) == len(data), "prediction count does not match row count"
# Display both lengths together: only a cell's last expression is shown, so two
# separate len() lines would display the second value alone.
len(result_data), len(data)

In [19]:
# Re-attach article_id, t_dat and customer_id (set aside in `indexes`) through the shared ID column
data = data.merge(indexes, on='ID')
# NOTE(review): predictions are assigned by position, which assumes the merge
# above returns the rows in the order they were passed to predict() -- TODO confirm
data['article'] = result_data.tolist()

In [20]:
# Keep one row per distinct (customer, predicted article) pair
data_ndg = data.drop_duplicates(subset = ['customer_id','article'])
# NOTE(review): `data_ngd` (letters transposed relative to `data_ndg`) is not
# read again in the visible cells -- possibly a leftover; confirm before removing
data_ngd = data_ndg.groupby('customer_id')
data_ndg.head(10)

In [21]:
# Number of distinct predicted articles per customer
data_ndg['customer_id'].value_counts()

In [22]:
# Preview the first ten de-duplicated rows
data_ndg.head(10)

In [23]:
# Keep only the columns needed to build the submission.
# Note: `result_data` previously held the raw prediction array; from here on it is a DataFrame.
result_data=data_ndg[['customer_id','article', 'ID']]
result_data

In [24]:
# Rows per customer in the trimmed prediction frame
result_data['customer_id'].value_counts()

In [25]:
# Collapse to a single predicted article per customer.
# The rows of a customer must not be summed: adding article identifiers together
# produces a number that is not an article id. Instead keep one real value, the
# prediction attached to the customer's highest-ID row.
# NOTE(review): ID comes from the merged frame's row index; whether a higher ID
# means a more recent purchase depends on the row order of the inputs -- TODO confirm.
result_final = (
    result_data.sort_values('ID')
    .groupby('customer_id', as_index=False)['article']
    .last()
)
# Right join so every customer in `customers` gets a row; customers without a
# prediction have a missing `article` at this point.
result_final = result_final.merge(customers[['customer_id']], how='right', on='customer_id')
result_final = result_final[['customer_id', 'article']]

In [26]:
result_final.columns = ['customer_id','prediction']

# Customers without any prediction get the most common predicted value.
# mode() returns a Series (several entries when values tie, in ascending order),
# so the first entry is taken explicitly rather than calling int() on the Series.
most_popular_product = result_final['prediction'].mode()
result_final['prediction'] = result_final['prediction'].fillna(int(most_popular_product.iloc[0]))

# Whole number -> text, left-padded with zeros to 10 characters
result_final['prediction'] = (
    result_final['prediction']
    .astype('int64')
    .astype('string')
    .str.zfill(10)
)
result_final['customer_id'] = result_final['customer_id'].astype('string')
result_final.head(20)

In [28]:
# Sanity check: count the null values per column of the final frame
result_final.isnull().sum()

In [27]:
import reprlib

# Serialise the final frame to CSV text; with no path given, to_csv returns a string.
# NOTE(review): nothing is written to disk here -- confirm whether a file is expected.
submit = result_final.to_csv(index=False)

# reprlib abbreviates the long string so only a short preview is printed
submit_preview = reprlib.repr(submit)
print(submit_preview)