In [7]:
import pandas as pd
import numpy as np
import pickle

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Load the processed product table from a gzip-compressed JSON file stored in
# "records" orientation. The path is a plain string: the original f-prefix had
# no placeholders to interpolate, so it has been dropped.
df_products = pd.read_json(
    '../../../data/processed/products.json.gz',
    orient="records",
    compression="gzip",
)

In [6]:
# Preview the first five rows to sanity-check the loaded columns.
df_products.head(n=5)

Unnamed: 0,keywords,categories
0,sony mdr monitor series headphone swivel earcu...,[headphones]
1,fujifilm digital camera optical zoom bundle fu...,[cameras]
2,sony mvc mavica digital camera optical zoom ca...,[cameras]
3,plantronics duoset convertible headset discont...,[headphones]
4,fujifilm finepix digital camera optical zoom f...,[cameras]


In [None]:
# Define the features
# Text features: TF-IDF weights over the product keywords, with English stop
# words removed.
tfidf = TfidfVectorizer(stop_words='english')
x_desc = tfidf.fit_transform(df_products['keywords'])

# Category features: one binary indicator column per distinct category label.
mlb = MultiLabelBinarizer()
x_cat = mlb.fit_transform(df_products['categories'])

# TfidfVectorizer.fit_transform returns a SciPy sparse matrix, which np.hstack
# cannot concatenate with the dense array produced by MultiLabelBinarizer.
# Densify the text block first so X is a plain 2-D ndarray (this is also what
# np.save expects downstream).
# NOTE(review): densifying costs rows x vocabulary floats of memory; if the
# vocabulary grows large, switch to scipy.sparse.hstack and a sparse save format.
X = np.hstack([x_desc.toarray(), x_cat])

# One feature row per product.
assert X.shape[0] == len(df_products), "feature matrix lost or gained rows"

In [None]:
# Persist the combined feature matrix and the two fitted transformers.
np.save('feature_matrix.npy', X)

# NOTE(review): the file names below do not describe their contents (the TF-IDF
# vectorizer is written to 'logit_model.pkl', the label binarizer to
# 'word_vectorizer.pkl'). The paths are kept unchanged because readers elsewhere
# may depend on them; confirm and rename together with those readers.
# `with` guarantees each handle is flushed and closed even if pickling raises;
# the original passed a bare open() and never closed it.
with open('./model/logit_model.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open('./model/word_vectorizer.pkl', 'wb') as mlb_file:
    pickle.dump(mlb, mlb_file)

In [None]:
# Factorize the feature matrix into two non-negative factors, X ~ W @ H,
# using 10 latent components and a fixed seed for the random initialisation.
model = NMF(n_components=10, init='random', random_state=42)
W = model.fit_transform(X)   # one row of component weights per row of X
H = model.components_        # one row per component, one column per feature

# Score every row of X against a single preference vector and keep the ten
# highest-scoring row positions, best first.
# NOTE(review): the preference vector is the component loading of feature
# column 0, so the scores equal the reconstructed value of that first feature
# for each row -- confirm this is the intended notion of "preference".
item_preferences = H[:, 0]
item_scores = W @ item_preferences
recommended_items = np.argsort(item_scores)[::-1][:10]