In [60]:
import sys
sys.path.insert(0, '../../../')
from src.utils.preprocessing import to_lower
from src.utils.preprocessing import remove_html_tag
from src.utils.preprocessing import remove_url
from src.utils.preprocessing import remove_punctuation
from src.utils.preprocessing import remove_stopword
from src.utils.preprocessing import remove_n_chars
from src.utils.preprocessing import lemmatize_word

import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
# Load the processed product catalogue (gzip-compressed JSON records).
# The path is a plain string literal: it has no placeholders, so no f-prefix.
df_products = pd.read_json('../../../data/processed/products.json.gz', orient="records", compression="gzip")
df_products.head()

Unnamed: 0,product_id,keywords
0,B00001W0DG,sony mdrv500dj monitor series headphone swivel...
1,B00004TLW2,fujifilm mx2900 23mp digital camera 3x optical...
2,B00004VUM1,sony mvcfd95 mavica 2mp digital camera 10x opt...
3,B00004WFYN,plantronics h141 duoset convertible headset di...
4,B00004XSHN,fujifilm finepix 4900 43mp digital camera 6x o...


In [62]:
# TF-IDF over unigrams and bigrams of the `keywords` column.
#   max_df=0.99       -> drop terms that occur in more than 99% of the documents
#   min_df=1          -> keep terms that occur in at least one document
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term frequency
tfidf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1, ngram_range=(1, 2), sublinear_tf=True)

# One row per product in df_products, one column per retained term.
item_feature_matrix = tfidf_vectorizer.fit_transform(df_products['keywords'])

# Pairwise cosine similarity between every pair of product rows.
item_similarity_matrix = cosine_similarity(item_feature_matrix)

In [63]:
def preprocessor(text: str) -> str:
    """Run ``text`` through the project's text-cleaning helpers, in a fixed order.

    The helpers from ``src.utils.preprocessing`` are applied as:
    ``to_lower`` -> ``remove_html_tag`` -> ``remove_url`` ->
    ``remove_punctuation`` -> ``remove_stopword`` -> ``remove_n_chars(_, 1)``
    -> ``lemmatize_word``.

    Args:
        text: The raw input string.

    Returns:
        The string returned by the last helper in the chain.
    """
    # Markup-level cleanup first: case folding, then HTML tags, then URLs.
    cleaned = remove_url(remove_html_tag(to_lower(text)))
    # Token-level cleanup next: punctuation, then stopwords.
    cleaned = remove_stopword(remove_punctuation(cleaned))
    # NOTE(review): presumably drops tokens of length <= 1 -- confirm in remove_n_chars.
    cleaned = remove_n_chars(cleaned, 1)
    return lemmatize_word(cleaned)

# Attach the custom preprocessor only now, after the fit above: the vocabulary
# and IDF weights were learned with the vectorizer's default preprocessing.
# NOTE(review): presumably `keywords` is already cleaned upstream (it is read
# from data/processed), so this hook is meant for raw text passed to
# transform() later -- confirm the upstream cleaning matches `preprocessor`.
tfidf_vectorizer.set_params(preprocessor=preprocessor)

In [64]:
# Persist the fitted vectorizer and the precomputed similarity matrix.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, rather than whenever the garbage collector
# happens to reclaim it.
#
# NOTE: pickle stores functions by qualified name, not by value. The vectorizer
# references `preprocessor`, which is defined in this notebook, so the process
# that loads tfidf_vectorizer.pkl must expose a function under that same
# module/name (here `__main__.preprocessor`) or unpickling will fail.
with open('../../../models/content_based_filtering/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

with open('../../../models/content_based_filtering/item_similarity_matrix.pkl', 'wb') as similarity_file:
    pickle.dump(item_similarity_matrix, similarity_file)