# Flipkart-products Recommendation Systems
## Content-Based Recommender Systems
## NearestNeighbors (TOP-3)

In [None]:
import numpy as np
import pandas as pd
import sklearn
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load the Flipkart sample catalogue, keep only the three columns used by the
# recommender, and drop any row that is missing one of them.
wanted_columns = ['uniq_id', 'description', 'discounted_price']
products = (
    pd.read_csv('data/flipkart_com-ecommerce_sample.csv')
    .loc[:, wanted_columns]
    .dropna()
)

# Product ids in row order; positions line up with the rows of `products`.
target = products['uniq_id'].values

In [None]:
# Coarse 4-bin histogram of the discounted prices that fall in [0, 20000].
price_series = products['discounted_price']
price_series.hist(range=[0, 20000], bins=4)

In [None]:
count_vector = CountVectorizer()
count_array = count_vector.fit_transform(products['description'])

tfidf        = TfidfTransformer()
tfidf_vector = tfidf.fit_transform(count_array)
tfitf_array  = tfidf_vector.toarray()

from sklearn import preprocessing
scaler = preprocessing.Normalizer().fit(products['discounted_price'])
new_retail_price = scaler.transform(products['discounted_price'])

new_feature = np.concatenate([tfitf_array, np.reshape(new_retail_price, [-1, 1])], axis=1)

In [None]:
new_feature.shape

In [None]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(new_feature) 

## Save file

In [None]:
#import pickle
#pickle.dump(count_vector, open('data/count_vector.p', 'wb'))
#pickle.dump(tfidf, open('data/tfidf.p', 'wb'))
#pickle.dump(scaler, open('data/scaler.p', 'wb'))

In [None]:
#from sklearn.externals import joblib
#joblib.dump(neigh, 'data/current_model.pkl')

## Predict with keyword + price

In [None]:
keyword = 'Black shirt'
price = 500.0

In [None]:
p_count_array = count_vector.transform([keyword])
p_tfidf_vector = tfidf.transform(p_count_array)
p_tfitf_array  = p_tfidf_vector.toarray()

p_new_price = scaler.transform([price])
p_new_feature = np.concatenate([p_tfitf_array, np.reshape([p_new_price], [-1, 1])], axis=1)

In [None]:
p_data = scaler.transform(p_new_feature) 

In [None]:
# Ask the fitted index for the rows closest to the query vector.
# The result is a (distances, row indices) pair.
query_result = neigh.kneighbors(p_data, return_distance=True)
distance, best_n = query_result
distance

In [None]:
# Translate each row of neighbour indices into the matching uniq_id values.
best_target = [target[neighbour_rows] for neighbour_rows in best_n]
best_target

In [None]:
# Catalogue row of the closest match for the query.
first_match_id = best_target[0][0]
products.loc[products['uniq_id'] == first_match_id]

In [None]:
# Catalogue row of the second-closest match for the query.
second_match_id = best_target[0][1]
products.loc[products['uniq_id'] == second_match_id]

In [None]:
# Catalogue row of the third-closest match for the query.
third_match_id = best_target[0][2]
products.loc[products['uniq_id'] == third_match_id]

## Test split

In [None]:
from sklearn.model_selection import KFold

# Two folds, default settings (no shuffling is requested here).
kf = KFold(n_splits=2)

# Only the partition produced by the final split is ever used, so take it
# directly instead of looping and copying the earlier fold's rows out of the
# dense feature matrix just to overwrite them.
train_index, test_index = list(kf.split(new_feature))[-1]

X_train, y_train = new_feature[train_index], target[train_index]

# Cap the evaluation set at the first 1000 held-out rows to keep it quick.
X_test = new_feature[test_index][:1000]
y_test = target[test_index][:1000]

In [None]:
# Top-3 retrieval check: for every row of X_test, is its own uniq_id among
# the ids of the 3 neighbours returned by the index?
# NOTE(review): `neigh` is fitted on all of new_feature, and X_test is sliced
# out of new_feature, so each test row is itself in the index. This measures
# self-retrieval rather than generalisation; confirm that is intended.

# kneighbors expects a 2-D (n_queries, n_features) array. Passing one 1-D row
# at a time is rejected by current scikit-learn, so query the batch at once.
distance, best_n = neigh.kneighbors(X_test, return_distance=True)

# target[best_n] holds, per test row, the ids of its returned neighbours.
# A row counts as a hit when its own id appears among them.
hits = (target[best_n] == y_test[:, None]).any(axis=1)
accuracy = int(hits.sum())

In [None]:
# Share of evaluated rows counted as hits, expressed as a percentage.
percent_hit = accuracy * 100 / len(X_test)
print('accuracy=%s' % (percent_hit,))