# Project Name: Smart Suggest

In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors



### Read Data

In [2]:
df = pd.read_csv("ecommerce/data.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


### Preprocess Data

In [3]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [4]:
df.shape

(541909, 8)

In [5]:
df = df.dropna(subset=['CustomerID', 'StockCode', 'Quantity', 'UnitPrice'])

In [6]:
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [7]:
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [8]:
df.shape

(406829, 8)

In [9]:
aggregated_df = df.groupby(['CustomerID', 'StockCode'])['Quantity'].sum().reset_index()
aggregated_df

Unnamed: 0,CustomerID,StockCode,Quantity
0,12346.0,23166,0
1,12347.0,16008,24
2,12347.0,17021,36
3,12347.0,20665,6
4,12347.0,20719,40
...,...,...,...
267610,18287.0,84920,4
267611,18287.0,85039A,96
267612,18287.0,85039B,120
267613,18287.0,85040A,48


In [10]:
user_item_matrix = aggregated_df.pivot(index='CustomerID', columns='StockCode', values='Quantity').fillna(0)

user_item_matrix

StockCode,10002,10080,10120,10123C,10124A,10124G,10125,10133,10135,11001,...,90214Y,90214Z,BANK CHARGES,C2,CRUK,D,DOT,M,PADS,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12347.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
12349.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18281.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18282.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18283.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


### Build Recommendation Model

In [11]:
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(user_item_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

### Evaluate the Model

In [12]:
def get_recommendation(user_id, num_recommendations=5):
    # Get the user's row from matrix
    user_index = user_item_matrix.index.get_loc(user_id)
    user_vector = user_item_matrix.iloc[user_index].values.reshape(1, -1)
    
    # Find similar users
    distances, indices = model.kneighbors(user_vector, n_neighbors=num_recommendations + 1)
    
    # Get the products IDs for recommendation
    # Skip the first index as it is the user itself
    similar_user_indices = indices.flatten()[1:]
    print("Similar User Indices:", similar_user_indices)
    print("Length of User Indices:", len(similar_user_indices))
    recommendations = []
    for idx in similar_user_indices:
        recommended_products = user_item_matrix.iloc[idx].nlargest(num_recommendations).index.tolist()
        recommendations.extend(recommended_products)
        
    # Remove the duplicates and products that the user has already purchased
    recommendations = list(set(recommendations) - set(user_item_matrix.columns[user_vector[0] > 0]))
    return recommendations[:num_recommendations]


# Example: Get recommendations for the sample user
sample_user_id = df['CustomerID'].iloc[0]
recommendations = get_recommendation(sample_user_id)
print(f"\nRecommendations for user {sample_user_id}: {recommendations}")

Similar User Indices: [ 625 1107 1547 1380 3879]
Length of User Indices: 5

Recommendations for user 17850.0: ['22789', '21754', '10123C', '84077', '23331']




### Recommendation System Primary Columns

In the recommendation system, the primary columns used from the dataset to make recommendations are:

1): CustomerID: This column uniquely identifies each customer. It is used as the index in the user-item matrix to represent different users.

2): StockCode: This column uniquely identifies each product. It is used as the columns in the user-item matrix to represent different products.

3): Quantity: This column represents the quantity of each product purchased by a customer. It is used as the values in the user-item matrix to indicate the strength of interaction between a user and a product.

### Why These Columns?

CustomerID and StockCode: These columns are essential for creating a user-item interaction matrix, which is a fundamental structure in collaborative filtering. The matrix captures the relationship between users and products, enabling the system to identify patterns and similarities in purchasing behavior.

Quantity: The quantity of products purchased is used as a proxy for user preference or interest. A higher quantity purchased might indicate a stronger preference for that product. This information helps in identifying which products are likely to be of interest to similar users.

# THE END