In [1]:
import pandas as pd
from ml_metrics import mapk
from config import RAW_DATA

INFO | APP_DIR set to .


In [2]:
raw_transactions = pd.read_csv(RAW_DATA)

In [3]:
N_ROWS = raw_transactions.shape[0]
raw_transactions.shape

(1215193, 8)

In [4]:
raw_transactions.dtypes

customer_id              int64
item_text               object
brand_name              object
cat1                    object
cat2                    object
cat3                    object
global_product_id      float64
global_product_name     object
dtype: object

In [5]:
raw_transactions.head()

Unnamed: 0,customer_id,item_text,brand_name,cat1,cat2,cat3,global_product_id,global_product_name
0,16899288,SOUTHSIDE JAL CHDR SAUSAG,oscar mayer,meat seafood poultry,hot dogs,hot dogs,749534.0,hot dogs
1,10356736,PREMIUM BLACK PLUM,generic produce,produce,fruits,plumspluots,154803.0,fresh plums
2,13867598,GV DRESSING,great value,condiments sauces seasonings,salad dressings,salad dressings,744331.0,great value thousand island dressing
3,566298,BARILLA RP ELBOWS,barilla,grains pasta,pasta,pasta,,
4,5381591,,generic ice,frozen foods,ice,ice,148374.0,ice


In [6]:
def describe_features(df):
    for column_name in df.columns:
        print(column_name.upper())
        print('Unique Values: {:,}'.format(len(df[column_name].unique())))
        print('Missing values: {}%'.format(round((df[column_name].isna().sum()/N_ROWS)*100, 2)))
        print('Modal Value: {}'.format(df[column_name].mode()[0]))
        print('\n')

In [7]:
describe_features(raw_transactions)

CUSTOMER_ID
Unique Values: 6,996
Missing values: 0.0%
Modal Value: 16091073


ITEM_TEXT
Unique Values: 237,636
Missing values: 34.63%
Modal Value: \N


BRAND_NAME
Unique Values: 6,547
Missing values: 12.47%
Modal Value: generic produce


CAT1
Unique Values: 53
Missing values: 17.55%
Modal Value: produce


CAT2
Unique Values: 250
Missing values: 18.93%
Modal Value: fruits


CAT3
Unique Values: 446
Missing values: 19.27%
Modal Value: bread


GLOBAL_PRODUCT_ID
Unique Values: 90,543
Missing values: 25.71%
Modal Value: 934387.0


GLOBAL_PRODUCT_NAME
Unique Values: 76,149
Missing values: 25.75%
Modal Value: no info




In [8]:
raw_transactions = raw_transactions.dropna(subset=['global_product_id'], axis=0)
raw_transactions['global_product_id'] = raw_transactions['global_product_id'].astype(int)

### Examining `customer_id` 14687389

In [9]:
customer_purchases = raw_transactions.loc[raw_transactions['customer_id'] == 14687389]

In [10]:
observed_purchases = customer_purchases['global_product_id'].value_counts().index
observed_purchases = [int(product_id) for product_id in observed_purchases]

In [12]:
predictions = [934387, 156309, 156322, 156329, 155230, 148407, 148695, 148680, 152459, 148789]

In [13]:
pred_list = [predictions]
observed_list = [observed_purchases]
new_products = list(set(predictions).difference(set(observed_purchases)))
new_products

[148680, 152459, 148789, 148407]

In [14]:
mean_avg_precision = mapk(observed_list, pred_list)
print(f"Mean Average Precision: {mean_avg_precision:.2f}")
print(f"Observed vs. Recommendations:")
for a,p in zip(observed_purchases, predictions):
    print(a, p)

Mean Average Precision: 0.59
Observed vs. Recommendations:
934387 934387
503329 156309
156322 156322
156309 156329
155230 155230
723245 148407
156313 148695
503480 148680
1029216 152459
872213 148789


In [22]:
purchased_categories = customer_purchases[['cat3']].drop_duplicates()
purchased_categories = set(purchased_categories['cat3'].tolist())

In [26]:
purchase_data = raw_transactions[['global_product_id', 'cat1', 'cat2', 'cat3']].dropna()
new_product_categories_df = (purchase_data.loc[purchase_data['global_product_id'].isin(new_products)]
.drop_duplicates()
.sort_values(by=['global_product_id']))
new_product_categories = set(new_product_categories_df['cat3'].tolist())

In [27]:
new_product_categories.difference(purchased_categories)

{'avocado', 'cucumber'}