In [1]:
# !pip install pandas pyarrow


In [2]:


# Import necessary libraries
import pandas as pd
import numpy as np
import joblib  # For saving the model




In [3]:
# Load Transactions, Reduce Memory
train = pd.read_csv(r'C:\Users\Shweta\recommendation-system\venv\transactions_train.csv')
train['customer_id'] = train['customer_id'].str[-16:].apply(int, base=16)
train['article_id'] = train['article_id'].astype('int32')
train['t_dat'] = pd.to_datetime(train['t_dat'])
train = train[['t_dat', 'customer_id', 'article_id']]
train.to_parquet('train.pqt', index=False)
print(train.shape)
train.head()

(31788324, 3)


Unnamed: 0,t_dat,customer_id,article_id
0,2018-09-20,18439897732908966680,663713001
1,2018-09-20,18439897732908966680,541518023
2,2018-09-20,10112112306570742978,505221004
3,2018-09-20,10112112306570742978,685687003
4,2018-09-20,10112112306570742978,685687004


In [4]:
# Find Each Customer's Last Week of Purchases
tmp = train.groupby('customer_id')['t_dat'].idxmax()
last_purchases = train.loc[tmp].reset_index(drop=True)
last_purchases = last_purchases[['customer_id', 'article_id']]
last_purchases.columns = ['customer_id', 'last_article_id']

In [5]:
# Load Pairs and Recommend Items Purchased Together
last_purchases = last_purchases.copy()
pairs = np.load(r'C:\Users\Shweta\recommendation-system\venv\pairs_cudf.npy', allow_pickle=True).item()
last_purchases['recommended_article_id'] = last_purchases['last_article_id'].map(pairs)

In [6]:
# Keep only non-null recommendations
recommendations = last_purchases[['customer_id', 'recommended_article_id']].dropna()
recommendations = recommendations.drop_duplicates(['customer_id', 'recommended_article_id'])
recommendations = recommendations.rename({'recommended_article_id': 'article_id'}, axis=1)

In [7]:
# Convert article_id to string
recommendations['article_id'] = ' 0' + recommendations['article_id'].astype('str')

# Perform the groupby operation
grouped = recommendations.groupby('customer_id').agg({'article_id': lambda x: ' '.join(x)}).reset_index()

# Select necessary columns
grouped = grouped[['customer_id', 'article_id']]
grouped.columns = ['customer_id', 'prediction']

In [9]:
# Recommend Last Week's Most Popular Items
train = pd.read_parquet('train.pqt')
train['t_dat'] = pd.to_datetime(train['t_dat'])
train = train.loc[train['t_dat'] >= pd.to_datetime('2020-09-16')]
top10_list = [' 0' + str(article_id) for article_id in train['article_id'].value_counts().index[:10]]
top10_str = ' '.join(top10_list)
print("Last week's top 10 popular items:")
print(top10_str)

Last week's top 10 popular items:
 0924243001  0924243002  0918522001  0923758001  0866731001  0909370001  0751471001  0915529003  0915529005  0448509014


In [10]:
# Ensure unique recommendations for each customer
def unique_recommendations(row):
    # Combine customer-specific recommendations with top 10 popular items
    recs = set(row['prediction'].split())
    popular_recs = set(top10_str.split())
    combined_recs = list(recs.union(popular_recs))[:10]
    return ' '.join(combined_recs)

grouped['prediction'] = grouped.apply(unique_recommendations, axis=1)

In [11]:
# Write Submission CSV
sub = pd.read_csv(r'C:\Users\Shweta\recommendation-system\venv\sample_submission.csv')
sub = sub[['customer_id']]
sub['customer_id_2'] = sub['customer_id'].str[-16:].apply(int, base=16)
sub = sub.merge(grouped.rename({'customer_id': 'customer_id_2'}, axis=1), on='customer_id_2', how='left').fillna('')
del sub['customer_id_2']
sub['prediction'] = sub['prediction'].str.strip()
sub['prediction'] = sub['prediction'].str[:131]
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243002 0924243001 0751471001 0923758001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243002 0924243001 0706016001.0 0751471001 ...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243002 0924243001 0751471001 0923758001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243002 0924243001 0751471001 0923758001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243002 0924243001 0751471001 0448509014 09...


In [12]:

# Save the model (recommendations DataFrame)
joblib.dump(grouped, 'recommendations_model.pkl')
print("Model saved as recommendations_model.pkl")


Model saved as recommendations_model.pkl
