In [1]:
import pandas as pd
import numpy as np
import surprise
from surprise.model_selection import GridSearchCV, KFold
import pickle

def _product_name(category, product_id):
    """Build the generic product name ``<CATEGORY>_<digits of product_id>``.

    The category's whitespace-separated words are upper-cased and joined
    without a separator; the numeric part is every digit character of
    ``product_id`` read as a single integer (so leading zeros are dropped).

    Raises:
        ValueError: if ``product_id`` contains no digit characters.
    """
    category_abbreviation = ''.join(word.upper() for word in category.split())
    digits = ''.join(filter(str.isdigit, product_id))
    if not digits:
        # int('') would fail with a message that does not name the bad ID.
        raise ValueError(
            f'product_id {product_id!r} contains no digits; '
            'cannot derive a numeric index for the product name'
        )
    # NOTE(review): distinct product IDs whose digit characters coincide
    # (e.g. 'a1' and 'b1') map to the same name within a category —
    # confirm this collapsing is acceptable.
    return f'{category_abbreviation}_{int(digits)}'


# Function to preprocess data and build the recommendation model
def preprocess_and_build_model(df, model_path='recommendation_model.pkl',
                               id_mapping_path='id_mapping.pkl'):
    """Train an SVD recommender on review scores and pickle the results.

    Steps: derive a generic product name per row, map each distinct
    ``customer_id`` to a sequential number starting at 1, grid-search
    ``surprise.SVD`` hyper-parameters with 5-fold cross-validation, refit the
    best-RMSE estimator on the full data, then pickle the fitted model and the
    customer-ID mapping.

    The input frame is not modified; all work happens on a copy of the four
    columns that are read.

    Args:
        df: DataFrame with the columns ``customer_id``, ``product_id``,
            ``product_category_name_english`` and ``review_score``.
        model_path: Where the fitted model is pickled.
        id_mapping_path: Where the ``{original customer_id: number}`` dict is
            pickled.

    Raises:
        KeyError: if a required column is missing from ``df``.
        ValueError: if ``df`` has no rows, or a ``product_id`` has no digits.
    """
    # Copy only the columns that are used so the caller's frame stays intact,
    # even when a later step raises.
    ratings = df[['customer_id', 'product_id',
                  'product_category_name_english', 'review_score']].copy()
    if ratings.empty:
        raise ValueError('df contains no rows; cannot train a recommendation model')

    # Convert product IDs to product names (IDs are stringified first so that
    # numeric IDs are handled the same way as textual ones).
    ratings['product_name'] = [
        _product_name(category, product_id)
        for category, product_id in zip(ratings['product_category_name_english'],
                                        ratings['product_id'].astype(str))
    ]

    # The rating scale is taken from the observed scores.
    lowest_rating = ratings['review_score'].min()
    highest_rating = ratings['review_score'].max()

    # Map unique customer IDs to sequential numbers, in order of first appearance.
    id_mapping = {
        customer: number
        for number, customer in enumerate(ratings['customer_id'].unique(), start=1)
    }
    ratings['customer_id'] = ratings['customer_id'].map(id_mapping)

    # Define the Reader and load the (user, item, rating) triples.
    reader = surprise.Reader(rating_scale=(lowest_rating, highest_rating))
    data = surprise.Dataset.load_from_df(
        ratings[['customer_id', 'product_name', 'review_score']], reader)

    # Grid search for best parameters: 3 x 3 x 2 combinations, 5 folds each.
    # NOTE(review): the lr_all grid goes up to 1.0, which looks very large
    # for a learning rate — confirm the range.
    param_grid = {'lr_all': np.linspace(0.001, 1, 3), 'reg_all': np.linspace(0.01, 0.8, 3),
                  'n_factors': [40, 30]}
    kfold = KFold(random_state=23, n_splits=5, shuffle=True)
    gs = GridSearchCV(surprise.SVD, param_grid, joblib_verbose=3, measures=['rmse', 'mae'], cv=kfold, n_jobs=-1)
    gs.fit(data)

    # Refit the best-RMSE estimator on the full data set.
    algo = gs.best_estimator['rmse']
    algo.fit(data.build_full_trainset())

    # Dumping the model using pickle
    with open(model_path, 'wb') as model_file:
        pickle.dump(algo, model_file)

    # Dumping id_mapping for later use
    with open(id_mapping_path, 'wb') as id_mapping_file:
        pickle.dump(id_mapping, id_mapping_file)

if __name__ == "__main__":
    # Script entry point: read the joined dataset from a fixed local path,
    # then train the recommender and write its pickles.
    file_path = r'C:\Users\prati\OneDrive\Desktop\Project\joined_data_set.csv'
    df = pd.read_csv(filepath_or_buffer=file_path)
    preprocess_and_build_model(df=df)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   35.1s finished
