In [2]:
!pip install scikit-surprise



In [3]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
data = pd.read_csv("preprocessed_review.csv")

In [5]:
# Surprise 라이브러리용 Reader 객체 생성
reader = Reader(rating_scale=(0, 100))

In [6]:
# Surprise 데이터셋으로 변환
surprise_data = Dataset.load_from_df(data[['직무, 지역', '회사명', '총점']], reader)

In [7]:
# 전체 데이터를 학습 데이터셋으로 사용
trainset = surprise_data.build_full_trainset()

In [8]:
# SVD 모델 생성
model = SVD()

In [9]:
# 모델 학습
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f0739578760>

In [10]:
# 유저의 평점
ex_ratings = pd.DataFrame({'회사명': ['CJ씨푸드', 'LG전자'], '평점': ['10', '80']})

In [11]:
# 예시 유저가 평가한 기업 목록
rated_companies = ex_ratings['회사명'].tolist()

In [12]:
# 예시 유저가 평가하지 않은 기업 목록
unrated_companies = [company for company in data['회사명'].unique() if company not in rated_companies]

In [13]:
# 예시 유저의 평가하지 않은 기업에 대한 평점 예측
predictions = [(company, model.predict('직무, 지역', company).est) for company in unrated_companies]

In [14]:
# 예측 결과를 데이터프레임으로 변환
recommendations_df = pd.DataFrame(predictions, columns=['회사명', '평점'])

In [15]:
# 예측 평점이 높은 순으로 정렬하여 상위 10개 기업 추천
top_n_recommendations = recommendations_df.sort_values(by='평점', ascending=False).head(10)

In [16]:
print(top_n_recommendations)

             회사명         평점
320        스마트전자  59.416763
1799      한국전력기술  59.071446
783   연세고운미소치과의원  58.951284
61         바디텍메드  58.807774
654       에스피씨삼립  58.784468
1661      한국남동발전  58.761900
1678      한국동서발전  58.541611
124       부산교통공사  58.482237
834          오존텍  58.390737
1694    한국무역보험공사  58.387609


In [136]:
import sagemaker
import boto3
import os
import io
import numpy as np
from sagemaker import Session
import sagemaker.amazon.common as smac
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import lil_matrix

In [137]:
# Set up SageMaker session and S3 bucket
sagemaker_session = sagemaker.Session()
bucket = 'sagemaker-ml1-job'
prefix = 'model_based_CF'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [138]:
# Get the IAM role
role = sagemaker.get_execution_role()
print(role)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
arn:aws:iam::629515838455:role/service-role/AmazonSageMaker-ExecutionRole-20231207T141059


In [139]:
# Determine the number of customers, products, and feature dimensions
nb_customer = data['직무_지역_idx'].max() + 1
nb_product = data['회사명_idx'].max() + 1
feature_dim = nb_customer + nb_products
print(nb_customer, nb_products, feature_dim)

293 2002 2295


In [140]:
# Extract relevant columns from the data
product_df = data[['직무_지역_idx', '회사명_idx', '총점']]
product_df.head()

Unnamed: 0,직무_지역_idx,회사명_idx,총점
0,50,0,60.0
1,117,0,70.0
2,133,0,40.0
3,136,1,40.0
4,190,1,40.0


In [141]:
# Function to convert the dataframe to a sparse matrix
def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):
    # Convert dataframe to array
    df_val = df.values

    # Determine feature size
    nb_cols = nb_customer + nb_products
    print("# of rows = {}".format(str(nb_rows)))
    print("# of cols = {}".format(str(nb_cols)))

    # Extract customers and ratings
    df_X = df_val[:, 0:2]
    # Features are one-hot encoded in a sparse matrix
    X = lil_matrix((nb_rows, nb_cols)).astype('float32')
    df_X[:, 1] = nb_customer + df_X[:, 1]
    coords = df_X[:, 0:2]
    X[np.arange(nb_rows), coords[:, 0]] = 1
    X[np.arange(nb_rows), coords[:, 1]] = 1

    # Create label with ratings
    Y = df_val[:, 2].astype('float32')

    # Validate size and shape
    print(X.shape)
    print(Y.shape)
    assert X.shape == (nb_rows, nb_cols)
    assert Y.shape == (nb_rows, )

    return X, Y

In [142]:
# Convert the dataframe to sparse matrix format
X, Y = convert_sparse_matrix(product_df, product_df.shape[0], nb_customer, nb_product)

# of rows = 6571
# of cols = 2295
(6571, 2295)
(6571,)


In [143]:
# Write the sparse matrix to a buffer
buf = io.BytesIO() 
smac.write_spmatrix_to_sparse_tensor(buf, X, y)
buf.seek(0) 

0

In [144]:
# Upload to S3
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'preprocessed_review.protobuf')).upload_fileobj(buf)

In [145]:
# Define S3 paths for training data and model output
s3_train_data = 's3://{}/{}/{}'.format(bucket, prefix, 'preprocessed_review.protobuf')
output_location = 's3://{}/{}/output'.format(bucket, prefix)

In [146]:
# Get the factorization machines container
from sagemaker.amazon.amazon_estimator import get_image_uri

container = get_image_uri(boto3.Session().region_name, 'factorization-machines', 'latest')
print(container)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


712309505854.dkr.ecr.ap-southeast-2.amazonaws.com/factorization-machines:1


In [147]:
# Create an Estimator for training the factorization machines model
fm_model = sagemaker.estimator.Estimator(container,
                                         role,         
                                         train_instance_count = 1,
                                         train_instance_type='ml.c4.xlarge',
                                         output_path=output_location,
                                         sagemaker_session=sagemaker_session)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [148]:
# Set hyperparameters for the factorization machines model
fm_model.set_hyperparameters(feature_dim=feature_dim,
                      predictor_type='regressor',
                      mini_batch_size=200,
                      num_factors=512,
                      bias_lr=0.02,
                      epochs=10)

In [149]:
# Train the factorization machines model
fm_model.fit({'train': s3_train_data})

INFO:sagemaker:Creating training-job with name: factorization-machines-2023-12-09-05-55-55-487


2023-12-09 05:55:55 Starting - Starting the training job...
2023-12-09 05:56:09 Starting - Preparing the instances for training......
2023-12-09 05:57:11 Downloading - Downloading input data...
2023-12-09 05:57:41 Training - Downloading the training image............
2023-12-09 05:59:47 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[12/09/2023 05:59:58 INFO 139917187471168] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init

In [150]:
# Deploying the model to perform inference 

predictor = fm_model.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: factorization-machines-2023-12-09-06-01-30-551
INFO:sagemaker:Creating endpoint-config with name factorization-machines-2023-12-09-06-01-30-551
INFO:sagemaker:Creating endpoint with name factorization-machines-2023-12-09-06-01-30-551


--------!

In [None]:
# 추론 엔드포인트 삭제
predictor.delete_endpoint()