In [1]:
#!sudo apt install -y curl unzip

# 오픈소스SW개론 과제 2

In [2]:
%conda install numpy pandas matplotlib scikit-learn scipy -y

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.9.0
  latest version: 24.5.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.5.0



## Package Plan ##

  environment location: /home/lewohy/miniconda3

  added / updated specs:
    - matplotlib
    - numpy
    - pandas
    - scikit-learn
    - scipy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    pandas-2.2.2               |  py311ha02d727_0        15.7 MB
    ------------------------------------------------------------
                                           Total:        15.7 MB

The following packages will be UPDATED:

  pandas                              2.2.1-py311ha02d727_0 --> 2.2.2-py311ha02d727_0 




## 1. 데이터 준비


### 1.1. ml-1m.zip 다운로드


In [3]:
!curl -SLJ https://files.grouplens.org/datasets/movielens/ml-1m.zip --output /tmp/ml-1m.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 5778k  100 5778k    0     0   169k      0  0:00:34  0:00:34 --:--:--  271k


### 1.2. ml-1m.zip 압축 해제


In [4]:
!unzip -o /tmp/ml-1m.zip ml-1m/ratings.dat -d /tmp

Archive:  /tmp/ml-1m.zip
  inflating: /tmp/ml-1m/ratings.dat  


In [5]:
!ls -l /tmp/ml-1m

total 24020
-rw-r----- 1 lewohy lewohy 24594131 Mar  1  2003 ratings.dat


## 2. Group Recommender System 구현

### 2.1 Clustering

In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

ratings = pd.read_csv(
    "/tmp/ml-1m/ratings.dat",
    sep="::",
    header=None,
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

user_item_matrix = (
    ratings.pivot(index="UserID", columns="MovieID", values="Rating")
    .fillna(0)
    .values
)


def rank_with_argsort(arr):
    order = arr.argsort()
    ranks = order.argsort() + 1  # 1부터 시작하도록 +1
    return ranks


#### 2.1.1. Matrix 생성


In [7]:
kmeans = KMeans(n_clusters=3, random_state=1)
user_groups = kmeans.fit_predict(user_item_matrix)

### 2.2. Aggregation

In [8]:
def aggregation(user_item_matrix, user_groups, aggregation_func):
    for group_id in set(user_groups):
        group_ratings = user_item_matrix[user_groups == group_id]
        recommendations = aggregation_func(group_ratings)
        top_10_recommendations = np.argsort(recommendations)[-10:]
        print(top_10_recommendations)

#### 2.2.1. Additive Utilitarian

In [9]:
def additive_utilitarian(group_ratings):
    return np.sum(group_ratings, axis=0)

aggregation(user_item_matrix, user_groups, additive_utilitarian)


[1178  579  575 1848 1120 2651 1108 2374  253 1106]
[1107 2374  802  287  579  593 1178 1108 1106  253]
[ 106 2203  513  579 2557 1106 1848 1120  253 2651]


#### 2.2.2. Average

In [10]:
def average(group_ratings):
    return np.mean(group_ratings, axis=0)

aggregation(user_item_matrix, user_groups, average)

[1178  579  575 1848 1120 2651 1108 2374  253 1106]
[1107 2374  802  287  579  593 1178 1108 1106  253]
[ 106 2203  513  579 2557 1106 1848 1120  253 2651]


#### 2.2.3. Simple Count

In [12]:
def simple_count(group_ratings):
    return np.count_nonzero(group_ratings, axis=0)

aggregation(user_item_matrix, user_groups, simple_count)

[1178 1108  466 1449 2651  575 1120 2374  253 1106]
[ 466  575 1108 2511 1025 1120 1449 1178  253 1106]
[2785 2203 2557  466  579 1848 1106 1120  253 2651]


#### 2.2.4. Approval Voting

In [13]:
def approval_voting(group_ratings):
    return np.sum(group_ratings >= 4, axis=0)

aggregation(user_item_matrix, user_groups, approval_voting)

[2557 1120  575  579 1848 2651 2374 1108  253 1106]
[2374  287 2651 1178  593  802  579 1108 1106  253]
[ 593 2203 1120  513 1106  579 2557 1848  253 2651]


#### 2.2.5. Borda Count

In [11]:
from scipy.stats import rankdata


def board_approval(group_ratings):
    return np.sum(
        np.apply_along_axis(
            lambda x: rankdata(x, method="average"),
            axis=1,
            arr=group_ratings,
        ),
        axis=0,
    )


aggregation(user_item_matrix, user_groups, board_approval)

[1848  466 1449 1108  575 2651 1120 2374  253 1106]
[2511  579  287 1025 1120  593 1108 1178  253 1106]
[2785  466 2203 2557  579 1848 1106 1120  253 2651]


#### 2.2.6. Copeland Rule

In [15]:
def copeland_rule(group_ratings):
    result = np.zeros(group_ratings.shape)
    matrix = np.array(group_ratings)
    
    for row in range(matrix.shape[0]):
        for column in range(matrix.shape[1]):
            result[row, column] = np.sum(matrix[row] > matrix[row, column]) - np.sum(matrix[row] < matrix[row, column])

    return result.sum(axis=0)

aggregation(user_item_matrix, user_groups, copeland_rule)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [4. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]]
[]
[[5. 5. 0. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 5.]
 [4. 3. 0. ... 0. 0. 0.]
 ...
 [3. 3. 3. ... 0. 0. 0.]
 [4. 0. 2. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[]
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[]
