In [2]:
import mlxtend
print(mlxtend.__version__)

0.23.4


In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 1. 读取数据集
# 假设你已经下载了 MovieLens 100K 数据集，并且数据文件名为 'u.data'
# 数据集格式：用户ID\t电影ID\t评分\t时间戳
data = pd.read_csv('u.csv', sep=',', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

In [2]:
data.head(10)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,
1,186,302,3,
2,22,377,1,
3,244,51,2,
4,166,346,1,
5,298,474,4,
6,115,265,2,
7,253,465,5,
8,305,451,3,
9,6,86,3,


In [3]:
# 2. 数据预处理
# 提取用户ID和电影ID，构建用户-电影的交易数据
transactions = data.groupby('user_id')['item_id'].apply(list).tolist()

In [4]:
# 3. 转换为集合
transactions = [set(transaction) for transaction in transactions]

In [16]:
transactions

[{1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  158,
  

In [5]:
# 3. 使用 TransactionEncoder 编码数据
te = TransactionEncoder()

In [6]:
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [7]:
te_ary

array([[ True,  True,  True, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False,  True, False, ..., False, False, False]])

In [8]:
df.tail()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
938,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
939,False,False,False,True,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
940,True,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
941,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
942,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# 4. 使用 Apriori 算法挖掘频繁项集，最小支持度为0.3
frequent_itemsets = apriori(df, min_support=0.3,use_colnames=True)
print("频繁项集：")
print(frequent_itemsets)

频繁项集：
      support             itemsets
0    0.479321                  (1)
1    0.415695                  (7)
2    0.317073                  (9)
3    0.310710                 (15)
4    0.314952                 (22)
..        ...                  ...
121  0.357370       (50, 181, 174)
122  0.306469       (50, 181, 222)
123  0.302227       (50, 258, 181)
124  0.312831      (172, 181, 174)
125  0.311771  (50, 172, 181, 174)

[126 rows x 2 columns]


In [11]:
# 5. 计算关联规则，最小置信度为0.7
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)

In [14]:
filtered_rules=rules[ rules['lift'] >2.3]

In [16]:
print("\n关联规则：")
print(filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


关联规则：
    antecedents consequents   support  confidence      lift
115   (50, 172)  (181, 174)  0.311771    0.852174  2.349708
116   (50, 174)  (172, 181)  0.311771    0.773684  2.301527
117  (172, 181)   (50, 174)  0.311771    0.927445  2.301527
119  (181, 174)   (50, 172)  0.311771    0.859649  2.349708


In [None]:
# 加载u.item文件
u_item = pd.read_csv('u_item.csv', header=None, names=['movie_id', 'movie_title'],usecols=[0,1], encoding='latin-1')
movie_names = u_item.set_index('movie_id')['movie_title'].to_dict()

In [21]:
def replace_movie_ids_with_names(rule):
    antecedents = rule['antecedents']
    consequents = rule['consequents']
    antecedents_names = {movie_names[movie_id] for movie_id in antecedents}
    consequents_names = {movie_names[movie_id] for movie_id in consequents}
    return antecedents_names, consequents_names

# 应用替换函数
rules['antecedents_names'], rules['consequents_names'] = zip(*rules.apply(replace_movie_ids_with_names, axis=1))

# 查看结果
print(rules[['antecedents_names', 'consequents_names', 'support', 'confidence', 'lift']])

                                     antecedents_names  \
115            {Empire Strikes Back, Star Wars (1977)}   
116  {Raiders of the Lost Ark (1981), Star Wars (19...   
117   {Empire Strikes Back, Return of the Jedi (1983)}   
119  {Return of the Jedi (1983), Raiders of the Los...   

                                     consequents_names   support  confidence  \
115  {Return of the Jedi (1983), Raiders of the Los...  0.311771    0.852174   
116   {Empire Strikes Back, Return of the Jedi (1983)}  0.311771    0.773684   
117  {Raiders of the Lost Ark (1981), Star Wars (19...  0.311771    0.927445   
119            {Empire Strikes Back, Star Wars (1977)}  0.311771    0.859649   

         lift  
115  2.349708  
116  2.301527  
117  2.301527  
119  2.349708  
