In [31]:
import pandas as pd
import numpy as np 
import math

In [32]:
train_data = pd.read_csv("datasets/ml-100k/u1.base",sep='\t',names=['uid','iid','rating'],usecols=[0,1,2],header=None)

In [33]:
user_set = set(train_data['uid'])
item_set = set(train_data['iid'])

In [35]:
len(user_set)

943

In [10]:
rating = train_data.values

In [11]:
train_data.values

array([[   1,    1,    5],
       [   1,    2,    3],
       [   1,    3,    4],
       ...,
       [ 943, 1188,    3],
       [ 943, 1228,    3],
       [ 943, 1330,    3]], dtype=int64)

In [4]:
# PMF 模型
class PMF:
    # PMF 模型初始化，已经设置默认参数
    def __init__(self, user_set, item_set, record_list, dimensions=20, learning_rate=0.01, alpha_user=0.1, alpha_item=0.1):
        # 创建PMF时，表示用户id的set集合。调用vector_initialize函数后，表示用户的特征矩阵 {用户id：用户特征向量，...}
        self.users = user_set
        # 同上
        self.items = item_set
        # 训练集中的记录列表
        self.records = record_list
        # 用户和物品的特征维度，默认为20
        self.dimensions = dimensions
        # 学习率，默认为0.01
        self.learning_rate = learning_rate
        # 用户正则化的超参数，默认为0.1
        self.alpha_user = alpha_user
        # 物品正则化的超参数，默认为0.1
        self.alpha_item = alpha_item
        # 训练过程中的损失
        self.loss = 0

    # 初始化用户特征和物品特征
    def vector_initialize(self):
        # 用户和物品的特征使用字典来保存，Key是ID，Value是相应的特征向量
        users_dict = {}
        items_dict = {}
        # 用户特征初始化
        for user in self.users:
            # 生成维度20，服从0~1的均匀分布的向量
            user_vector = np.random.rand(self.dimensions)
            # 保存此用户的特征向量
            users_dict[user] = (user_vector - 0.5) * 0.01
        # 物品特征初始化
        for item in self.items:
            item_vector = np.random.rand(self.dimensions)
            items_dict[item] = (item_vector - 0.5) * 0.01
        # 更新模型的两个属性
        self.users = users_dict
        self.items = items_dict

    # 使用随机梯度下降方法训练用户和物品的特征
    def train(self, epochs):
        # 迭代次数
        for epoch in range(epochs):
            # 每次迭代开始，将模型的属性loss置0
            self.loss = 0
            # 遍历评分记录
            for record in self.records:
                # 该记录的用户特征向量
                user = self.users[record[0]]
                # 该记录的物品特征向量
                item = self.items[record[1]]
                # 该记录的用户对物品的评分
                rating = int(record[2])
                # 计算损失
                error = self.loss_function(user, item, rating)
                # 损失累加
                self.loss += error
                # 计算该用户特征向量的梯度
                grad_user = -(rating - np.dot(user, item)) * item + self.alpha_user * user
                # 计算该物品特征向量的梯度
                grad_item = -(rating - np.dot(user, item)) * user + self.alpha_item * item
                # 根据梯度对特征向量进行更新
                self.users[record[0]] -= self.learning_rate * grad_user
                self.items[record[1]] -= self.learning_rate * grad_item
            # 每迭代完一次，学习率降低
            self.learning_rate = self.learning_rate * 0.9
            # 打印每次迭代的损失
            print("epoch: ", epoch, "loss:", self.loss)

        # 训练完之后，将用户特征向量进行保存
        with codecs.open("pureResult/user_vector", "w") as f1:
            for u in self.users.keys():
                f1.write(str(u) + "\t")
                f1.write(str(list(self.users[u])))
                f1.write("\n")
        # 将物品特征向量进行保存
        with codecs.open("pureResult/item_vector", "w") as f2:
            for i in self.items.keys():
                f2.write(str(i) + "\t")
                f2.write(str(list(self.items[i])))
                f2.write("\n")

    # 损失函数定义
    def loss_function(self, user, item, rating):
        return 0.5 * math.pow((rating - np.dot(user, item)), 2) + \
               0.5 * self.alpha_user * math.pow(np.linalg.norm(user, ord=2), 2) + \
               0.5 * self.alpha_item * math.pow(np.linalg.norm(item, ord=2), 2)

In [22]:
model = PMF(user_set, item_set, rating)
model.vector_initialize()
model.train(5)

  grad_user = -(rating - np.dot(user, item)) * item + self.alpha_user * user


OverflowError: math range error

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
 
# 假设有一个简单的模型和数据集
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc = nn.Linear(10, 1)
 
    def forward(self, x):
        x = self.fc(x)
        return x
 
# 假设的数据集
x = torch.randn(100, 10)
y = torch.randn(100, 1)
 
# 准备数据加载器
dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(dataset, batch_size=10, shuffle=True, num_workers=2)
 
# 初始化模型和优化器
model = Model()
optimizer = optim.SGD(model.parameters(), lr=0.1)
 
# 训练循环
for epoch in range(100):
    for i, (batch_x, batch_y) in enumerate(loader):
        # 前向传播
        y_pred = model(batch_x)
        loss = ((y_pred - batch_y) ** 2).sum()
 
        # 清零梯度
        optimizer.zero_grad()
        # 反向传播
        loss.backward()
        # 更新权重
        optimizer.step()
 
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')

Epoch 1, Loss: 123223056.0000
Epoch 2, Loss: 84386795877302272.0000
Epoch 3, Loss: 751560777873182410407936.0000
Epoch 4, Loss: 22858458857784059917903304065024.0000



KeyboardInterrupt



In [59]:
import codecs
import json

In [114]:
file = codecs.open("pureResult/item_vector.json", "r", "utf-8")

In [115]:
content = file.read()

In [64]:
content

'{"1": [-0.0015766867092916192, 0.0022490202577129833, -0.005635656223982155, -0.0021297452464476754, 0.0052499516527183725, -0.0012690718541510776, -0.0004861801339663105, -0.003415016628343603, -0.006181622769586907, -0.005831316528517045, -0.0003982532449330293, 0.006789915371939293, -0.0020676558175048753, -0.0062923402659228085, -0.00015053517191359564, -0.0014717387899709122, -1.454157878944295e-05, -0.0037212496109492554, -0.000498909742940636, 0.0048351191010936306], "2": [-0.0032276809361565352, 0.00046938799974652193, -0.00047705099034122425, -0.0028287713890828266, 0.00046907533248067255, -0.001437351548147324, -0.001189361193114463, 0.003866049537689445, -0.00354513463702315, -0.0008016689322081032, -0.005279424491405487, -0.001081257019429318, -0.0013227199390384017, -0.0023739316613983256, -0.004021889305334145, -0.0010284929161872517, -0.0051869275620897385, -0.0039268724902097535, 0.003844646645665846, 0.004858101145559624], "3": [-0.0008236067879048499, 0.0038578403298

In [116]:
item_vector = json.loads(content)

In [117]:
file = codecs.open("pureResult/user_vector.json", "r", "utf-8")

In [118]:
content = file.read()
user_vector = json.loads(content)

In [70]:
user_vector

{'1': [-0.03758313302681846,
  -0.918107736221355,
  0.43167768361661035,
  0.16923769151421858,
  0.7404123150312156,
  0.4385038919933102,
  -0.3143345998011313,
  -0.3334081069338417,
  -0.4295949207383352,
  -0.25567931246781306,
  0.03702828132984134,
  0.5906341894853022,
  -0.0458162466468148,
  0.769752743436707,
  -0.03405384183884127,
  0.16848745166012088,
  -0.8867533170072945,
  0.24017462843838572,
  0.26878334161046885,
  0.28776032460486994],
 '2': [-0.02524723477096431,
  -0.9134549839699375,
  0.4420921289841642,
  0.1584532343897484,
  0.7325760570074072,
  0.4468533876082581,
  -0.3134533456340006,
  -0.33348625807025495,
  -0.41572556670350275,
  -0.2645510165913871,
  0.04006181115097193,
  0.5765317360013944,
  -0.04671259767554464,
  0.7808761542985869,
  -0.027590514048125035,
  0.1678431103741691,
  -0.8992623505315832,
  0.24528067606844584,
  0.2909348905136625,
  0.300306191579673],
 '3': [-0.0026710102255696104,
  -0.7970180229697433,
  0.38174020342777415

### 测试

In [85]:
import numpy as np

In [198]:
train_data = pd.read_csv("datasets/ml-100k/u1.base",sep='\t',names=['uid','iid','rating'],usecols=[0,1,2],header=None)

In [199]:
se = trainset.groupby('userId').agg([list])[["movieId", "rating"]]

In [217]:
se

Unnamed: 0_level_0,movieId,rating
Unnamed: 0_level_1,list,list
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 2, 3, 4, 5, 7, 8, 9, 11, 13, 15, 16, 18, 1...","[5, 3, 4, 3, 3, 4, 1, 5, 2, 5, 5, 5, 4, 5, 1, ..."
2,"[1, 10, 14, 25, 100, 111, 127, 237, 242, 255, ...","[4, 2, 4, 4, 5, 4, 5, 4, 5, 4, 3, 4, 5, 4, 3, ..."
3,"[181, 258, 260, 268, 271, 288, 302, 303, 317, ...","[4, 2, 4, 3, 3, 2, 2, 3, 2, 2, 5, 5, 3, 1, 2, ..."
4,"[11, 210, 258, 271, 300, 301, 324, 327, 328, 3...","[4, 3, 5, 4, 5, 5, 5, 5, 3, 5, 2, 5, 5, 5]"
5,"[21, 25, 29, 50, 63, 66, 70, 95, 99, 101, 105,...","[3, 3, 4, 4, 1, 1, 4, 4, 3, 5, 3, 4, 4, 1, 3, ..."
...,...,...
939,"[9, 15, 106, 118, 121, 127, 220, 222, 237, 252...","[5, 5, 3, 5, 5, 5, 5, 5, 5, 3, 3, 5, 5, 4, 2, ..."
940,"[4, 7, 8, 9, 12, 14, 47, 50, 56, 66, 69, 70, 8...","[2, 4, 5, 3, 4, 3, 3, 4, 5, 4, 2, 3, 4, 4, 5, ..."
941,"[1, 7, 15, 117, 124, 147, 181, 222, 257, 258, ...","[5, 4, 4, 5, 5, 4, 5, 2, 4, 4, 3, 4, 5, 4, 2, ..."
942,"[31, 50, 71, 79, 95, 97, 99, 117, 124, 131, 13...","[5, 5, 5, 5, 5, 5, 5, 4, 4, 5, 3, 5, 5, 3, 5, ..."


In [220]:
import torch.optim as optim

In [223]:
scheduler_lr = optim.lr_scheduler.StepLR(None, step_size=50, gamma=0.1)

TypeError: NoneType is not an Optimizer

In [189]:
test_data.values

array([[  1,   6,   5],
       [  1,  10,   3],
       [  1,  12,   5],
       ...,
       [459, 934,   3],
       [460,  10,   3],
       [462, 682,   5]], dtype=int64)

In [74]:
predicts = []
for i,row in test_data.iterrows():
    rating = row['rating']
    uid = row['uid']
    iid = row['iid']
    predict_value = user_vector.get('uid') 

5
3
5
5
3
4
4
3
2
3
4
2
4
5
4
3
4
3
3
4
5
4
3
5
4
3
3
3
4
3
1
4
1
4
5
5
4
3
5
4
5
3
5
3
4
5
2
1
1
4
5
1
5
5
3
3
1
4
3
4
5
3
4
4
1
1
2
2
5
4
5
2
4
3
4
4
4
3
5
5
5
5
5
3
5
4
4
4
3
3
5
4
5
3
3
5
4
5
4
4
4
2
4
3
3
1
5
4
5
2
3
4
5
4
4
3
2
5
4
4
5
1
4
4
2
5
1
2
5
1
1
3
2
4
1
4
3
4
3
5
5
4
4
3
3
3
4
4
3
4
4
4
3
3
3
5
1
1
5
1
2
2
2
3
2
3
4
2
2
4
5
2
4
1
3
1
1
1
3
3
4
3
3
3
3
5
4
3
4
5
5
5
3
4
5
4
3
4
4
4
5
4
1
3
2
5
3
3
3
5
3
5
1
3
3
3
5
3
2
4
3
3
5
4
3
3
4
2
4
3
2
2
1
1
1
4
3
1
1
1
3
2
1
3
5
3
4
2
1
4
2
2
2
1
1
3
3
1
1
3
3
3
1
4
4
1
3
5
3
5
4
5
1
1
2
3
1
1
1
5
3
4
4
2
4
4
5
3
3
4
3
4
2
5
5
2
5
3
5
4
5
5
5
2
3
3
3
4
4
4
4
3
3
3
4
5
4
3
3
4
4
5
4
4
5
3
2
4
4
2
2
2
2
3
2
4
4
4
3
4
1
2
4
4
4
5
3
2
5
1
1
4
5
4
5
5
5
4
5
5
5
4
4
4
4
3
4
5
4
4
4
3
4
5
3
5
3
4
3
4
5
5
4
5
5
4
5
5
3
5
3
4
5
3
5
3
5
5
5
4
2
5
3
5
5
5
5
3
3
4
5
5
5
3
5
4
3
5
3
4
5
3
4
3
4
4
5
5
5
5
5
5
5
5
5
5
4
5
4
1
5
5
3
4
3
4
3
1
4
4
3
3
3
3
5
4
4
5
5
5
4
5
4
4
4
4
4
5
4
5
5
5
3
4
4
5
3
4
5
4
4
4
2
4
4
5
3
4
5
5
3
5
5
5
5
4
5
5
5
3


5
4
4
5
3
4
4
4
4
2
4
1
3
5
3
2
3
4
3
3
2
4
4
4
3
1
4
2
2
5
4
3
5
4
4
3
2
2
4
4
2
2
4
4
4
3
3
3
3
3
4
3
4
5
2
1
5
5
4
3
3
1
3
2
4
5
4
5
4
5
4
2
4
1
4
4
3
3
3
3
4
4
4
3
4
5
5
3
5
4
3
5
4
5
3
4
3
4
4
3
3
3
5
2
5
4
4
2
4
4
5
3
3
1
5
4
3
2
3
3
4
5
4
3
4
3
3
4
2
4
4
4
2
4
3
1
4
1
3
5
4
4
4
1
5
4
5
5
2
2
4
2
2
3
2
3
4
3
4
5
1
2
3
4
1
1
2
3
1
4
3
1
2
1
5
5
5
5
5
4
4
5
4
5
4
4
5
3
4
5
3
4
2
4
5
5
4
2
5
4
5
5
5
5
5
5
5
5
5
4
5
4
4
1
3
5
5
5
5
4
4
5
4
4
3
5
3
5
5
5
4
3
5
3
1
2
3
5
3
3
3
3
5
4
5
5
5
5
3
4
4
5
3
1
2
3
2
5
5
4
4
4
4
4
5
3
3
3
5
3
3
5
1
4
2
3
4
3
4
4
2
4
4
4
4
3
2
5
3
3
3
4
4
4
4
4
2
3
1
3
5
4
2
3
2
1
3
3
3
3
5
1
4
4
2
3
2
3
2
4
5
5
1
3
3
3
3
4
3
2
4
3
4
4
4
3
3
3
2
4
4
2
3
4
1
2
3
5
3
2
3
4
3
2
4
3
3
3
3
3
3
2
2
2
2
2
2
1
3
2
4
3
3
1
3
2
3
4
2
3
4
2
4
3
3
3
3
2
2
3
2
3
3
2
4
2
4
4
4
4
2
3
3
3
3
2
2
3
3
3
3
4
3
3
3
3
3
2
3
2
3
2
3
2
3
3
3
1
2
3
3
4
1
3
3
2
3
2
2
2
3
2
4
3
3
3
2
3
3
3
3
3
3
2
3
2
2
3
2
2
3
2
3
2
2
2
2
2
1
2
1
2
4
5
3
4
4
3
5
4
4
3
3
3
4
5
3
3
4
2
3
3
5
2
2
2
3
5
3
4


3
3
4
1
4
5
4
3
3
4
4
2
3
3
3
1
3
4
4
3
4
5
1
3
1
3
5
4
4
1
4
3
4
2
3
5
1
5
1
2
1
4
1
1
3
4
1
5
4
4
5
4
3
2
1
1
2
2
3
3
5
4
4
4
3
3
5
3
4
3
3
3
5
3
4
3
4
3
1
1
5
5
5
5
4
1
4
1
3
3
4
5
5
4
4
4
4
4
1
3
3
4
4
3
4
5
4
4
4
5
4
4
3
4
4
5
2
1
3
5
3
3
5
4
4
4
5
4
1
3
5
5
5
4
4
4
4
3
4
2
3
4
3
3
4
3
5
3
4
3
2
4
4
3
4
3
4
3
4
4
3
3
2
3
3
4
3
3
5
1
5
5
3
5
3
3
3
1
3
5
3
4
5
3
4
4
1
4
3
4
3
2
3
5
4
2
3
4
2
5
5
5
2
5
2
3
4
4
4
3
1
4
4
5
5
4
5
3
4
5
4
4
3
5
4
2
4
2
5
2
1
4
2
2
1
4
2
5
2
3
3
4
3
4
5
3
4
3
4
1
4
1
3
1
4
1
4
2
3
3
3
3
2
1
3
4
4
2
3
4
1
3
2
4
2
3
2
2
5
1
4
3
3
1
3
1
4
3
2
1
3
3
4
5
5
4
4
3
2
3
3
2
4
3
2
2
2
2
3
3
2
3
4
3
4
1
3
3
2
3
2
1
3
4
4
3
1
4
1
2
1
1
2
4
2
3
4
1
4
4
2
3
5
1
5
4
2
2
4
5
3
5
1
2
1
4
1
4
4
4
1
4
5
3
4
1
4
2
3
4
1
3
1
3
4
3
4
1
3
3
3
1
4
2
1
5
1
2
4
2
3
1
3
4
1
5
3
3
3
4
4
4
3
1
3
1
4
4
4
4
1
4
3
2
3
3
3
4
3
4
4
3
5
2
3
1
3
4
4
4
1
3
3
3
1
1
3
2
4
1
5
3
4
3
2
4
2
3
4
3
1
4
2
3
3
3
2
2
5
3
2
3
1
3
2
1
2
3
3
5
4
5
2
5
4
5
2
5
5
4
4
4
5
5
4
4
4
5
3
4
5
4
3
5
4
5
4
2
4
3


4
4
4
5
4
3
4
4
2
3
2
2
2
4
2
2
2
3
2
5
3
3
4
3
1
5
3
4
2
4
2
4
5
4
5
5
4
3
5
3
5
4
2
2
4
2
5
5
3
3
1
4
5
3
3
4
5
4
3
5
5
5
5
5
5
5
5
4
5
4
5
5
5
5
5
4
4
5
4
3
4
4
4
5
4
5
3
2
4
3
4
3
5
5
3
4
3
5
4
3
4
1
4
5
1
1
2
1
3
2
2
3
4
1
2
2
3
1
4
4
1
1
4
4
4
4
3
4
4
3
4
4
5
4
4
3
5
2
4
3
4
3
4
5
4
5
4
3
2
3
2
4
4
1
3
2
4
4
5
5
1
4
3
3
5
2
2
3
4
2
5
4
2
2
1
4
3
4
1
3
5
2
3
3
3
2
2
3
3
3
4
2
2
2
3
5
2
4
3
3
4
3
2
4
2
2
1
2
4
4
3
2
2
4
4
3
1
3
2
4
1
2
3
4
3
3
1
5
1
4
4
3
5
3
4
3
2
3
3
3
4
1
4
1
5
2
1
4
3
1
3
3
3
1
4
3
2
3
3
5
4
4
4
5
3
4
4
4
4
4
2
5
3
2
3
4
2
5
5
3
3
2
5
2
3
3
1
3
3
2
4
3
3
4
4
4
3
3
3
5
1
2
3
4
4
5
3
4
2
5
4
4
5
5
5
3
4
3
3
5
4
4
4
3
3
4
3
5
5
5
2
4
1
4
4
3
4
4
5
3
4
5
5
4
5
5
4
4
5
4
3
4
3
4
4
3
4
5
5
4
5
4
3
3
3
5
3
4
5
5
5
5
4
4
3
5
4
3
3
4
5
5
3
5
3
5
3
2
3
5
5
3
4
4
4
2
4
5
5
3
5
4
5
3
4
1
4
4
4
4
2
4
4
5
4
5
4
3
4
4
2
3
3
4
3
4
4
4
4
4
5
4
4
5
4
4
3
5
4
3
4
4
4
5
3
3
3
4
4
3
4
4
3
3
4
3
4
5
5
4
4
4
4
5
4
5
3
3
4
4
3
3
3
3
3
3
5
2
3
3
4
3
3
5
1
3
4
2
3
4
4
4
3
4
4
5
4
4
3
4


3
3
2
4
1
3
2
3
3
3
3
2
4
5
3
3
3
3
2
2
5
3
3
3
2
4
3
2
3
4
3
3
4
5
3
3
3
2
2
2
3
3
1
3
3
3
3
3
3
4
3
3
4
2
3
1
4
4
4
4
3
1
3
5
3
5
4
3
5
4
1
2
3
1
2
1
3
5
4
1
4
4
3
2
4
3
4
4
3
4
3
3
3
4
3
5
4
4
5
5
4
3
4
4
5
5
3
4
5
4
4
4
5
2
2
3
4
2
4
4
1
4
1
3
2
1
3
4
1
3
5
5
1
1
5
3
5
1
2
5
1
5
4
2
1
1
2
4
5
1
2
3
1
3
1
1
1
1
1
1
3
1
1
1
1
1
5
1
4
1
1
2
1
4
1
1
5
1
1
1
1
2
1
5
1
1
1
3
1
1
1
1
1
1
1
1
1
1
5
5
1
2
1
1
1
1
2
1
1
3
1
1
1
1
3
1
2
1
3
1
1
1
1
1
1
3
2
1
1
1
1
1
1
1
3
1
1
1
3
1
3
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
3
4
1
1
1
1
1
1
1
1
1
4
4
3
3
5
5
4
3
2
5
5
5
5
3
4
5
5
1
2
2
3
4
5
3
4
2
1
4
5
4
4
5
4
4
5
4
3
4
5
3
5
3
4
4
3
5
4
5
3
3
5
5
3
3
2
4
5
4
4
4
3
3
4
4
4
3
3
3
4
3
4
3
3
4
2
2
3
5
4
4
4
4
4
4
4
4
3
5
2
3
4
3
4
3
2
4
3
3
2
4
4
3
3
3
4
3
3
3
3
5
4
3
2
2
5
5
4
3
3
4
5
4
5
4
3
3
4
2
1
3
5
4
4
2
1
4
2
2
5
5
2
3
4
1
2
4
3
5
4
1
4
2
2
4
4
4
2
1
4
3
2
2
3
5
5
3
4
4
4
4
3
3
4
3
4
5
4
4
3
3
5
4
4
4
5
3
5
4
3
1
4
5
3
4
4
5
3
2
4
1
5
5
2
3
1
4
5
5
5
5
4
4
3
5
5
5
4
5
5
5
4
4
4
5
4
4
2


In [119]:
np.dot(user_vector.get('1'),item_vector.get('6'))

-2.66050302554499e-05

In [126]:
trainset = pd.read_csv('datasets/ml-100k/u1.base', sep='\t', names=["userId", "movieId", "rating"], usecols=range(3))

In [165]:
for j in trainset.groupby('userId').agg([list])[["movieId", "rating"]]:
    print(j)
    

('movieId', 'list')
('rating', 'list')


In [191]:
se = trainset.groupby('userId').agg([list])[["movieId", "rating"]]

In [195]:
se

Unnamed: 0_level_0,movieId,rating
Unnamed: 0_level_1,list,list
userId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,"[1, 2, 3, 4, 5, 7, 8, 9, 11, 13, 15, 16, 18, 1...","[5, 3, 4, 3, 3, 4, 1, 5, 2, 5, 5, 5, 4, 5, 1, ..."
2,"[1, 10, 14, 25, 100, 111, 127, 237, 242, 255, ...","[4, 2, 4, 4, 5, 4, 5, 4, 5, 4, 3, 4, 5, 4, 3, ..."
3,"[181, 258, 260, 268, 271, 288, 302, 303, 317, ...","[4, 2, 4, 3, 3, 2, 2, 3, 2, 2, 5, 5, 3, 1, 2, ..."
4,"[11, 210, 258, 271, 300, 301, 324, 327, 328, 3...","[4, 3, 5, 4, 5, 5, 5, 5, 3, 5, 2, 5, 5, 5]"
5,"[21, 25, 29, 50, 63, 66, 70, 95, 99, 101, 105,...","[3, 3, 4, 4, 1, 1, 4, 4, 3, 5, 3, 4, 4, 1, 3, ..."
...,...,...
939,"[9, 15, 106, 118, 121, 127, 220, 222, 237, 252...","[5, 5, 3, 5, 5, 5, 5, 5, 5, 3, 3, 5, 5, 4, 2, ..."
940,"[4, 7, 8, 9, 12, 14, 47, 50, 56, 66, 69, 70, 8...","[2, 4, 5, 3, 4, 3, 3, 4, 5, 4, 2, 3, 4, 4, 5, ..."
941,"[1, 7, 15, 117, 124, 147, 181, 222, 257, 258, ...","[5, 4, 4, 5, 5, 4, 5, 2, 4, 4, 3, 4, 5, 4, 2, ..."
942,"[31, 50, 71, 79, 95, 97, 99, 117, 124, 131, 13...","[5, 5, 5, 5, 5, 5, 5, 4, 4, 5, 3, 5, 5, 3, 5, ..."


In [137]:
grouped = df.groupby('A')

In [2]:
list(range(1,2))

[1]

In [136]:
result = grouped.agg({
    'B': 'sum',
    'C': ['mean', 'max']
})
print(result)

    B    C    
  sum mean max
A             
1   5   10  10
2   6   11  11
3   7   12  12
4   8   13  13
5   9   14  14


In [3]:
ss = {1:1,2:2}

In [6]:
aa = [1,2,3,4]

In [98]:
import numpy as np
a = [[1,2,3],[2,3,4],[1,3,3]]
a_ndarray = np.array(a)
a_ndarray[[0,2]] = [[1,2,2],[1,3,4]]

In [99]:
a_ndarray

array([[1, 2, 2],
       [2, 3, 4],
       [1, 3, 4]])

In [69]:
b == False

array([False, False, False])

In [56]:
list(range(1,10))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [38]:
lena = len(a) if 1 in a else len(a)-1

In [43]:
import time  
  
# 创建一个包含1000000个元素的列表  
numbers = list(range(1, 1000001))  
  
# 使用sum()函数求和  
start_time = time.time()  
total_sum_sum = sum(numbers)  
end_time = time.time()  
print(f"sum() time: {end_time - start_time}")  
  
# 使用for循环求和  
start_time = time.time()  
total_sum_loop = 0  
for number in numbers:  
    total_sum_loop += number  
end_time = time.time()  
print(f"for loop time: {end_time - start_time}")

sum() time: 0.03989362716674805
for loop time: 0.19547510147094727


In [86]:
import numpy as np  
  
# 假设我们有一个二维数组，其中每行是一个向量  
vectors = np.array([[1, 4], [3, 4], [5, 6]])  
  
# 计算每个向量的模长  
magnitudes = np.linalg.norm(vectors, axis=1)  
  
# 显示结果  
print(magnitudes**2)  # 输出: [2.23606798 5.         7.07106781]

[17. 25. 61.]


In [83]:
import numpy
import numpy as np  
  
# 创建一个NumPy数组  
arr = np.array([3.16227766, 2.2, 7.81024968])  
  
# 使用 ** 操作符计算每个元素的平方  
squared_arr = arr ** 2  
  
# 或者使用 numpy.square() 函数  
squared_arr_using_function = np.square(arr)  
  
# 显示结果  
print(squared_arr)          # 输出: [ 1.   4.84  9.  ]  
print(squared_arr_using_function)  # 输出与上面相同

[10.          4.84       61.00000006]
[10.          4.84       61.00000006]


In [89]:
ssss = {1,2}
ssss.remove(5)

KeyError: 5

In [90]:
# 原始集合  
original_set = {1, 2, 3, 4, 5}  
  
# 要删除的元素  
element_to_remove = 3  
  
# 创建一个原始集合的副本  
modified_set = original_set.copy()  
  
# 从副本中删除元素  
modified_set.remove(element_to_remove)  
  
# 现在，original_set 保持不变，而 modified_set 是删除了指定元素后的集合  
print("Original Set:", original_set)  # 输出: Original Set: {1, 2, 3, 4, 5}  
print("Modified Set:", modified_set)  # 输出: Modified Set: {1, 2, 4, 5}  
  
# 如果你不确定元素是否存在，可以使用 discard() 方法  
# 它不会抛出异常，如果元素不存在则什么也不做  
element_might_not_exist = 6  
modified_set.discard(element_might_not_exist)  
  
# 再次打印 modified_set，它不会因为尝试删除不存在的元素而改变  
print("Modified Set after discard:", modified_set)  # 输出仍然是: {1, 2, 4, 5}

Original Set: {1, 2, 3, 4, 5}
Modified Set: {1, 2, 4, 5}
Modified Set after discard: {1, 2, 4, 5}
