import 需要的包

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
import random
from collections import Counter as count
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, auc, roc_curve, roc_auc_score


# EDA 探索性数据分析

In [None]:
# Load the train/test CSVs. os.path.join builds the path with the separator of
# the running OS; a literal backslash path only resolves on Windows.
train = pd.read_csv(os.path.join('data', 'ads_train.csv'))
test = pd.read_csv(os.path.join('data', 'ads_test.csv'))
# Turn off pandas' chained-assignment (SettingWithCopy) warning globally.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Add a constant y_buy column of zeros to the test frame (placeholder values,
# not real labels) -- presumably so test mirrors train's columns; confirm.
test['y_buy']=0
# Inspect the training frame: column names, dtypes/non-null counts, summary stats.
# In a notebook only the last expression (describe) is rendered; info() prints.
train.columns
train.info()
train.describe()

# EDA完成，数据ETL
类别字段:
isbuyer,multiple_buy,multiple_visit,y_buy

id字段:
uniq_urls

数值字段:
buy_freq、visit_freq、buy_interval、sv_interval、expected_time_buy、expected_time_visit、last_buy、last_visit、num_checkins

* y_buy 的平均值约为 0.004，说明数据集中类别失衡相当严重，只有很少一部分样本是正样本。


In [None]:
# Drop the ID column and the CSV's unnamed index column, fill missing values
# with 0, then one-hot encode the categorical fields.
train, test = train.drop('uniq_urls', axis=1), test.drop('uniq_urls', axis=1)
train, test = train.drop('Unnamed: 0', axis=1), test.drop('Unnamed: 0', axis=1)
train, test = train.fillna(0), test.fillna(0)

categorical_features = ['isbuyer', 'multiple_buy', 'multiple_visit']
# NOTE(review): without `columns=`, get_dummies only expands object/string/
# category columns; if these fields are already numeric 0/1 flags they pass
# through unchanged -- confirm against the CSV dtypes.
train_cat = pd.get_dummies(train[categorical_features])
test_cat = pd.get_dummies(test[categorical_features])
# The two frames are encoded independently, so a level missing from one split
# would give them different columns. Force test to train's layout: levels
# unseen in train are dropped, levels absent from test become all-zero columns.
test_cat = test_cat.reindex(columns=train_cat.columns, fill_value=0)
train, test = train.drop(categorical_features, axis=1), test.drop(categorical_features, axis=1)
train, test = pd.concat([train, train_cat], axis=1), pd.concat([test, test_cat], axis=1)

# Separate the features from the label column.
train_X, test_X = train.drop('y_buy', axis=1), test.drop('y_buy', axis=1)
train_Y, test_Y = train['y_buy'], test['y_buy']

train_X.head()
test_X.head()



In [None]:
# Balance the classes by random over-sampling (seeded for repeatability).
oversampler = RandomOverSampler(random_state=0)
X_resample, Y_resample = oversampler.fit_resample(train_X, train_Y)

# Summary statistics of the resampled label column.
Y_resample.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first and over-sample only the training part.
# Over-sampling before the split puts copies of the same minority rows on both
# sides, so validation scores are measured on rows the model has already seen.
# stratify keeps the rare positive class present in both parts; random_state
# makes the split repeatable (matching the seeded RandomOverSampler).
X_train, X_val, Y_train, Y_val = train_test_split(
    train_X, train_Y, test_size=0.2, stratify=train_Y, random_state=0)
X_train, Y_train = RandomOverSampler(random_state=0).fit_resample(X_train, Y_train)

In [None]:
# Median imputation
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(strategy='median')
# Learn the medians from the training split only, then APPLY them to every
# split. Calling fit() alone changes no data, and re-fitting on val/test would
# overwrite the training statistics.
median_imputer.fit(X_train)
X_train = pd.DataFrame(median_imputer.transform(X_train), columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(median_imputer.transform(X_val), columns=X_val.columns, index=X_val.index)
test_X = pd.DataFrame(median_imputer.transform(test_X), columns=test_X.columns, index=test_X.index)

# Standardize the numeric features (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scale_features = ['buy_freq','visit_freq','buy_interval','sv_interval','expected_time_buy','expected_time_visit','last_buy','last_visit','num_checkins']
# Fit the scaler on the training split and reuse the same mean/std for val and
# test, so all splits share one scale and nothing leaks from held-out data.
X_train[scale_features] = scaler.fit_transform(X_train[scale_features])
X_val[scale_features] = scaler.transform(X_val[scale_features])
test_X[scale_features] = scaler.transform(test_X[scale_features])


In [None]:
# Preview the first rows of the training features.
X_train.head()



# ETL完成，实验可行性

尝试在进行整理过的数据上，进行逻辑回归

用库函数LR，看下效果

In [None]:
def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    """Draw a ROC curve plus the diagonal reference line with pyplot.

    Args:
        false_positive_rate: x-values of the curve.
        true_positive_rate: y-values of the curve.
        label: optional label attached to the curve (no legend is drawn here).
    """
    diagonal = [0, 1]
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    # Red diagonal from (0, 0) to (1, 1) as the reference line.
    plt.plot(diagonal, diagonal, 'r', linewidth=4)
    plt.axis([0, 1, 0, 1])
    axis_titles = ((plt.xlabel, 'False Positive'), (plt.ylabel, 'True Positive'))
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)

# Baseline: scikit-learn logistic regression trained on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_val)
# Column 1 of predict_proba is the probability of the positive class.
Y_scores = logreg.predict_proba(X_val)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_val, Y_scores)
plt.figure(figsize=(14, 7))
plot_roc_curve(false_positive_rate, true_positive_rate)
# Report accuracy (in percent) on the held-out validation split, using the
# predictions computed above, rather than on the data the model was fitted to.
acc_log = round(accuracy_score(Y_val, Y_pred) * 100, 2)
acc_log

In [None]:
# Re-orient the training data for the hand-written implementation:
# features along rows, samples along columns; labels as a single row.
train_x = X_train.T
samples = train_x.shape[1]
train_y = Y_train.values.reshape((1, samples))
for reshaped in (train_x, train_y):
    print(reshaped.shape)

In [None]:
# Hyperparameters for the hand-written gradient-descent loop
learning_rate = 0.01  # step size multiplied into each gradient update
num_iter = 5000  # number of gradient-descent iterations to run

# 手动计算普通LR模式

In [None]:
# Start from an all-zero weight column (one row per feature) and a zero bias.
n_features = train_x.shape[0]
w = np.zeros((n_features, 1))
b = 0
print(train_y.shape, train_x.shape, w.shape)

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Evaluated as exp(-log(1 + exp(-z))) through np.logaddexp, so very negative
    inputs do not overflow np.exp and raise a RuntimeWarning.
    """
    return np.exp(-np.logaddexp(0, -z))


def FpBp(w, b, X, Y):
    """Run one forward and backward pass of logistic regression.

    Args:
        w: weights, shape (n_features, 1).
        b: bias term. NOTE(review): b is accepted but is not added to the
            logits in the forward pass; Predict and Client.RunOptimization
            omit it as well, so adding it must be done in all three together.
        X: data set, shape (n_features, m), where m is the number of samples.
        Y: true labels, shape (1, m).

    Returns:
        Tuple (dw, db, cross_entropy_loss): gradient w.r.t. w with shape
        (n_features, 1), the mean residual, and the mean cross-entropy loss.
    """
    # Convert once so both dot products below work on a plain float array.
    X = np.asarray(X, dtype=float)
    # Number of samples m.
    m = X.shape[1]

    # Forward pass.
    assert w.T.shape[1] == X.shape[0]
    yhat = sigmoid(np.dot(w.T, X))
    assert yhat.shape == Y.shape

    # Clip only for the loss: a saturated prediction (exactly 0 or 1) would
    # otherwise produce log(0) and turn the loss into inf/nan.
    eps = np.finfo(float).eps
    clipped = np.clip(yhat, eps, 1 - eps)
    cross_entropy_loss = -(np.sum(Y * np.log(clipped) + (1 - Y) * np.log(1 - clipped))) / m

    # Backward pass (gradients use the unclipped predictions).
    dZ = yhat - Y
    dw = np.dot(X, dZ.T) / m
    db = np.sum(dZ) / m

    return dw, db, cross_entropy_loss

    
# Plain batch gradient descent: one FpBp pass per iteration, recording the loss.
loss = list()

for i in range(num_iter):
    dw, db, cost = FpBp(w, b, train_x, train_y)
    w, b = w - learning_rate * dw, b - learning_rate * db
    loss.append(cost)

# Loss curve over the iterations.
plt.plot(loss)



In [None]:
def Predict(w,b,X):
    """Return sigmoid(w.T . X) for the given weights and data.

    b is part of the signature but is not used in the computation.
    """
    logits = np.dot(w.T, X)
    return sigmoid(logits)

# Score the hand-trained weights on the validation split.
pred_probs = Predict(w, b, X_val.T)
# Threshold the probabilities at 0.5 to get 0/1 labels.
pred_labels = np.where(pred_probs > 0.5, 1, 0)
diff = Y_val.values - pred_labels[0]
# Accuracy = 1 - fraction of mismatched labels (|diff| is 1 on a mismatch).
acc = 1 - np.abs(diff).sum() / len(diff)
acc


验证集上的结果与库函数的结果基本一致，实验视为成功

# FL cross-device模式

假设cross-device中心化模式，6万条数据分散于10个client（非IID分布）

FL按轮进行，每轮随机选中5个client，训练完成后global模型更新，global参数作为下一轮的起始参数进行下发。



![image.png](attachment:9f45df5a-9671-40e4-9526-4d33f4295c5b.png)

![image.png](attachment:1e21710b-4d98-4c14-9205-31bfa560582e.png)

In [None]:
# Show (rows, columns) of the training feature frame.
train_X.shape

## 非IID切分数据

针对x的任意一个维度进行排序，然后顺序从排序后的记录中依次取值。这样得出的数据分布，和整体的数据分布就是非iid的关系。

Client 类模拟各个客户端持有的隐私数据。 

In [None]:
class Client:
    """Simulated federated-learning client that holds a private data shard."""

    def __init__(self,id,train_x,train_y):
        """Store the shard transposed, so samples run along axis 1.

        Args:
            id: client identifier (the name shadows the builtin; kept so
                existing callers keep working).
            train_x: sequence of feature rows, one per sample.
            train_y: sequence of label rows, one per sample.
        """
        self.client_id = id
        self.train_X = np.array(train_x).T
        self.train_Y = np.array(train_y).T
        self.sampleCount = self.train_X.shape[1]
        # Loss of every local iteration ever run, appended across rounds.
        self.losses = []
        print("Client initialized with {} rows of data".format(self.sampleCount))

    def RunOptimization(self,w,b,iteration,learning_rate,showDetails=False):
        """Run local gradient-descent steps starting from the given parameters.

        Args:
            w: starting weights, shape (n_features, 1). Not modified.
            b: starting bias. NOTE(review): the bias is updated locally but is
                not added to the logits in the forward pass (FpBp and Predict
                share this omission).
            iteration: number of local gradient steps.
            learning_rate: step size for the local updates.
            showDetails: print per-iteration progress and the final sums.

        Returns:
            Tuple (localDw, localDb): the gradients summed over all local
            iterations, so learning_rate * localDw is the total local
            displacement of the weights.
        """
        # Work on a private copy: `local_w = w` would alias the caller's
        # array, and the in-place update below would then silently modify the
        # caller's (global) weights during local training.
        local_w = np.array(w, dtype=float)
        local_b = b
        localDw = np.zeros(shape=local_w.shape)
        localDb = 0
        m = self.train_X.shape[1]
        eps = np.finfo(float).eps

        for i in range(iteration):
            # Forward pass with the current local weights.
            assert(local_w.T.shape[1]==self.train_X.shape[0])
            yhat = sigmoid(np.dot(local_w.T,self.train_X))

            assert(yhat.shape == self.train_Y.shape)
            # Clip only for the loss so saturated predictions do not hit log(0).
            clipped = np.clip(yhat, eps, 1 - eps)
            cross_entropy_loss = -(np.sum(self.train_Y*np.log(clipped)+(1-self.train_Y)*np.log(1-clipped)))/m

            # Backward pass.
            dZ = yhat-self.train_Y
            dw = (np.dot(self.train_X,dZ.T))/m
            db = (np.sum(dZ))/m
            local_w -= learning_rate*dw
            local_b -= learning_rate*db
            localDw += dw
            localDb += db
            self.losses.append(cross_entropy_loss)
            if showDetails:
                print("Client{},iter{} finished".format(self.client_id,i))
        if showDetails:
            print(localDw,localDb)

        return localDw,localDb

# Put features and labels back into sample-major layout and pair them up.
iid_y = pd.DataFrame(data=train_y.T, columns=["y_buy"])
iid_x = train_x.T
feature_rows, label_rows = iid_x.values, iid_y.values
iid_union = list(zip(feature_rows, label_rows))

# Order the samples by the feature at position 9, so that consecutive slices
# of the sorted list see different value ranges of that feature.
iid_union.sort(key=lambda pair: pair[0][9])
noniid_x = [pair[0] for pair in iid_union]
noniid_y = [pair[1] for pair in iid_union]

np.array(noniid_x).T.shape

In [None]:
# Slice boundaries of the sorted samples: client k owns rows
# [shard_bounds[k-1], shard_bounds[k]); the trailing None leaves the last
# slice open-ended.
shard_bounds = [0, 500, 6500, 13000, 20000, 26000, 35000, 42000, 48000, 54000, None]
clients = [
    Client(client_no, noniid_x[start:stop], noniid_y[start:stop])
    for client_no, (start, stop) in enumerate(zip(shard_bounds, shard_bounds[1:]), start=1)
]

创建控制类，持有所有客户端的列表。

和客户端之间按轮进行通信。每轮的工作如下：

- 随机从所有客户端中选择5个，下发当前global的w和b
- 命令客户端进行若干轮FP和BP（FedAvg）
- 收集各个客户端更新的w和b梯度矩阵，聚合并更新global w和b
- 聚合和更新完成后，开启下一轮

In [None]:
class Coordinator:
    """Central server that owns the global parameters and runs training rounds."""

    def __init__(self,clients,shape,lr=0.01):
        """
        Args:
            clients: list of client objects exposing RunOptimization.
            shape: shape tuple whose first entry is the number of features.
            lr: learning rate passed to the clients and used for the global
                update (defaults to the previously hard-coded 0.01).
        """
        self.clients = clients
        self.global_w = np.zeros(shape=(shape[0],1))
        self.global_b = 0
        self.lr = lr

    def ProcessRound(self,clientsNeeded,localIter,showDetails=False):
        """Run one round: sample clients, train locally, average, update.

        Args:
            clientsNeeded: number of clients sampled without replacement.
            localIter: local gradient steps each sampled client runs.
            showDetails: forwarded to the clients for verbose output.

        Raises:
            ValueError: if clientsNeeded is below 1, or (from random.sample)
                larger than the number of clients.
        """
        if clientsNeeded < 1:
            raise ValueError("clientsNeeded must be at least 1")
        selected_clients = random.sample(self.clients,clientsNeeded)
        # Hand every client its own copy of the global weights: a client that
        # updates the array in place must not change the global model (or the
        # starting point of the next client) before aggregation.
        grads = [
            client.RunOptimization(self.global_w.copy(),self.global_b,localIter,self.lr,showDetails)
            for client in selected_clients
        ]
        aggregated_dw = np.zeros(shape=self.global_w.shape)
        aggregated_db = 0

        # Unweighted mean of the clients' accumulated gradients.
        for client_dw, client_db in grads:
            aggregated_dw += client_dw
            aggregated_db += client_db
        aggregated_dw /= clientsNeeded
        aggregated_db /= clientsNeeded
        self.global_w = self.global_w - aggregated_dw*self.lr
        self.global_b = self.global_b - aggregated_db*self.lr

    def FedAvg(self,rounds,clientsNeeded=5,localIter=5,showDetails=False):
        """Run `rounds` rounds; defaults match the former fixed 5 clients x 5 steps."""
        for _ in range(rounds):
            self.ProcessRound(clientsNeeded,localIter,showDetails)
        
# Size the global weight vector from the clients' own data (rows of train_X
# are features) instead of a hard-coded feature count, so the coordinator
# still matches if the number of feature columns changes.
sample_data = np.zeros(shape=(clients[0].train_X.shape[0],1))
coordinator = Coordinator(clients,sample_data.shape)
# One verbose round with a single client and a single local step, then the
# full run of 200 rounds.
coordinator.ProcessRound(1,1,True)
coordinator.FedAvg(200)

In [None]:
# Score the federated global parameters on the validation split.
pred_probs = Predict(coordinator.global_w, coordinator.global_b, X_val.T)
# Threshold the probabilities at 0.5 to get 0/1 labels.
pred_labels = np.where(pred_probs > 0.5, 1, 0)
diff = Y_val.values - pred_labels[0]
# Accuracy = 1 - fraction of mismatched labels (|diff| is 1 on a mismatch).
acc = 1 - np.abs(diff).sum() / len(diff)
acc

In [None]:
# Display the global weight vector held by the coordinator.
coordinator.global_w

In [None]:
# Plot the local loss history of the second client in the list (client_id 2).
# Losses are only appended in rounds where the client was sampled.
plt.plot(coordinator.clients[1].losses)

In [None]:
# Plot the local loss history of the fifth client in the list (client_id 5).
# Losses are only appended in rounds where the client was sampled.
plt.plot(coordinator.clients[4].losses)