# 问题转化为向量空间的数学运算

In [12]:
import tensorflow_hub as hub
import numpy as np

# load a pre-trained embedding
# Token based text embedding trained on Chinese Google News 100B corpus.
# https://tfhub.dev/google/nnlm-zh-dim50/2
embed = hub.load("https://tfhub.dev/google/nnlm-zh-dim50/2")

In [13]:
embed(["学生"]) # turn a string into a tensor 

<tf.Tensor: shape=(1, 50), dtype=float32, numpy=
array([[-4.61892923e-03,  3.14871520e-02, -1.37666896e-01,
        -1.80223718e-01,  3.18889290e-01,  1.59864619e-01,
        -1.30209308e-02, -6.74860328e-02,  2.50707179e-01,
         8.81165266e-02,  1.73361838e-01,  3.79935445e-05,
        -1.86411262e-01, -3.23447175e-02,  1.82978258e-01,
        -2.81624436e-01, -2.01185316e-01,  2.37850443e-01,
         4.68596071e-02,  1.18288822e-01, -1.80558890e-01,
         6.45632371e-02, -8.79279105e-04,  1.21993199e-01,
        -1.00037001e-01,  1.39083508e-02, -1.18444867e-01,
         7.55840838e-02, -6.39892668e-02, -9.03432146e-02,
        -5.55194207e-02,  1.13131203e-01, -4.65420913e-03,
        -1.44980639e-01,  2.39714831e-01,  9.70328003e-02,
         7.29109645e-02, -1.52734043e-02, -2.22533606e-02,
        -2.61341315e-02, -2.73455799e-01, -1.15428448e-01,
         9.27815959e-02, -5.35708964e-02, -2.38581806e-01,
        -1.06254369e-01, -1.78294197e-01, -1.30673364e-01,
       

In [14]:
def cos_sim(vector_a, vector_b):
    """
    计算两个向量之间的余弦相似度
    :param vector_a: 向量 a 
    :param vector_b: 向量 b
    :return: sim
    """
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    
    num = float(vector_a * vector_b.T)
    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    
    cos = num / denom
    sim = 0.5 + 0.5 * cos
    
    return sim

In [15]:
def embeddings_cos_sim(ab):
    embeddings = embed(ab)
    B=embeddings.numpy()[1]
    A=embeddings.numpy()[0]
    print(ab, cos_sim(A, B))

In [11]:
embeddings_cos_sim(["猫","狗"])

['猫', '狗'] 0.9254584223578479


In [16]:
embeddings_cos_sim(["建筑设计","空间设计"])

['建筑设计', '空间设计'] 0.702155676317734


# 特征工程、机器学习

In [25]:
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn import tree

# Iris数据集是常用的分类实验数据集，
# 由Fisher, 1936收集整理。Iris也称鸢尾花卉数据集，
# 是一类多重变量分析的数据集。数据集包含150个数据集，
# 分为3类，每类50个数据，每个数据包含4个属性。
# 可通过花萼长度，花萼宽度，花瓣长度，花瓣宽度4个属性预测鸢尾花卉属于（Setosa，Versicolour，Virginica）三个种类中的哪一类。

#载入数据集
iris = datasets.load_iris()
#print(iris)
iris_data=iris['data']
#print(iris_data[0])

iris_label=iris['target']
#print(iris_label[0])
iris_target_name=iris['target_names']
print(iris_target_name)

X=np.array(iris_data)
Y=np.array(iris_label)

# print(X[0],iris_target_name[0])

# # #训练,决策树
model=tree.DecisionTreeClassifier(max_depth=3)
# # 开始训练
model.fit(X,Y)
 
# #这里预测当前输入的值的所属分类
# print('预测类别是',iris_target_name[clf.predict([[5,3,1,0.1]])[0]])

['setosa' 'versicolor' 'virginica']


DecisionTreeClassifier(max_depth=3)

In [26]:
k=[1,2,3,4]
k[0]

1

In [27]:
model.predict([ [1,3,0.5,6] ] )[0]
print('预测类别是',iris_target_name[0])

预测类别是 setosa


# 深度学习-表示学习，万物皆向量

In [None]:
#欧氏距离和余弦相似度
def dist_sim(vector_a, vector_b):
    vector_a = np.mat(vector_a)
    vector_b = np.mat(vector_b)
    dist = np.linalg.norm(vector_a - vector_b)  
    sim = 1.0 / (1.0 + dist) #归一化  
    return sim

In [None]:
dist_sim(A,B)

# 机器学习 分类
## 找出群体中的KOL
### 对比欧式距离与余弦相似度

In [None]:
#加载数据
import pandas as pd
df = pd.read_csv("data/students.csv")

In [None]:
#查看下数据的前几条
df.head()

In [None]:
#取需要的列
student=df.loc[1,['Name','Email','School','Major','grade','Interest','Interest level','Code']].values.tolist()

In [None]:
#加载 预训练模型
import tensorflow_hub as hub
embed = hub.load("model/nnlm-zh-dim50")

In [None]:
# 测试下
embeddings = embed(["".join(student)])[0]
embeddings.numpy()

In [None]:
#把特征转成 稠密向量
students=[]
for i in range(len(df)):
    #print(i)
    student=df.loc[i,['Email','School','Major','grade','Interest','Interest level','Code']].values.tolist()
    students.append(embed(["".join(student)])[0].numpy())
students=np.array(students)

In [None]:
#每位同学的稠密向量
students

In [None]:
#使用scikit learn的余弦相似度计算方法
from sklearn.metrics.pairwise import cosine_similarity
sim=cosine_similarity(students)
#查看下第一位与第二位同学的相似度
sim[0][1]

In [None]:
#为每位同学计算与他最相似的一位同学，只取相似度大于0.6的第一位同学
count_students={}
for i in range(len(students)):
    others=[]
    for j in range(len(students)):
        if i!=j:
            others.append({
                "index":j,
                "score":sim[i][j]
            })
    others=sorted(others, key=lambda x:x["score"],reverse=True)
    if others[0]['score']>0.6:
        print(df.loc[i,'Name'],df.loc[others[0]['index'],'Name'],others[0]['score']) 
        if not df.loc[others[0]['index'],'Name'] in count_students:
            count_students[df.loc[others[0]['index'],'Name']]=0
        count_students[df.loc[others[0]['index'],'Name']]+=others[0]['score']

In [None]:
# 不太可能是kol的同学（与其他同学相似性较低)
for i in range(len(students)):
    if not df.loc[i,'Name'] in count_students:
        print(df.loc[i,'Name'])

In [None]:
# 最有可能是kol的同学
sorted(count_students.items(), key=lambda x:x[1],reverse=True)

# 聚类算法
### 聚类班级学生

In [None]:
#引入库，并测试
from sklearn.cluster import KMeans,DBSCAN,Birch
import numpy as np

X = np.array([[1,2, 2], [1,2, 4], [1,2, 0],[4, 2,2], [4,2, 4], [4,1, 0]])

kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

kmeans.labels_ #输出原始数据的聚类后的标签值


In [None]:
#设置数量
model = KMeans(n_clusters =2)

In [None]:
#训练
model.fit(students)

In [None]:
for i in range(len(model.labels_)):
    if model.labels_[i]==0:
        print(df.loc[i,'Name'])
    

In [None]:
#换一种
model=DBSCAN(eps=0.11, min_samples=2).fit(students)
print(model.labels_)

In [None]:
#换一种
model = Birch(n_clusters=2)
model.fit(students)
print(model.labels_)

In [None]:
# 可视化查看模型学习到的分类
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

tsne = TSNE(n_components=2)
decomposition_data = tsne.fit_transform(students)

x = []
y = []
 
for i in decomposition_data:
    x.append(i[0])
    y.append(i[1])
    
plt.figure(figsize=(10, 10)) 
ax = plt.axes() 
plt.scatter(x, y, c=model.labels_, marker="x") 
plt.xticks(()) 
plt.yticks(()) 
plt.show()

# 深度学习 hello world
### 手写数字分类

In [None]:
%load_ext tensorboard

#引入相关库
import cv2
from matplotlib import pyplot as plt
import tensorflow as tf
import datetime

In [None]:
# 下载mnist数据集
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

In [None]:
# 数据集维度
print(x_test.shape,y_test.shape)

In [None]:
#查看下数据集
plt.imshow(x_test[1882])
plt.show()

In [None]:
#模型定义
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)),
  tf.keras.layers.Dense(10, activation='relu'),
  #tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(10, activation='relu'),
  tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
#训练开始
model.fit(x_train, y_train, epochs=5)
#评估模型
model.evaluate(x_test,  y_test, verbose=2)

# 颜色分类 v1.0

### 使用 Pandas 处理数据

In [31]:
#使用 Pandas 创建一个 dataframe
import pandas as pd
dataframe=pd.read_json("data/colorData.json", orient="records")

In [32]:
#预览下前面几条数据
dataframe.head()

Unnamed: 0,b,g,label,r,uid
0,155,183,green-ish,81,EjbbUhVExBSZxtpKfcQ5qzT7jDW2
1,71,22,pink-ish,249,fpqsSD6CvNNFQmRp9sJDdI1QJm32
2,33,196,orange-ish,254,fpqsSD6CvNNFQmRp9sJDdI1QJm32
3,237,147,blue-ish,170,fpqsSD6CvNNFQmRp9sJDdI1QJm32
4,225,159,blue-ish,15,fpqsSD6CvNNFQmRp9sJDdI1QJm32


In [33]:
#查看下数据类型
dataframe.dtypes

b         int64
g         int64
label    object
r         int64
uid      object
dtype: object

In [34]:
#label需要转成int
dataframe['label'] = pd.Categorical(dataframe['label'])
dataframe['label'] = dataframe.label.cat.codes

In [35]:
# 获取标签名称
#code--label
def get_label_name(label=0):
    labels=pd.Categorical(['brown-ish','blue-ish', 'green-ish', 'grey-ish', 'orange-ish',
           'pink-ish', 'purple-ish', 'red-ish', 'yellow-ish'])
    index=labels.codes.tolist().index(label)
    return labels.categories.tolist()[index]

In [36]:
# 测试下
get_label_name(3)

'grey-ish'

In [37]:
#查看标签分布
dataframe.loc[:, 'label'].value_counts()

2    1599
0    1218
6     823
5     511
7     412
1     319
8     275
4     259
3     227
Name: label, dtype: int64

In [38]:
#准备训练数据
dataframe = dataframe[['r','g','b','label']]

### 制作数据集

In [39]:
#分割成训练集、验证集、测试集
from sklearn.model_selection import train_test_split

df=dataframe.copy()

train, test = train_test_split(df, test_size=0.1)
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

4570 train examples
508 validation examples
565 test examples


In [40]:
# 一种从 Pandas Dataframe 创建 tf.data 数据集的实用程序方法（utility method）
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    x = dataframe.copy()
    x=x.astype('float64')
    y = x.pop('label')
    ds = tf.data.Dataset.from_tensor_slices((x.values, y.values))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

In [44]:
import datetime
import tensorflow as tf
#参数

#Batch Size：一次训练所选取的样本数。
BATCH_SIZE=64
#失活率(Dropout Rate) 每层中丢弃的神经元占整层神经元的比率
DROPOUT_RATE=0.1045
#轮次，整个输入数据的单次前向和反向传递
EPOCHS=100

log_dir = "logs/fit/DROPOUT_RATE_" + str(DROPOUT_RATE)+"_"+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [45]:
train_ds=df_to_dataset(train,batch_size=BATCH_SIZE)
val_ds=df_to_dataset(val,batch_size=BATCH_SIZE)
test_ds=df_to_dataset(test,batch_size=BATCH_SIZE)

In [46]:
for f in train_ds.take(1):
    print(f)

(<tf.Tensor: shape=(64, 3), dtype=float64, numpy=
array([[104.,  18.,  73.],
       [219., 143.,  83.],
       [230.,  94.,   3.],
       [121.,  51.,  94.],
       [155., 172., 151.],
       [210.,  80., 233.],
       [113.,   7.,  39.],
       [ 93., 216., 122.],
       [220.,  36., 252.],
       [ 57., 169., 250.],
       [  7., 224.,  85.],
       [156., 106., 118.],
       [105., 222., 247.],
       [175.,  52.,  81.],
       [ 51., 105.,  47.],
       [ 55., 217.,  41.],
       [145.,  71.,  53.],
       [ 36., 190., 117.],
       [ 50., 185., 166.],
       [161., 189., 164.],
       [ 86., 148.,   3.],
       [ 66.,  25., 155.],
       [ 78., 213., 240.],
       [246.,  81., 175.],
       [157.,  51., 189.],
       [ 32., 147.,  82.],
       [191., 218., 127.],
       [ 53., 190., 112.],
       [ 21., 133., 135.],
       [212., 196.,  95.],
       [155., 195.,  54.],
       [ 15., 177., 212.],
       [  5., 186.,  44.],
       [107.,  25., 104.],
       [142., 249.,  63.],
     

### 模型

In [47]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(12, input_shape=(3,),activation='softplus'),
  tf.keras.layers.Dense(48, activation='relu'),
  tf.keras.layers.Dropout(DROPOUT_RATE),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(DROPOUT_RATE),
  tf.keras.layers.Dense(48, activation='relu'),
  tf.keras.layers.Dense(9, activation='softmax')
])

#optimizer=tf.keras.optimizers.SGD(learning_rate=0.25)

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 12)                48        
_________________________________________________________________
dense_1 (Dense)              (None, 48)                624       
_________________________________________________________________
dropout (Dropout)            (None, 48)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               6272      
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 48)                6192      
_________________________________________________________________
dense_4 (Dense)              (None, 9)                 4

In [48]:
model.fit(train_ds,
          validation_data=val_ds,
          epochs=EPOCHS,
             callbacks=[tensorboard_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x13ad2a850>

In [49]:
%tensorboard --logdir logs/fit

UsageError: Line magic function `%tensorboard` not found.


In [50]:
test_loss, test_acc = model.evaluate(test_ds, verbose=2)
print('\nTest accuracy:', test_acc)

9/9 - 0s - loss: 0.6121 - accuracy: 0.8142

Test accuracy: 0.8141592741012573


### 预测

In [51]:
probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])

In [52]:
import numpy as np
predictions = probability_model.predict(np.array([[110,2,25]]))

In [53]:
predictions[0]

array([0.09855899, 0.10118753, 0.10212535, 0.09636237, 0.09603488,
       0.09689124, 0.09711929, 0.21332446, 0.09839591], dtype=float32)

In [54]:
get_label_name(np.argmax(predictions[0]))

'red-ish'

# RGB转HSV
### rgb->hsv

In [None]:
import cv2
# opencv
import numpy as np
from matplotlib import pyplot as plt

In [None]:
#创建一张黑色的图片
img = np.zeros((28,32), np.float32)
print(img.shape)
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
print(img.shape)
plt.imshow(img)
plt.show()

In [None]:
rgb_to_hsv([110,2,25])

In [None]:
'''
HSV颜色空间规定:H范围0~360,S范围0~1,V范围0~1 

PS中的HSV范围，H是0-360，S是0-1，V（B）是0-1

opencv中的HSV范围，H是0-180，S是0-255，V是0-255

'''

# h:0-360 , s:0-255, v:0-255
# r:0-255, g:0-255, b:0-255
def rgb_to_hsv(rgb=[]):
    img=np.array([[rgb]],np.uint8)
    #print(img)
    #print(img)
    plt.imshow(img)
    plt.show()
    img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    img_hsv[0][0]*=2
    #print(img_hsv)
    return img_hsv


def hsv_to_rgb(hsv=[]):
    hsv[0]/=2
    img_hsv=np.array([[hsv]],np.uint8)
    img_rgb = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB)
    plt.imshow(img_rgb)
    plt.show()
    return img_rgb

#rgb_to_hsv([250,0,0])
#hsv_to_rgb([360,255,255])

# 从h=0 开始旋转，每18°取一组颜色，作为配色方案
for i in range(0,360,18):
    #print(i)
    a=hsv_to_rgb([i,255,255])
    b=hsv_to_rgb([i+18,255,255])
    print(a,b)

In [None]:
#opencv读取图片，默认是BGR
img=cv2.imread('img/test.jpg',cv2.IMREAD_COLOR)
img=cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()