auto_ptr<int> ptr1(new int[100]);   // declare an auto_ptr to int
*ptr1 = 123;                        // change the value
cout << *ptr1 << endl;
auto_ptr<int> ptr2(ptr1);           // initialize ptr2 from ptr1; ptr1 transfers its ownership
if (ptr1.get() == NULL)
    cout << "ptr1 has transfered ownership to ptr2" << endl;
cout << *ptr2 << endl;
return 0;
------------------
output:
123
ptr1 has transfered ownership to ptr2
123
+
When the line that constructs ptr2 runs, ptr1 transfers its ownership to ptr2, so once that line finishes, ptr1 is null.
+
The same thing happens with assignment:
+
auto_ptr<int> ptr1(new int[100]);
auto_ptr<int> ptr2;
ptr2 = ptr1;   // transfers ownership from ptr1 to ptr2
while True:
    data = self.sock.recv(4096)
    if data.find('PING') != -1:
        # respond to the server to avoid being kicked
        self.sock.send('PONG ' + data.split()[1] + '\r\n')
    elif data.find('PRIVMSG') != -1:
        for module in self.modules:
            response = module.run(data)
            self.sock.send("PRIVMSG " + channel + " :" + response + "\r\n")
    .....
public void addOFSwitchListener(IOFSwitchListener listener);

Adds a switch listener so that this module can receive switch-related events. IOFSwitchListener is an interface; you must implement it and override the following functions:

public void addedSwitch(IOFSwitch sw);
public void removedSwitch(IOFSwitch sw);
public void switchPortChanged(Long switchId);
public String getName();
print_help() {
    echo "-o set the output file name. (default: out.png)"
    echo "-t set the graph type. (one of 'filledcurve', 'lines'. default: 'filledcurve')"
    echo "-c set the graph color. (in hexadecimal form, default: #1E90FF)"
    echo "-n set the number of points to use. (must be set, should be in range [60-600])"
    echo "Reads the LOGFILE environment variable. If it is not set, /tmp/sysmonitor is used."
}

# Parse the arguments
while getopts "ho:t:c:n:" opt
do
    case "$opt" in
        h) print_help; exit 1 ;;
        o) outName=$OPTARG ;;
        t) graphType=$OPTARG ;;
        c) graphColor=$OPTARG ;;
        n) pointNumber=$OPTARG ;;
        *) exit 1 ;;
    esac
done

# check graph type, which must be filledcurve or lines.
if [ "$graphType" ] ; then
    if [ "$graphType" != "filledcurve" ] && [ "$graphType" != "lines" ] ; then
        echo "type should be one of 'filledcurve' and 'lines'."
    fi
    if [ "$graphType" == "filledcurve" ] ; then
        graphType="filledcurve y1=0"
    fi
fi

# check graph color, which must match #[0-9a-f]{6}
if [ "$graphColor" ] ; then
    tmp=`echo $graphColor | grep '^#[0-9a-f]\{6\}'`
    if [ -z "$tmp" ] ; then
        echo "color format error."
        exit
    fi
fi

# check that the point number is in range 60 ~ 600
if [ -z $pointNumber ] || [ "$pointNumber" -lt 60 ] || [ "$pointNumber" -gt 600 ] ; then
    print_usage
    exit
fi
# check the input file's location
inputFile=`printenv LOGFILE`

# generate reversed data
tempInput="input2"
tail -r -n $pointNumber ${inputFile:="/tmp/sysmonitor"} | awk '{ print -NR" "$1}' > $tempInput

# generate a temp plt file
tempFile="temp.plt"
touch $tempFile
ovs-vsctl set bridge s1 protocols=OpenFlow13
ovs-vsctl set bridge s2 protocols=OpenFlow13
ovs-vsctl set bridge s3 protocols=OpenFlow13
ovs-vsctl set bridge s4 protocols=OpenFlow13
ovs-vsctl set bridge s5 protocols=OpenFlow13
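If you want to double-check that a bridge accepted the setting, you can read the column back with ovs-vsctl (a quick sanity check, not part of the original steps):

ovs-vsctl get bridge s1 protocols
# typically prints: [OpenFlow13]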
r_left[i] = r_left[i-1] + 1   if input[i-1] == 'red'  or input[i-1] == 'white'
          = 0                 if input[i-1] == 'blue'
b_left[i] = b_left[i-1] + 1   if input[i-1] == 'blue' or input[i-1] == 'white'
          = 0                 if input[i-1] == 'red'
+
The right-hand part computes the maximum run extending to the right, including position i.
r_right[i] = r_right[i+1] + 1   if input[i] == 'red'  or input[i] == 'white'
           = 0                  if input[i] == 'blue'
b_right[i] = b_right[i+1] + 1   if input[i] == 'blue' or input[i] == 'white'
           = 0                  if input[i] == 'red'
public ImmutablePort getPort(String name) {
    if (name == null) {
        throw new NullPointerException("Port name must not be null");
    }
    lock.readLock().lock();
    try {
        return portsByName.get(name.toLowerCase());
    } finally {
        lock.readLock().unlock();
    }
}
Since the PortManager is a private member of IOFSwitch, you cannot use it directly. You must use the API provided by IOFSwitch to interact with the PortManager.
public IDebugCounter cntIncoming;

cntIncoming = debugCounters.registerCounter(PACKAGE, "incoming",
                  "All incoming packets seen by this module",
                  CounterType.ALWAYS_COUNT);
We give the s1 interface three IPs, each acting as the gateway IP of one network subnet. Next, to let the switch handle all ARP requests for the gateways by itself, we add the following flow entries to s1.
+
+
Run the following commands in the mininet environment:
+
sh ovs-ofctl add-flow s1 "table=0,priority=65535,arp,arp_tpa=10.0.0.254 actions=LOCAL"
sh ovs-ofctl add-flow s1 "table=0,priority=65535,arp,arp_tpa=20.0.0.254 actions=LOCAL"
sh ovs-ofctl add-flow s1 "table=0,priority=65535,arp,arp_tpa=30.0.0.254 actions=LOCAL"

The three flow entries above steer every ARP request for a gateway into the local OS, so those packets are handled by s1:0, s1:1 and s1:2, which answer with an ARP reply. These ARP replies then re-enter the OVS, and to handle them we send each one back to the corresponding host based on its destination IP address.

sh ovs-ofctl add-flow s1 "table=0,priority=1,arp,nw_dst=10.0.0.1,actions=output:1"
sh ovs-ofctl add-flow s1 "table=0,priority=1,arp,nw_dst=20.0.0.1,actions=output:2"
sh ovs-ofctl add-flow s1 "table=0,priority=1,arp,nw_dst=30.0.0.1,actions=output:3"
from mininet.net import Mininet
from mininet.node import Controller, RemoteController, OVSController
from mininet.node import CPULimitedHost, Host, Node
from mininet.node import OVSKernelSwitch, UserSwitch
from mininet.node import IVSSwitch
from mininet.cli import CLI
from mininet.log import setLogLevel, info
from mininet.link import TCLink, Intf

def myNetwork():

    net = Mininet( topo=None,
                   build=False,
                   ipBase='10.0.0.0/8')

if __name__ == '__main__':
    setLogLevel( 'info' )
    myNetwork()
+
### Flow entries for testing
+
# These flows handle the ARP requests for the gateways; they push those packets to s1's LOCAL port.
table=0,priority=65535,arp,arp_tpa=10.0.0.254 actions=LOCAL
table=0,priority=65535,arp,arp_tpa=20.0.0.254 actions=LOCAL
table=0,priority=65535,arp,arp_tpa=30.0.0.254 actions=LOCAL
table=0,priority=1,arp,nw_dst=10.0.0.1,actions=output:1
table=0,priority=1,arp,nw_dst=20.0.0.1,actions=output:2
table=0,priority=1,arp,nw_dst=30.0.0.1,actions=output:3
table=0,priority=0,actions=resubmit(,1)
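To confirm the entries actually landed on s1, you can dump the flow table from the mininet CLI. This is a sketch; the -O flag matches the OpenFlow13 protocol version configured on the bridges earlier:

sh ovs-ofctl -O OpenFlow13 dump-flows s1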
Distributed Switch Architecture is a protocol for managing hardware switch chips. It consists of a set of MII management registers and commands to configure the switch, and an ethernet header format to signal which of the ports of the switch a packet was received from or is intended to be sent to.
/**
 * struct switchdev_ops - switchdev operations
 *
 * @switchdev_port_attr_get: Get a port attribute (see switchdev_attr).
 *
 * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr).
 *
 * @switchdev_port_obj_add: Add an object to port (see switchdev_obj_*).
 *
 * @switchdev_port_obj_del: Delete an object from port (see switchdev_obj_*).
 *
 * @switchdev_port_obj_dump: Dump port objects (see switchdev_obj_*).
 */
struct switchdev_ops {
    int (*switchdev_port_attr_get)(struct net_device *dev,
                                   struct switchdev_attr *attr);
    int (*switchdev_port_attr_set)(struct net_device *dev,
                                   const struct switchdev_attr *attr,
                                   struct switchdev_trans *trans);
    int (*switchdev_port_obj_add)(struct net_device *dev,
                                  const struct switchdev_obj *obj,
                                  struct switchdev_trans *trans);
    int (*switchdev_port_obj_del)(struct net_device *dev,
                                  const struct switchdev_obj *obj);
    int (*switchdev_port_obj_dump)(struct net_device *dev,
                                   struct switchdev_obj *obj,
                                   switchdev_obj_dump_cb_t *cb);
};
/* There was a problem installing this route to the offload
 * device. For now, until we come up with more refined
 * policy handling, abruptly end IPv4 fib offloading for
 * the entire net by flushing offload device(s) of all
 * IPv4 routes, and mark IPv4 fib offloading broken from
 * this point forward.
 */
+
+
Currently, when rules are added, conditions are also applied to filter them; not every FIB entry gets offloaded:
#ifdef CONFIG_IP_MULTIPLE_TABLES
    if (fi->fib_net->ipv4.fib_has_custom_rules)
        return 0;
#endif

    if (fi->fib_net->ipv4.fib_offload_disabled)
        return 0;
int register_switchdev_notifier(struct notifier_block *nb);
int unregister_switchdev_notifier(struct notifier_block *nb);
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
                             struct switchdev_notifier_info *info);
struct drbd_transport {
    struct drbd_transport_ops *ops;
    struct drbd_transport_class *class;

    struct list_head paths;

    const char *log_prefix;        /* resource name */
    struct net_conf *net_conf;     /* content protected by rcu */

    /* These members are intended to be updated by the transport: */
    unsigned int ko_count;
    unsigned long flags;
};

struct dtt_listener {
    struct drbd_listener listener;
    void (*original_sk_state_change)(struct sock *sk);
    struct socket *s_listen;

    wait_queue_head_t wait;    /* woken if a connection came in */
};
+
/* An "abstract base class" for transport implementations. I.e. it
   should be embedded into a transport specific representation of a
   listening "socket" */
struct drbd_listener {
    struct kref kref;
    struct drbd_resource *resource;
    struct list_head list;       /* link for resource->listeners */
    struct list_head waiters;    /* list head for paths */
    spinlock_t waiters_lock;
    int pending_accepts;
    struct sockaddr_storage listen_addr;
    void (*destroy)(struct drbd_listener *);
};

struct drbd_path {
    struct sockaddr_storage my_addr;
    struct sockaddr_storage peer_addr;

    struct kref kref;

    int my_addr_len;
    int peer_addr_len;
    bool established;    /* updated by the transport */

    struct list_head list;            /* paths of a connection */
    struct list_head listener_link;   /* paths waiting for an incoming connection,
                                         head is in a drbd_listener */
    struct drbd_listener *listener;
};
In the structure shown below you can see a transport member: every connection is bound to one transport object, and all operations on that connection are carried out through it. Under this architecture, each connection can therefore use a different transport implementation, for example native TCP or one you implement yourself.
+
struct drbd_connection {
    struct list_head connections;
    ..................
    unsigned int peer_node_id;
    struct list_head twopc_parent_list;
    struct drbd_transport transport;   /* The transport needs to be the last member. The actual
                                          implementation might have more members than the
                                          abstract one. */
};

Each connection maintains its paths with a doubly linked list, so the connection holds a list head pointing to the first element of that list, as shown in the figure below.
+
Listener and Path
Each listener represents one listening socket, and several paths can share the same listening socket because only one of them will actually be used. Architecturally, each listener therefore also keeps a doubly linked list of the paths that use it, as shown in the figure below.
+
Resource and Connection/Listener

Resource is the topmost object and manages all connections, so it also uses a doubly linked list to track them. In addition, so that all listeners can be found quickly in certain steps, the resource keeps another doubly linked list chaining the listeners. Drawing all of the above as one picture, simplifying every doubly linked list to a single link and using different arrow styles for different link types, gives the figure below.

    struct drbd_transport transport;   /* The transport needs to be the last member. The actual
                                          implementation might have more members than the
                                          abstract one. */
};
static bool conn_connect(struct drbd_connection *connection)
{
    ................
 start:
    have_mutex = false;
    clear_bit(DISCONNECT_EXPECTED, &connection->flags);
    if (change_cstate(connection, C_CONNECTING, CS_VERBOSE) < SS_SUCCESS) {
        /* We do not have a network config. */
        return false;
    }

    /* Assume that the peer only understands protocol 80 until we know better. */
    connection->agreed_pro_version = 80;

    err = transport->ops->connect(transport);
    if (err == -EAGAIN) {
        if (connection->cstate[NOW] == C_DISCONNECTING)
            return false;
        goto retry;
    } else if (err < 0) {
        drbd_warn(connection, "Failed to initiate connection, err=%d\n", err);
        goto abort;
    }
+
Next it sets each socket's send/receive timeout; see SO_RCVTIMEO and SO_SNDTIMEO for the details. Note that because the underlying layer here is a Linux socket, the timeouts are set this way; if RDMA were used instead, the approach would be completely different.
+
    connection->last_received = jiffies;

    rcu_read_lock();
    nc = rcu_dereference(connection->transport.net_conf);
    ping_timeo = nc->ping_timeo;
    ping_int = nc->ping_int;
    rcu_read_unlock();

    /* Make sure we are "uncorked", otherwise we risk timeouts,
     * in case this is a reconnect and we had been corked before. */
    drbd_uncork(connection, CONTROL_STREAM);
    drbd_uncork(connection, DATA_STREAM);

    /* Make sure the handshake happens without interference from other threads,
     * or the challenge respons authentication could be garbled. */
    mutex_lock(&connection->mutex[DATA_STREAM]);
    have_mutex = true;
    transport->ops->set_rcvtimeo(transport, DATA_STREAM, ping_timeo * 4 * HZ/10);
    transport->ops->set_rcvtimeo(transport, CONTROL_STREAM, ping_int * HZ);
You can clearly see that when the second argument is up, adm_up is actually called to do the rest of the work. Next, let's look at adm_up.
+
adm_up
/* The "main" loop iterates over resources.
 * This "sorts" the drbdsetup commands to bring those up
 * so we will later first create all objects,
 * then attach all local disks,
 * adjust various settings,
 * and then configure the network part */
static int adm_up(const struct cfg_ctx *ctx)
{
    .........
    return 0;
}
+
As you can see, this function is responsible for a lot: it basically takes care of objects, disks and networking for you. Here we focus on the network-related part. It first calls set_peer_in_resource.

But what does "providing network capability" actually mean? In my view this part is simply not defined. After all, who says networking has to be IPv4? Who says it has to go at least up to Layer 3 IP? Can't endpoints be connected point to point? Mostly it is because most users have no need for such features, and the most common transport today is IPv4 + TCP/UDP, which is why most CNI discussions only talk about that.
Next, to let external clients reach this Nginx server (the ingress server), the steps depend on your environment. In my bare-metal environment I deploy the following resources and expose the Nginx server to the outside through a Kubernetes NodePort Service.
kubectl -n default get all
NAME                            READY   STATUS    RESTARTS   AGE
pod/jupyter                     1/1     Running   0          15h
pod/nginx-7dd9f89db4-tvfkk      1/1     Running   0          15h
pod/nginx-v2-5c45597f57-746p8   1/1     Running   0          15h

NAME                 TYPE           CLUSTER-IP      EXTERNAL-IP   PORT(S)        AGE
service/jupyter      LoadBalancer   10.106.190.88   <pending>     80:32444/TCP   15h
service/kubernetes   ClusterIP      10.96.0.1       <none>        443/TCP        89d
service/nginx        ClusterIP      10.110.237.87   <none>        80/TCP         15h
service/nginx-v2     ClusterIP      10.103.43.44    <none>        80/TCP         15h

NAME                       DESIRED   CURRENT   UP-TO-DATE   AVAILABLE   AGE
deployment.apps/nginx      1         1         1            1           15h
deployment.apps/nginx-v2   1         1         1            1           15h

NAME                                  DESIRED   CURRENT   READY   AGE
replicaset.apps/nginx-7dd9f89db4      1         1         1       15h
replicaset.apps/nginx-v2-5c45597f57   1         1         1       15h
Once the above is deployed, first use curl against the ClusterIPs of the three services to confirm they are up.
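For example, something like the following, run from a node inside the cluster (the ClusterIPs are the ones shown in the listing above):

curl http://10.110.237.87      # nginx
curl http://10.103.43.44       # nginx-v2
curl http://10.106.190.88      # jupyter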
Because our Nginx server is exposed through a NodePort, we first need to check which NodePort was opened:
$ kubectl -n ingress-nginx get svc
NAME            TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)                      AGE
ingress-nginx   NodePort   10.111.134.97   <none>        80:32663/TCP,443:31309/TCP   1d
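With the NodePort known, a quick external test might look like the sketch below; the node IP and Host header are placeholders for your own environment and ingress rules:

NODE_IP=192.168.0.10           # replace with the IP of any kubernetes node
curl -H "Host: nginx.example.com" http://$NODE_IP:32663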
apiVersion: certmanager.k8s.io/v1alpha1
kind: ClusterIssuer
metadata:
  name: cert-demo
  namespace: default
spec:
  acme:
    #server: https://acme-v02.api.letsencrypt.org/directory
    server: https://acme-staging-v02.api.letsencrypt.org/directory
    email: your@mail.com
    # Name of a secret used to store the ACME account private key
    privateKeySecretRef:
      name: cert-demo
    # ACME DNS-01 provider configurations
    dns01:
      # Here we define a list of DNS-01 providers that can solve DNS challenges
      providers:
      - name: cf-dns
        cloudflare:
          email: your@mail.com
          # A secretKeyRef to a cloudflare api key
          apiKeySecretRef:
            name: cloudflare
            key: api
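After applying the manifest, the Issuer's registration status can be checked with kubectl (a sketch; the resource name matches the metadata above), which produces output like the excerpt that follows:

kubectl apply -f clusterissuer.yaml
kubectl describe clusterissuer cert-demo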
.....
Status:
  Acme:
    Uri:  https://acme-staging-v02.api.letsencrypt.org/acme/acct/7037688
  Conditions:
    Last Transition Time:  2018-09-30T16:51:03Z
    Message:               The ACME account was registered with the ACME server
    Reason:                ACMEAccountRegistered
    Status:                True
    Type:                  Ready
Events:  <none>

Once this shows True/Ready, the Issuer side is done.
Certificates
Before moving on to the YAML there are a few important things to note; a Certificate sketch follows these notes.

Each Certificate can carry Subject Alternative Names (SANs), i.e. it can cover multiple domain names.

The Issuer must be able to solve a challenge for every configured domain name. In fact an Issuer can be configured with several solving methods, and even several DNS providers at the same time.
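As a reference, a minimal Certificate that ties these notes together might look like the sketch below; the domain names and secret name are placeholders, while the issuerRef/provider names match the ClusterIssuer defined earlier. Once applied, cert-manager records events like the ones shown after this sketch.

apiVersion: certmanager.k8s.io/v1alpha1
kind: Certificate
metadata:
  name: cert-demo-tls
  namespace: default
spec:
  secretName: cert-demo-tls
  issuerRef:
    name: cert-demo
    kind: ClusterIssuer
  dnsNames:
  - example.com
  - www.example.com
  acme:
    config:
    - dns01:
        provider: cf-dns
      domains:
      - example.com
      - www.example.com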
Events:
  Type    Reason        Age   From          Message
  ----    ------        ----  ----          -------
  Normal  CreateOrder   16s   cert-manager  Created new ACME order, attempting validation...
  Normal  IssueCert     15s   cert-manager  Issuing certificate...
  Normal  CertObtained  13s   cert-manager  Obtained certificate from ACME server
  Normal  CertIssued    13s   cert-manager  Certificate issued successfully
check() {
    count=`ebtables-save | grep ctc | wc -l`
    if [ "$count" == "0" ]; then
        echo "Delete Success"
    else
        echo "Delete Fail, Use the ebtables-save to check what rules still exist"
    fi
}

if [ "$1" == "d" ]; then
    delete
    check
else
    insert
fi

check() {
    count=`iptables-save | grep ctc | wc -l`
    if [ "$count" == "0" ]; then
        echo "Delete Success"
    else
        echo "Delete Fail, Use the iptables-save to check what rules still exist"
    fi
}

if [ "$1" == "d" ]; then
    delete
    check
else
    insert
fi
Setup iptables
Unlike the ebtables case, there are more changes on the iptables side, for the following reasons:

This scenario is WAN to Container, which means traffic crosses different subnets.

Because my environment is quite clean, I simply match packets on the WAN IP and the container IP, as sketched after this list.
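The insert/delete functions of the script are not shown in this excerpt; a minimal sketch of the kind of LOG rules it might add (the addresses are hypothetical, and the wtc prefix is what the check function greps for) could look like this:

WAN_IP=10.0.0.1          # hypothetical WAN address
CONTAINER_IP=172.17.0.2  # hypothetical container address
iptables -t raw    -I PREROUTING -s $WAN_IP -d $CONTAINER_IP -j LOG --log-prefix "wtc-raw-prerouting"
iptables -t nat    -I PREROUTING -s $WAN_IP                  -j LOG --log-prefix "wtc-nat-prerouting"
iptables -t filter -I FORWARD    -s $WAN_IP -d $CONTAINER_IP -j LOG --log-prefix "wtc-filter-forward"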
check() {
    count=`iptables-save | grep wtc | wc -l`
    if [ "$count" == "0" ]; then
        echo "Delete Success"
    else
        echo "Delete Fail, Use the iptables-save to check what rules still exist"
    fi
}

if [ "$1" == "d" ]; then
    delete
    check
else
    insert
fi
Test
For testing I originally used curl to connect to the nginx container, but curl actually does too much: besides the initial TCP three-way handshake it also performs an HTTP GET. Since we only want to observe the WAN-to-Container round trip, that is more than we need.
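A lighter-weight probe is to open a single TCP connection and close it again, for example with netcat (the container address and port are assumptions; substitute your nginx container's values):

nc -z -v 172.17.0.2 80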
before_install runs before the install step, which is meant to install any required packages or dependencies. You can prepare things before you run this step, or you can e.g. run sudo apt-get update to refresh the apt indexes.
before_script runs before the actual test/build script runs. It’s commonly used to run any preparation steps required to get the build running, for instance copy database configurations, set up any additional environment configuration, and so on.
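A minimal .travis.yml sketch illustrating where the two hooks sit; the package and file names are placeholders, not taken from the original post:

language: python
before_install:
  - sudo apt-get update                                   # refresh the apt indexes
install:
  - pip install -r requirements.txt                       # install dependencies
before_script:
  - cp config/database.example.yml config/database.yml   # copy database configuration
script:
  - pytest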
Before diving into the topic, let's design a simple schema for this discussion. Suppose the database has two main structures, User and Pod. A User simply describes a user. Don't worry about what a Pod is; it is a resource created through a User, so every Pod has a field recording which User created it.
Use the following command to confirm that the kubernetes service we just deployed really has sessionAffinity set:

vortex-dev:01:40:58 [~/go/src/github.com/hwchiu/kubeDemo/services](master)vagrant
$ kubectl get service k8s-nginx-affinity -o jsonpath='{.spec.sessionAffinity}'
ClientIP
vortex-dev:03:34:14 [~]vagrant
$ kubectl get svc
NAME             TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)        AGE
k8s-nginx-node   NodePort   10.98.128.179   <none>        80:30136/TCP   1d

vortex-dev:03:43:42 [~]vagrant
$ sudo iptables-save | grep "\-j KUBE-NODEPORTS"
-A KUBE-SERVICES -m comment --comment "kubernetes service nodeports; NOTE: this must be the last rule in this chain" -m addrtype --dst-type LOCAL -j KUBE-NODEPORTS
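Following that chain, you can also grep for the NodePort itself (30136 from the service above) to see the per-service rules that KUBE-NODEPORTS jumps to; this is a follow-up check, not part of the original text:

sudo iptables-save -t nat | grep 30136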
vortex-dev:05:43:54 [~/go/src/github.com/hwchiu/kubeDemo](master)vagrant
$ kubectl get endpoints k8s-nginx-cluster
NAME                ENDPOINTS                                       AGE
k8s-nginx-cluster   10.244.0.88:80,10.244.0.89:80,10.244.0.90:80    1d
This article is a set of reading notes on the SD-WAN paper Google published at SIGCOMM 2018. What makes this paper well worth reading is that it is the follow-up to the SIGCOMM 2013 B4 paper: it describes the benefits that introducing SDN brought to B4 and how the B4 network has grown over the intervening years as its environment changed, covering many problems and the ways they were handled. It is genuinely interesting and offers plenty of ideas and concepts to learn from.
Preface
Those who have followed SDN for a long time have probably heard of B4, and perhaps of the paper "B4: Experience with a Globally-Deployed Software Defined WAN", which Google published at SIGCOMM 2013. In my view it is an epoch-making and highly significant paper: in an era when SDN attracted plenty of academic research but little industry confidence, having a networking giant like Google publish a paper describing the benefits of bringing SDN concepts into B4, its inter-datacenter WAN, was a real shot in the arm for SDN as a whole.

More datacenter sites increase the cost of TE (Traffic Engineering) computation, making path computation take longer. The original TE design is site-level, so the more sites there are, the longer path computation takes; the paper describes the relationship as super-linear. Longer TE computation in turn lengthens the time needed to recover whenever anything goes wrong in the data plane.

As the number of sites grows, the pressure on the underlying switches also grows, because the forwarding tables inside a switch have a bounded size.

The most important problem is that adding sites in nearby regions greatly increases the capacity computation and planning effort, and it also confuses application developers: under the site-level design, many sites serve users in the same region yet sit very close to one another.

In the original B4 design, capacity computation and planning mainly dealt with calculating the WAN bandwidth between sites.

So, reading the figure (p.26): the supernodes inside the site can now forward traffic to one another, so TE can still deliver up to 14 units of traffic into the first site. If more than 2 units of traffic head toward the fourth supernode, which only has 2 units of external capacity, the excess is forwarded over sidelinks to the other supernodes, so the site-to-site bandwidth can be used as fully as possible.

One thing the description above leaves unclear is how TE should now compute the capacity of the sidelinks.

Google therefore proposed a new algorithm, supernode-level TE, that differs from the earlier site-level TE.
blktrace is a block-layer I/O tracing mechanism which provides detailed information about request-queue operations up to user space.

blkparse combines the streams of events for the various devices on the various CPUs and produces formatted output of the event information. It takes the output of blktrace and converts it into a readable form.

In the following, we will use blktrace and blkparse to observe the sector numbers written by fio requests. We will use fio to generate two different I/O patterns: sequential write and random write.

After setting up the fio config, use fio to generate the I/O requests. In this example, we ask fio to generate I/O with the sequential write pattern.
fio ${path_of_config} section=sw
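The job file used by the post is not shown here; a minimal sketch that would provide the sw (sequential write) and rw (random write) sections referenced above could look like this, where the device path, block size and runtime are assumptions to adjust for your environment:

[global]
filename=/dev/nvme1n1
direct=1
ioengine=libaio
bs=128k
size=1G
runtime=60

[sw]
rw=write

[rw]
rw=randwrite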
+
During the experiment, you can use the iostat tool to monitor the I/O activity of the device we want to observe.
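For example, an extended per-device view refreshed every second (a generic iostat invocation, not from the original post):

iostat -x nvme1n1 1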
+
Step2
Open another terminal and use blktrace to collect the data. There are two parameters we need: -d indicates the target device blktrace will monitor, and -w limits how long (in seconds) blktrace will run. So our final command looks like the one below.
1
blktrace -d /dev/nvme1n1 -w 60
+
When blktrace finishes, you will find new files created by blktrace whose names are prefixed with nvme1n1.blktrace.xx; the number of files depends on how many CPUs your system has.
+
-rw-r--r-- 1 root root 821152 Jun 2 10:39 nvme1n1.blktrace.0 -rw-r--r-- 1 root root 21044368 Jun 2 10:39 nvme1n1.blktrace.1 -rw-r--r-- 1 root root 462864 Jun 2 10:39 nvme1n1.blktrace.10 -rw-r--r-- 1 root root 737960 Jun 2 10:39 nvme1n1.blktrace.11 -rw-r--r-- 1 root root 865872 Jun 2 10:39 nvme1n1.blktrace.12 -rw-r--r-- 1 root root 755248 Jun 2 10:39 nvme1n1.blktrace.13 -rw-r--r-- 1 root root 4675176 Jun 2 10:39 nvme1n1.blktrace.14 -rw-r--r-- 1 root root 4471480 Jun 2 10:39 nvme1n1.blktrace.15 -rw-r--r-- 1 root root 5070264 Jun 2 10:39 nvme1n1.blktrace.16 -rw-r--r-- 1 root root 5075040 Jun 2 10:39 nvme1n1.blktrace.17 -rw-r--r-- 1 root root 5062104 Jun 2 10:39 nvme1n1.blktrace.18 -rw-r--r-- 1 root root 5586936 Jun 2 10:39 nvme1n1.blktrace.19 -rw-r--r-- 1 root root 3718848 Jun 2 10:39 nvme1n1.blktrace.2
+
Step3
Now we can use blkparse to turn the data we collected with blktrace into human-readable output.
+
We need to point it at the source files; you can simply give the device name without .blktrace.xx, for example nvme1n1, and it will gather all files matching the pattern nvme1n1.blktrace.xx and analyze them together. The -f option is then used to format the output data; you can find more about it via man blkparse.
OUTPUT DESCRIPTION AND FORMATTING The output from blkparse can be tailored for specific use -- in particular, to ease parsing of output, and/or limit output fields to those the user wants to see. The data for fields which can be output include:
a Action, a (small) string (1 or 2 characters) -- see table below for more details
c CPU id
C Command
d RWBS field, a (small) string (1-3 characters) -- see section below for more details
D 7-character string containing the major and minor numbers of the event's device (separated by a comma).
e Error value
m Minor number of event's device.
M Major number of event's device.
n Number of blocks
N Number of bytes
p Process ID
P Display packet data -- series of hexadecimal values
s Sequence numbers
S Sector number

t Time stamp (nanoseconds)
T Time stamp (seconds)
u Elapsed value in microseconds (-t command line option)
U Payload unsigned integer
+
For our observation we use %5T.%9t, %p, %C, %a, %S\n to format the result, which contains the timestamp, process ID, command, action and sector number.
+
Since the trace contains many actions, such as complete, queued, inserted, etc., we can use the -a option to filter actions; you can find more info via man blktrace. In this case we filter on the write action.
+
Finally, use the -o option to set the output file name.
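Putting the pieces together, the blkparse invocation probably looks something like the sketch below (assembled from the options described above, not copied from the original post):

blkparse -i nvme1n1 -f "%5T.%9t, %p, %C, %a, %S\n" -a write -o output.txt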
Since fio forks two processes to handle the job, we use grep to focus on one specific process (pid=22892).
+
grep "22892, fio" output.txt | more
+
Now the result looks good: we can see the sector number (fifth column) increasing. One thing to be careful about is the rows whose action is "C" (completed), because we don't know how the NVMe device handles those requests and replies to the upper layer; we only need to focus on the other actions, such as "Q" (queued: intent to queue I/O at the given location, no real request exists yet) or "I" (inserted: a request is being sent to the I/O scheduler for addition to the internal queue and later service by the driver; the request is fully formed at this time).
0.000147436, 22892, fio, C, 1784576 0.000188830, 22892, fio, Q, 1817728 0.000189783, 22892, fio, G, 1817728 0.000191405, 22892, fio, I, 1817728 0.000192830, 22892, fio, D, 1817728 0.000227655, 22892, fio, Q, 1817856 0.000228457, 22892, fio, G, 1817856 0.000231936, 22892, fio, I, 1817856 0.000233530, 22892, fio, D, 1817856 0.000360361, 22892, fio, Q, 1817984 0.000361310, 22892, fio, G, 1817984 0.000364163, 22892, fio, I, 1817984 0.000366696, 22892, fio, D, 1817984 0.000536731, 22892, fio, Q, 1818112 0.000537758, 22892, fio, G, 1818112 0.000539371, 22892, fio, I, 1818112 0.000541407, 22892, fio, D, 1818112 0.000670209, 22892, fio, Q, 1818240 0.000671345, 22892, fio, G, 1818240 0.000673383, 22892, fio, I, 1818240 0.000676260, 22892, fio, D, 1818240 0.001885543, 22892, fio, Q, 1818368 0.001887444, 22892, fio, G, 1818368 0.001891353, 22892, fio, I, 1818368 0.001895917, 22892, fio, D, 1818368 0.001934546, 22892, fio, Q, 1818496 0.001935468, 22892, fio, G, 1818496 0.001936891, 22892, fio, I, 1818496 0.001938742, 22892, fio, D, 1818496 0.001965818, 22892, fio, Q, 1818624
+
Now we can run all of the above again with the section changed to rw so that fio uses the random write pattern. The blkparse result will then show random sector numbers.
+
Summary
In this article we used blktrace and blkparse to analyze the block-level I/O of fio requests. We observed the sector number field to confirm that fio generates sequential or random writes according to its config.
BlueZ is the official Linux Bluetooth protocol stack. It is an open-source project distributed under the GNU General Public License (GPL). The BlueZ kernel code has been part of the official Linux kernel since version 2.4.6.
1. Running rake setup_github_pages throws the error:
rake aborted!
You have already activated rake 10.0.2, but your Gemfile requires rake 0.9.2.2. Using bundle exec may solve this....

Edit the Gemfile and change the version manually.

2. After running rake setup_github_pages and entering the URL, it complains it cannot find the file:
rake aborted!
no such file or directory - git -remote -v

Add git's path to the PATH environment variable first, then use the Windows built-in shell to run the commands (reference this).

Upload

Because posts containing Chinese require the locale environment variables, and every upload means generating the posts first and then deploying them, lazy me wrote a batch file for this.

batch.bat
set LANG=zh_TW.UTF-8
set LC_ALL=zh_TW.UTF-8
bundle exec rake generate & bundle exec rake deploy
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#define READ_END  0     /* assumed values; these macros come from earlier in the original post */
#define WRITE_END 1
#define RANGE     100   /* range of the random numbers */

int main(){
    int rand1, rand2;
    int fd[2];          // file descriptors for the two ends of the pipe
                        // fd[0] is the read end, fd[1] is the write end
    pid_t pid;          // pid of the child process
    pipe(fd);           // call the pipe system call to create a pipe
    // use fork to create a child process:
    // the child writes data to the pipe, the parent reads data from the pipe
    if((pid = fork()) == 0){            // child process
        srand(getpid());
        close(fd[READ_END]);            // child won't use the read end, so close it
        rand1 = rand() % RANGE;         // create a random number
        write(fd[WRITE_END], &rand1, sizeof(rand1));  // write it to the pipe
        close(fd[WRITE_END]);           // close the write end when done
        printf("%d has been created in child process\n", rand1);
        exit(1);
    }
    else if(pid > 0){                   // parent process
        srand(getpid());
        close(fd[WRITE_END]);           // parent won't use the write end, so close it
        rand2 = rand() % RANGE;         // create a random number
        read(fd[READ_END], &rand1, sizeof(rand1));    // read the child's number from the pipe
        printf("%d has been created in parent process\n", rand2);
        wait(NULL);
        printf("Parent process calculates the sum: %d\n", rand1 + rand2);
        close(fd[READ_END]);            // close the read end
        exit(1);
    }
    return 0;
}
/**
 * The throttler is used to limit how much data is held by Messages from
 * the associated Connection(s). When reading in a new Message, the Messenger
 * will call throttler->throttle() for the size of the new Message.
 */
Throttle *throttler_bytes;
Throttle *throttler_messages;
This function checks whether there is a throttle limit on the number of messages; if there is one and it has been exceeded, a time event is created and stored.
void AsyncConnection::_connect()
{
    ldout(async_msgr->cct, 10) << __func__ << " csq=" << connect_seq << dendl;

    state = STATE_CONNECTING;
    // rescheduler connection in order to avoid lock dep
    // may called by external thread(send_message)
    center->dispatch_event_external(read_handler);
}

/**
 * Notify each Dispatcher of a new Connection. Call
 * this function whenever a new Connection is initiated or
 * reconnects.
 *
 * @param con Pointer to the new Connection.
 */
void ms_deliver_handle_connect(Connection *con) {
    for (list<Dispatcher*>::iterator p = dispatchers.begin();
         p != dispatchers.end();
         ++p)
        (*p)->ms_handle_connect(con);
}
+
/**
 * This function will be called synchronously whenever a Connection is
 * newly-created or reconnects in the Messenger, if you support fast
 * dispatch. It is guaranteed to be called before any messages are
 * dispatched.
 *
 * @param con The new Connection which has been established. You are not
 * granted a reference to it -- take one if you need one!
 */
virtual void ms_handle_fast_connect(Connection *con) {}
The Data Plane Development Kit (DPDK) is a technology led by Intel. It essentially trades CPU for performance: with it, a user-space program can bypass the kernel and talk to the hardware directly. It relies on polling, querying the device constantly to improve overall performance, which also drives CPU usage up. The technique is supported by essentially every x86 CPU, so it is very widely usable and unlikely to tie you to particular hardware.

RDMA

RDMA stands for Remote Direct Memory Access, a transport with low latency, low CPU consumption and kernel bypass. By skipping kernel space, all data traffic is handled directly by the NIC, and remote memory can even be modified without the remote CPU noticing. The supported fabrics are generally InfiniBand and Ethernet; because packets bypass kernel space yet still have to travel over widely deployed formats such as Ethernet, RoCE is used to handle the packet headers. Ceph can already run over RDMA, but according to the developers on the mailing list the current focus is stability rather than performance tuning, so in practice it does not yet show a clear improvement over POSIX with TCP/IP.
/*
 * EventDriver is a wrap of event mechanisms depends on different OS.
 * For example, Linux will use epoll(2), BSD will use kqueue(2) and select will
 * be used for worst condition.
 */
class EventDriver {
 public:
    virtual ~EventDriver() {}   // we want a virtual destructor!!!
    virtual int init(EventCenter *center, int nevent) = 0;
    virtual int add_event(int fd, int cur_mask, int mask) = 0;
    virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
    virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
    virtual int resize_events(int newsize) = 0;
    virtual bool need_wakeup() { return true; }
};
RDMA (Remote Direct Memory Access) is a mechanism which allows a host to access (read, write) memory on a remote host without interrupting its CPU.
+
The advantages of RDMA:

Zero-copy

Kernel bypass

No CPU involvement
With RDMA, our data can be transferred without involving the Linux kernel network stack, providing high performance, low latency and low CPU consumption.
+
This article focuses on how to enable Ceph with RDMA, including how to install Ceph and turn the RDMA function on.
Install
+
I introduce two ways to install the ceph with RDMA, one is use widly used tool ceph-deploy and the other is manually build the ceph.
+
+
ceph-deploy
+
If you use the ceph-deploy to install the ceph, you must make sure the source package you installed is configure with -DWITH_RDMA=ON.
+
You can use the arguments --dev and --dev-commit to select the source package from the official Ceph build system.
+
you can find those avaliabe repos in the ceph site
+
Choose the one you want to install and click through to the next page; you will see something like: Repos > ceph > wip-jd-testing > da2c3dabdad80c01ec3d3258b51640cc0a93e842 > default

wip-jd-testing is the value for --dev and da2c3... is the value for --dev-commit.
+
Use a command like the one below to install Ceph from the repo above.
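For example, with the repo and commit shown above (the node name node1 is a placeholder for your own host):

ceph-deploy install --dev wip-jd-testing --dev-commit da2c3dabdad80c01ec3d3258b51640cc0a93e842 node1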
If you want to use systemd to manage the Ceph daemons, you should modify the systemd unit files to make them support RDMA, because the default configuration fails with an access-permission problem.
+
You can wait the official PR and use the next version.
+
Refer to this PR to modfiy the systemd config by yourself, and you can use systemctl reload the systemd config if you need.
ceph.conf
+
Modify the ms_type to async+rdma, which tell the ceph use the AsyncMessenger + RDMA as your message type.
+
You can use ms_cluster_type and ms_public_type to indicate the message type for your cluster network or public network.
+
Use the command ibdev2netdev to get your device name and use it for ms_async_rdma_device_name
+
If your want to use the port 2 in your NIC for RDMA, set the ms_async_rdma_port_num to 2.
+
You can also use ms_async_rdma_buffer_size, ms_async_rdma_send_buffers and ms_async_rdma_receive_buffers to set the memory you want to allocate for RDMA.
+
ms_async_rdma_send_buffers and ms_async_rdma_receive_buffers are how many work requestes for RDMA send/receive queue respectively.
+
ms_async_rdma_buffer_size is the size os a single registered buffer.
+
The total memory allocated for each application is ms_async_rdma_buffer_size * (ms_async_rdma_send_buffers + ms_async_rdma_receive_buffers); you can refer to here to learn more about it.
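Putting the options together, the [global] section of ceph.conf might contain something like the sketch below; the device name is an example of what ibdev2netdev could report, and the buffer numbers are illustrative only:

[global]
ms_type = async+rdma
ms_async_rdma_device_name = mlx5_0
ms_async_rdma_port_num = 1
ms_async_rdma_buffer_size = 131072
ms_async_rdma_send_buffers = 1024
ms_async_rdma_receive_buffers = 1024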
Update ceph.conf on every node and restart all daemons; after that the Ceph cluster will use RDMA for the public/cluster networks. If you want to make sure RDMA is really in use, you can dump the RDMA packets as follows and open the capture with wireshark.

1. echo "options mlx4_core log_num_mgm_entry_size=-1" | sudo tee -a /etc/modprobe.d/mlx4.conf
2. sudo /etc/init.d/openibd restart
3. ibdump
public class Student
{
    public Dictionary<string, string> name { get; set; }
    public string birthday { get; set; }
    public string studentID { get; set; }
    public List<string> email { get; set; }
}

Student student = new Student
{
    name = new Dictionary<string, string>
    {
        {"firstName", "Hung-Wei"},
        {"lastName", "Chiu"}
    },
    birthday = "19900317",
    studentID = "0156521",
    email = new List<string>
    {
        "sppsorrg@gmail.com",
        "hwchiu@cs.nctu.edu.tw"
    }
};

string a = JsonConvert.SerializeObject(student, Newtonsoft.Json.Formatting.Indented);
Console.WriteLine(a);
XmlDocument xml = new XmlDocument();
xml.Load("student.xml");
XmlNodeList nodeList = xml.GetElementsByTagName("Student");
foreach (XmlNode parentNode in nodeList)
{
    if (parentNode is XmlElement)
    {
        XmlElement element = (XmlElement)parentNode;
        String id = element.GetAttribute("StudentID");
        XmlNodeList childList = element.ChildNodes;

XElement root = XElement.Load("student.xml");
var student = from el in root.Elements("Student")
              where (string)el.Attribute("StudentID") == "156521"
              select el;
foreach (XElement el in student)
{
    foreach (XNode node in el.Nodes())
    {
        Console.WriteLine(node);
    }
}
Module build was successful.
=======================================================================
 With DRBD module version 8.4.5, we split out the management tools
 into their own repository at http://git.linbit.com/drbd-utils.git
 (tarball at http://links.linbit.com/drbd-download)

 That started out as "drbd-utils version 8.9.0",
 and provides compatible drbdadm, drbdsetup and drbdmeta tools
 for DRBD module versions 8.3, 8.4 and 9.

 Again: to manage DRBD 9 kernel modules and above,
 you want drbd-utils >= 8.9.11 from above url.
=======================================================================
git clone http://git.drbd.org/drbd-9.0.git
cd drbd-9.0
make
make install
+
drbd-utils
This project provides all the DRBD user-space tools, including the commonly used drbdadm and drbdsetup. The build basically goes smoothly:
+
+
clone git project
+
autogen
+
configure
+
build
+
install
+
+
When autogen.sh has generated the configure script, the following message is printed:

suggested configure parameters:
# prepare for rpmbuild, only generate spec files
./configure --enable-spec
# or prepare for direct build
./configure --prefix=/usr --localstatedir=/var --sysconfdir=/etc
+
You want me to create a v09 style flexible-size internal meta data block.
There appears to be a v09 flexible-size internal meta data block
already in place on /dev/nvme0n1 at byte offset 400088453120

Do you really want to overwrite the existing meta-data?
[need to type 'yes' to confirm] yes

initializing activity log
initializing bitmap (11924 KB) to all zero
Writing meta data...
New drbd meta data block successfully created.
// Client
public class Client {
    public Point point;
    public ArrayList<Item> itemList;
    public ArrayList<Skill> skillList;

    public Client(String name) {
        PersonManager pm = new PersonManager();
        PersonManager.loadData(name, point, itemList, skillList);
    }
}
This post is a simple walk-through of the source code of the SDN controller Floodlight. Reading the source makes it much clearer how each of the exposed Restful APIs should be used. Compared with how fast documentation gets updated, the code itself iterates much more quickly, so the docs often lag behind what actually runs. By learning to read the source we can grasp the current state of these open-source projects faster and more precisely, and even contribute back by helping fill in the documentation.
Preface
The Floodlight OpenFlow controller ships with a REST server by default and provides the corresponding REST APIs for users to call.

In the core module, eight REST APIs are currently provided:
/wm/core/switch/all/$statType/json
+
/wm/core/switch/$switchId/$statType/json
+
/wm/core/controller/switches/json
+
/wm/core/role/json
+
/wm/core/counter/$counterTitle/json
+
/wm/core/counter/$switchId/$counterName$/json
+
/wm/core/memory/json
+
/wm/core/module/{all}/json
+
+
This first post covers the first two, which concern switch information.
+
Usage
What these two REST APIs do is send an OpenFlow stats request packet to the switch to query the relevant information; the kind of request sent is statType. The current types are listed below.
for (int iSleepCycles = 0; iSleepCycles < 12; iSleepCycles++) {
    for (GetConcurrentStatsThread curThread : activeThreads) {
        if (curThread.getState() == State.TERMINATED) {
            if (rType == REQUESTTYPE.OFSTATS) {
                model.put(HexString.toHexString(curThread.getSwitchId()),
                          curThread.getStatisticsReply());
            } else if (rType == REQUESTTYPE.OFFEATURES) {
                model.put(HexString.toHexString(curThread.getSwitchId()),
                          curThread.getFeaturesReply());
            }
            pendingRemovalThreads.add(curThread);
        }
    }

    // remove the threads that have completed the queries to the switches
    for (GetConcurrentStatsThread curThread : pendingRemovalThreads) {
        activeThreads.remove(curThread);
    }
    // clear the list so we don't try to double remove them
    pendingRemovalThreads.clear();

    // if we are done finish early so we don't always get the worst case
    if (activeThreads.isEmpty()) {
        break;
    }

    // sleep for 1 s here
    try {
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        log.error("Interrupted while waiting for statistics", e);
    }
}
for (SwitchPort dstDap : dstDevice.getAttachmentPoints()) {
    Long dstSwDpid = dstDap.getSwitchDPID();
    Long dstIsland = topology.getL2DomainId(dstSwDpid);

    if ((dstIsland != null) && dstIsland.equals(srcIsland))
        on_same_island = true;

HashMap<Long, Link> nexthoplinks = new HashMap<Long, Link>();
HashMap<Long, Integer> cost = new HashMap<Long, Integer>();
int w;

for (Long node : c.links.keySet()) {
    nexthoplinks.put(node, null);
    cost.put(node, MAX_PATH_WEIGHT);
}

HashMap<Long, Boolean> seen = new HashMap<Long, Boolean>();
PriorityQueue<NodeDist> nodeq = new PriorityQueue<NodeDist>();
nodeq.add(new NodeDist(root, 0));
cost.put(root, 0);
+
Step 2
+
Take the node with the smallest cost out of the queue.

Get the cost of reaching that node.

Do an error check.

If the node has already been examined, skip it.

Add the node to seen.
while (nodeq.peek() != null) {
    NodeDist n = nodeq.poll();
    Long cnode = n.getNode();
    int cdist = n.getDist();
    if (cdist >= MAX_PATH_WEIGHT) break;
    if (seen.containsKey(cnode)) continue;
    seen.put(cnode, true);
+
Step 3
+
Get all links attached to that node (each link is stored twice, once with src and dst swapped).

Depending on isDstRooted, take either the src or the dst of each link (since each link appears twice, no switch is ever missed).
    for (Link link : c.links.get(cnode)) {
        Long neighbor;

        if (isDstRooted == true)
            neighbor = link.getSrc();
        else
            neighbor = link.getDst();

        // links directed toward cnode will result in this condition
        if (neighbor.equals(cnode)) continue;

        if (seen.containsKey(neighbor)) continue;

        if (linkCost == null || linkCost.get(link) == null)
            w = 1;
        else
            w = linkCost.get(link);

        int ndist = cdist + w;  // the weight of the link, always 1 in current version of floodlight.
        if (ndist < cost.get(neighbor)) {
            cost.put(neighbor, ndist);
            nexthoplinks.put(neighbor, link);
            log.info("neibhbor = {}", neighbor.toString());
            //nexthopnodes.put(neighbor, cnode);
            NodeDist ndTemp = new NodeDist(neighbor, ndist);
            // Remove an object that's already in there.
            // Note that the comparison is based on only the node id,
            // and not node id and distance.
            nodeq.remove(ndTemp);
            // add the current object to the queue.
            nodeq.add(ndTemp);
        }
    }
}
In GKE, from 1.9 onwards, this DaemonSet is automatically deployed as an addon. Note that DaemonSet pods are only scheduled on nodes with accelerators attached, they are not scheduled on nodes that don’t have any accelerators attached.
CUDA® is NVIDIA's parallel computing platform and programming model for GPUs. The NVIDIA device drivers you install in your cluster include the CUDA libraries.
CUDA libraries and debug utilities are made available inside the container at /usr/local/nvidia/lib64 and /usr/local/nvidia/bin, respectively.
CUDA applications running in Pods consuming NVIDIA GPUs need to dynamically discover CUDA libraries. This requires including /usr/local/nvidia/lib64 in the LD_LIBRARY_PATH environment variable.
+
First, when you install the NVIDIA driver, the CUDA libraries are installed along with it. The CUDA libraries and utilities can then be found inside the container at /usr/local/nvidia/lib64 and /usr/local/nvidia/bin. Finally, you must set your application's LD_LIBRARY_PATH so that it also looks in /usr/local/nvidia/lib64 when resolving shared libraries.
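In practice that means exporting the path inside the container (or setting it through the pod's env field); a minimal example:

export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH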
+ COS_DOWNLOAD_GCS=https://storage.googleapis.com/cos-tools + COS_KERNEL_SRC_GIT=https://chromium.googlesource.com/chromiumos/third_party/kernel + COS_KERNEL_SRC_ARCHIVE=kernel-src.tar.gz + TOOLCHAIN_URL_FILENAME=toolchain_url + CHROMIUMOS_SDK_GCS=https://storage.googleapis.com/chromiumos-sdk + ROOT_OS_RELEASE=/root/etc/os-release + KERNEL_SRC_DIR=/build/usr/src/linux + NVIDIA_DRIVER_VERSION=384.111 + NVIDIA_DRIVER_DOWNLOAD_URL_DEFAULT=https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run + NVIDIA_DRIVER_DOWNLOAD_URL=https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run + NVIDIA_INSTALL_DIR_HOST=/home/kubernetes/bin/nvidia + NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia ++ basename https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run + NVIDIA_INSTALLER_RUNFILE=NVIDIA-Linux-x86_64-384.111.run + ROOT_MOUNT_DIR=/root + CACHE_FILE=/usr/local/nvidia/.cache + set +x [INFO 2018-07-14 14:57:09 UTC] Running on COS build id 10323.85.0 [INFO 2018-07-14 14:57:09 UTC] Checking if third party kernel modules can be installed [INFO 2018-07-14 14:57:09 UTC] Checking cached version [INFO 2018-07-14 14:57:09 UTC] Found existing driver installation for image version 10323.85.0 and driver version 384.111. [INFO 2018-07-14 14:57:09 UTC] Configuring cached driver installation [INFO 2018-07-14 14:57:09 UTC] Updating container's ld cache [INFO 2018-07-14 14:57:14 UTC] Verifying Nvidia installation Sat Jul 14 14:57:15 2018 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 384.111 Driver Version: 384.111 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | |===============================+======================+======================| | 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 | | N/A 31C P0 62W / 149W | 0MiB / 11439MiB | 100% Default | +-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+ | Processes: GPU Memory | | GPU PID Type Process name Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ [INFO 2018-07-14 14:57:16 UTC] Found cached version, NOT building the drivers. [INFO 2018-07-14 14:57:16 UTC] Updating host's ld cache
NCurses Disk Usage (ncdu) is a powerful tool for viewing file sizes across different directories in a simple and friendly ncurses interface. Besides browsing, you can also perform operations other than reading, such as deleting files or directories. In this post, I will introduce what ncdu is and how to use it to replace the legacy command du.
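A minimal example of installing and running it (the package name ncdu is the one used on Debian/Ubuntu; other distributions may differ):

sudo apt-get install -y ncdu   # install ncdu
ncdu /var                      # scan /var and browse the results interactively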
This post describes how, on FreeBSD, when you run out of space and add an extra disk through your VM manager, you can merge that new disk with the existing one into a single larger storage space. I run into this scenario quite often: when creating a system in a VM I do not always anticipate how much space will be needed, and the disk fills up soon after. Although the existing disk can also be enlarged through the VM manager, this post records the commands and concepts used for this scenario.
Here I collect the problems encountered while installing a news server on FreeBSD 9.1, together with notes on the operations.
Migrating the articles
+
rsync cycbuff
+
rsync db/history
+
Rebuild the overview
+
ctlinnd pause 'make overview'
+
makehistory -x -O -b
-x: won't write out history file entries.
-O: create the overview database.
-b: delete any messages found in the spool that do not have valid Message-ID: headers in them.
The most important feature of containers is resource isolation, covering the mount, network, user, UTS and PID namespaces; that is why a container cannot see those resources of the host. Resource isolation is provided by the Linux kernel. We will demonstrate the networking part using network namespaces and also show how docker uses network namespaces together with a Linux bridge to provide network connectivity for each container.
If you have any experience about setuping a kubernetes cluster before, you must notice that you need to choose one CNI in your kubernetes cluster, and there’re many candidate that you can choose, including the flannel, weave, calico and so on.
+
Most of the kubernetes users and operators don’t know what is the different between those CNI plgins and the only thing they care is that the CNI can make the network works well.
+
So, I will introduce the Container Network Interface (CNI) in the following articles.
+
+
First, I will explain what the bridge network in docker is and how it works. Besides, I will also introduce the Linux network namespace (ns) and use it to create a simple environment.
+
Second, with that basic know-how about network namespaces, we can start to learn what CNI is, why we need it and how it works. We will also use a simple CNI to demonstrate how CNI works with network namespaces.
+
Third, having learned what CNI is, we will implement our own CNI plugin: a simple one, just like the bridge network (docker's default network). That article will be a tutorial on how to write a CNI plugin in golang.
+
+
Introduction
We all know that docker is very easy to use and that we can set up any server we want with a single docker run command.
+
For example, if I want to run a busybox container, I can use docker run busybox to start one in my environment.
+
A more complicated example: we can run a simple nginx server with docker run, as shown in the nginx docker hub repo. Just type the following command in your docker-ready environment.
1
$ docker run --name some-nginx -d -p 8080:80 some-content-nginx
+
You will get an nginx server which listens on port 80 inside its own network, and you can reach it at http://localhost:8080 or http://containerIP:80.
+
Now, type the following again.
1
$ docker run --name some-nginx -d -p 8081:80 some-content-nginx2
+
We get another nginx server which listens on port 80 inside its own network, and you can reach it at http://localhost:8081.
+
This raises a question: how does docker do that? Why can we run two nginx servers listening on port 80 at the same time?
+
If you have any experience writing socket programs, you know that two processes cannot bind/listen on the same tuple (IP, TCP/UDP, port).
+
We need to choose a different port for each process; that is why there are so many well-known port numbers, such as 22, 80 and 443, and why we should avoid using those ports in our own applications.
+
The reason we can do this with docker is the Linux network namespace.
+
The magic that lets docker do this is the Linux network namespace. In the linux kernel, each network namespace has its own network configuration, including network interfaces, routing tables and netfilter rules; you can learn more about it on this website.
+
So when we run a docker container, the system creates a new network namespace and puts it inside the docker container. In our previous example, the system creates two network namespaces when we run the two nginx docker containers, and each container gets its own network stack.
+
Implementation
Now we will learn why we can use http://localhost:8080 to access the nginx container. Besides, we will operate on the network namespace and the linux bridge directly to simulate what docker does when it creates a container.
+
Linux Bridge
By default, docker creates a linux bridge named docker0 when you install docker.io/docker-ce on your system, and that bridge handles network connectivity for every docker container (--net=bridge, which is docker's default option). You can use the following command to see the linux bridge after you install the docker package.
+
We can create our own linux bridge via the brctl command and you can get it by installing the bridge-utils package.
1
$ apt-get install bridge-utils
+
Create our own linux bridge and assign an IP address to it.
If you have installed the docker package, you will see an interface docker0 in the system whose IP address comes from 172.17.0.0/16. In that case, you should give your br0 an address from a different CIDR subnet.
+
1 2 3 4 5 6 7 8 9 10 11
$ brctl show bridge name bridge id STP enabled interfaces docker0 8000.0242b8582904 no $ ifconfig docker0 docker0 Link encap:Ethernet HWaddr 02:42:b8:58:29:04 inet addr:172.17.0.1 Bcast:0.0.0.0 Mask:255.255.0.0 UP BROADCAST MULTICAST MTU:1500 Metric:1 RX packets:0 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:0 RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
+
The following figure shows the system view at this point. The default subnet of docker0 is 172.17.0.0/16, and it can be changed via the docker configuration.
+
We won't discuss what layer-2 bridging is here; the only thing we need to know is that docker uses this bridge to forward packets between the host and the containers.
+
Network Namespace
Now, what will happen when we create a docker container?
1
$ docker run --name some-nginx -d -p 8080:80 some-content-nginx
+
First, docker creates the container and also creates a network namespace inside that container. The whole system looks like the figure below: there is a linux bridge (docker0) and a docker container (nginx).
+
In our example we won't use docker, only network namespaces, so we create a network namespace here.
1
$ ip netns add ns1
+
Up to now, the container (network namespace) has no network connectivity, which means any process inside that container cannot set up a network connection with the outside.
+
Veth
To give the nginx docker container/network namespace network connectivity, we first need to connect two network namespaces together: the linux host and the docker container. Since a network namespace is a logical concept in the linux system, we can use another linux technology, veth, to help us. A veth represents a virtual link that can connect two different network namespaces; each veth pair is made up of two virtual network interfaces. For example, type the following command to create a veth pair.
1 2 3 4 5 6
$ sudo ip link add ve_A type veth peer name ve_B $ ip link 15: ve_B@ve_A: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether be:8f:26:d9:22:50 brd ff:ff:ff:ff:ff:ff 16: ve_A@ve_B: <BROADCAST,MULTICAST,M-DOWN> mtu 1500 qdisc noop state DOWN mode DEFAULT group default qlen 1000 link/ether a2:9b:75:06:51:30 brd ff:ff:ff:ff:ff:ff
+
In the above example we create a veth pair whose virtual network interfaces are ve_A and ve_B. You can use network utilities such as ip link or ifconfig to see them.
+
The system view looks like below: we have a veth pair now, but both sides of the veth pair are still in the same network namespace.
+
+
Next, we need to move one side of the veth pair into the docker container, or more precisely, into the network namespace.
+
Just like we said before, the veth pair is used to connect two network namespaces; we can do that via the ip command.
1 2
$ sudo ip link set ve_B netns ns1 $ sudo ip netns exec ns1 ip link set ve_B name eth0
+
Now ve_B has been moved into the network namespace ns1 and renamed eth0; we can execute commands inside the network namespace to list its interfaces.
1 2 3 4 5 6 7
$ sudo ip netns exec ns1 ifconfig -a eth0 Link encap:Ethernet HWaddr be:8f:26:d9:22:50 BROADCAST MULTICAST MTU:1500 Metric:1 RX packets:0 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1000 RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
+
and you should see the interface eth0 without any IP configuration.
+
Finally, we need to attach the other side of the veth pair to the linux bridge docker0; just use the brctl command.
1
brctl addif docker0 ve_A
+
Good. We have set up a separate network namespace and connected it via the veth pair and the linux bridge.
+
IP management
The next thing we need to handle is assigning an IP address to the docker container/network namespace. Just like above, use ip netns exec ns1 ifconfig eth0 xxxxxx netmask xxxxx to set the IP address on the interface eth0.
+
The question is: how do we decide which IP address to use?
+
Since we use the linux bridge for layer-2 forwarding, we should put all the docker containers/network namespaces and the bridge in the same subnet, which means we should pick an address from 172.17.0.0/16. How the address is picked is up to docker (or to you); you should avoid duplicate IP addresses, since they cause ARP problems. After choosing the IP address, set it on the interface inside the docker container/network namespace; a minimal allocator sketch follows below.
After that, you can repeat the example above to create more network namespaces with different IP addresses and use the ping command to test layer-2 connectivity.
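As a thought experiment, here is a minimal sketch in Go of the kind of sequential allocator described above: it simply walks 172.17.0.0/16 and returns the first address not already in use. The function names and the used set are made up for illustration; docker's real IPAM (in libnetwork) is considerably more sophisticated.

```go
package main

import (
	"fmt"
	"net"
)

// inc increments an IP address in place (byte-wise, with carry).
func inc(ip net.IP) {
	for i := len(ip) - 1; i >= 0; i-- {
		ip[i]++
		if ip[i] != 0 {
			break
		}
	}
}

// nextFreeIP walks the subnet sequentially and returns the first address
// that is not marked as used. Illustrative only.
func nextFreeIP(cidr string, used map[string]bool) (net.IP, error) {
	ip, ipNet, err := net.ParseCIDR(cidr)
	if err != nil {
		return nil, err
	}
	for ip = ip.Mask(ipNet.Mask); ipNet.Contains(ip); inc(ip) {
		if !used[ip.String()] {
			return net.ParseIP(ip.String()), nil
		}
	}
	return nil, fmt.Errorf("no free IP left in %s", cidr)
}

func main() {
	// 172.17.0.1 is taken by the bridge itself in this example.
	used := map[string]bool{"172.17.0.0": true, "172.17.0.1": true}
	ip, err := nextFreeIP("172.17.0.0/16", used)
	if err != nil {
		panic(err)
	}
	fmt.Println(ip) // 172.17.0.2
}
```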
+
iptables
The last thing we need to understand is iptables, and it is an optional step. For a docker container, if we want to reach the container from the outside network, we use the -p flag in the docker run command to declare the port mapping.
+
For example, when we use the following command to create a docker container.
1
$ docker run --name some-nginx -d -p 8080:80 some-content-nginx
+
It will also insert some rules into iptables, and those rules will:
+
+
if the destination port number of a packet is 8080, forward it to the container some-content-nginx.
+
modify the destination ip to the ip address of container some-content-nginx
+
modify the destination port number from 8080 to 80
+
+
+
+
But what if we don't need to access it from outside? Then we don't need those iptables rules at all; that is why I said this is an optional step.
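For illustration only, the kind of DNAT rule described above could be added from Go by shelling out to iptables. The chain, addresses and ports below are assumptions for the example; docker actually manages its own DOCKER chain and adds MASQUERADE and FORWARD rules that are omitted here.

```go
package main

import (
	"log"
	"os/exec"
)

// addDNATRule adds a NAT rule that redirects host traffic arriving on
// hostPort to containerIP:containerPort. Needs root to actually run.
func addDNATRule(hostPort, containerIP, containerPort string) error {
	args := []string{
		"-t", "nat", "-A", "PREROUTING",
		"-p", "tcp", "--dport", hostPort,
		"-j", "DNAT", "--to-destination", containerIP + ":" + containerPort,
	}
	out, err := exec.Command("iptables", args...).CombinedOutput()
	if err != nil {
		log.Printf("iptables failed: %v: %s", err, out)
	}
	return err
}

func main() {
	// Assumed container address, matching the 8080:80 mapping above.
	if err := addDNATRule("8080", "172.17.0.2", "80"); err != nil {
		log.Fatal(err)
	}
}
```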
+
Summary
According to the example above, we know that docker networking is based on the linux network namespace.
+
What will happen when we run a docker container?
+
+
set up a linux bridge (usually created when you install docker)
+
create a network namespace
+
create a veth pair (virtual ethernet link)
+
attach the veth pair to target network namespace.
+
find a unique IP address and assign it to the target network namespace.
+
setup the iptables rules if you want to access it from outside.
+
+
In the next posts, I will talk about what CNI is, why we need it and how it works.
The Container Network Interface (CNI) acts as the network interface between a network solution and a container mechanism. Without the CNI, every network-solution developer would have to implement their plugin for every container environment, which would be a disaster. Fortunately, with the help of the CNI, a developer can focus on a single interface and it will work for every container mechanism. In this post we will see why we need the CNI, what the CNI is and how kubernetes uses the CNI to provide network connectivity for its computing unit, the so-called Pod.
In this post, I will try to introduce the concept of the Container Network Interface (CNI), including why we need it, how it works and what it does.
+
If you are not familiar with the linux network namespace and how docker handles the network for its containers, you should read [CNI] Bridge Network In Docker to learn those concepts first; they will be helpful for this tutorial.
+
Introduction
Why We Need CNI
In the previous post, we learned the procedure behind the basic bridge network in docker.
+
+
Create a Linux Bridge
+
Create a Network Namespace
+
Create a Veth Pair
+
Connect the bridge and network namespace with veth pair
+
Setup the IP address to the network namespace
+
Set up the iptables rules for exposing the services (optional)
+
+
However, that is just the bridge network and it only provides layer-2 forwarding. For some use cases that is not enough; there are more and more requirements, such as layer-3 routing, overlay networks, high performance, openvswitch and so on.
+
From docker's point of view, it is impossible to implement and maintain all of those requirements by itself.
+
The better solution is to open up an interface so that everyone can write their own network service, and that is how docker networking works.
+
So there are many plugins for docker networking now, and everyone can choose the kind of network they want.
+
Unfortunately, docker is not the only container technology; there are other competitors such as rkt and lxc. Besides, there are more and more container cluster orchestrators: docker swarm, mesos, kubernetes and so on.
+
Take the bridge network as an example: do we need to implement it for every container orchestrator/solution? Do we need to write a lot of duplicate code because the interfaces between orchestrators are not unified?
+
That is why we need the Container Network Interface (CNI). The Container Network Interface is a Cloud Native Computing Foundation project, and you can find more information here.
+
With the CNI, we have a unified interface for network services: we implement our network plugin once, and it works everywhere the CNI is supported.
+
According to the official website, those container runtime solutions all support the CNI.
Container Network Interface is a specification which defines the interface you should implement.
+
To make it easy for developers to develop their own CNI plugins, the Container Network Interface project also provides several libraries, all of them written in golang.
In the CNI specification, there are three methods we need to implement for our own plugin.
+
+
ADD
+
DELETE
+
VERSION
+
+
ADD is invoked when the container has been created: the plugin should prepare resources and make sure the container has network connectivity. DELETE is invoked when the container has been destroyed: the plugin should remove all allocated resources. VERSION reports the version of the CNI plugin.
+
For each method, the CNI interface will pass the following information into your plugin
+
+
ContainerID
+
Netns
+
IfName
+
Args
+
Path
+
StdinData
+
+
I will explain those fields in detail in the next tutorial. Here we only need to know that, as a CNI plugin, we should use the ContainerID, the network namespace path, the interface name and the StdinData to give the container network connectivity.
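For reference, in the Go helper library (the skel package of the containernetworking/cni project) those values are handed to the plugin as a CmdArgs struct that looks roughly like this:

```go
// From the skel package of github.com/containernetworking/cni:
// the container runtime fills this in and passes it to the plugin.
type CmdArgs struct {
	ContainerID string // ID of the container being set up or torn down
	Netns       string // path of the container's network namespace
	IfName      string // interface name to create inside the container
	Args        string // extra key=value arguments from the runtime
	Path        string // search path for other CNI binaries
	StdinData   []byte // the raw network configuration (JSON) read from stdin
}
```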
+
Take the previous bridge network as an example: the network namespace will be created by the orchestrator, which passes the path of that namespace to the CNI via the netns variable. After we create the veth pair and connect it to the network namespace, we should set the interface name to IfName.
+
For the IPAM (IP Address Management), we can read the information from StdinData and calculate which IP address we should use inside the CNI plugin.
+
Kubernetes
Now we will see how kubernetes uses the CNI to provide networking for Pods.
+
Configuration
In order to use the CNI, we need to configure the kubelet to use the CNI method. There are three arguments we need to take care of:
+
+
cni-bin-dir: the directory of CNI binaries.
+
cni-conf-dir: the directory of CNI config files, common CNI(flannel/calico..etc) will install its config into here.
+
network-plugin: the type of network-plugin for Pods.
In the cni-conf-dir we put the CNI configuration, and kubernetes will use that config for your Pods. In my kubernetes cluster I installed the flannel CNI, and flannel installs its config here.
When kubelet receives a request to create a Pod on the node, it first searches the cni-conf-dir in alphabetical order and inspects the config files.
+
Take 10-flannel.conf as an example: once the kubelet knows the type is flannel, it tries to call the flannel binary in the cni-bin-dir, i.e. /opt/cni/bin/flannel.
Before kubelet creates the Pod's containers, it creates a pause container first and follows the CNI steps to set up the network for that pause container (assuming we use network-plugin=cni).
+
Now the pause container is running and has network connectivity. The kubelet then creates the containers described in the yaml file and attaches them to that pause container (with the docker command we can use --net=container:$containerID to do the same thing).
+
With this procedure we make sure all containers share the same network stack, and a crash of any container will not destroy the network stack, since the network stack is held by the pause container.
+
The combination of the pause container and the user containers is called a Pod. You can run docker ps on your kubernetes node to see how many pause containers are there.
+
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
vortex-dev:05:19:30 [~]vagrant $sudo docker ps -a | grep pause 8838b9614a30 k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_nfs-provisioner-5b75397b4807c54ad4fe92e2-6954c749cc-cn5jh_vortex_9f2f692c-a130-11e8-9450-02ddf6cab53d_0 0a232459f786 k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_vortex-server-58895cd7c6-xvd8g_vortex_7d88347b-9f9a-11e8-8719-02ddf6cab53d_8 b0ca4ca2405d k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_kube-state-metrics-7d7d7b6bbc-fsf7b_vortex_7d83db65-9f9a-11e8-8719-02ddf6cab53d_7 63a1f3b8a35f k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_coredns-78fcdf6894-s8ts5_kube-system_c9ef514c-9a23-11e8-9c21-02ddf6cab53d_9 310b7a6daa54 k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_cadvisor-zk8bk_vortex_7d726ff5-9f9a-11e8-8719-02ddf6cab53d_3 3f0141a5a9b6 k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours k8s_POD_network-controller-server-tcp-nnvgk_vortex_7d648d43-9f9a-11e8-8719-02ddf6cab53d_2 9cedcb482e69 k8s.gcr.io/pause:3.1 "/pause" 7 hours ago Up 7 hours
+
Summary
The Container Network Interface (CNI) makes it easier for network-service developers to develop their own network plugins. They do not need to write duplicate code for different systems/orchestrators: write once, run everywhere.
+
The CNI consists of a specification and many useful libraries for developers. The CNI only cares about the ADD and DELETE events: the CNI plugin should make sure the container has network connectivity when the ADD event is triggered, and remove all allocated resources when the DELETE event is triggered.
+
In the next tutorial, I will show how to write a simple bridge CNI plugin in golang.
As we know, kubernetes uses the CNI to provide network connectivity for its Pod unit, and the cluster administrator can choose which CNI to install in the cluster. For example, if the only requirement is an overlay network, you can choose the flannel CNI; if you need BGP, choose the calico CNI. In this post we will learn how to write our own CNI in golang; actually, you can implement it in any language you like.
In this post, I will show how to write your own CNI program.
+
A Container Network Interface (CNI) plugin can be implemented in any programming language you like.
+
You just have to follow the interface, and your program can be used by every infrastructure that uses the CNI for its network connectivity.
+
In this tutorial, I will use golang to implement a simple CNI plugin which creates a Linux Bridge on the host and connects the container to the host.
To help developers write their own CNI plugins, the CNCF has set up two projects.
+
Those projects are based on the golang language and provide useful libraries for the developer to control the Linux network functions, such as IP, netlink and network namespace.
+
The ContainerNetworking/CNI provides the basic function for CNI implementation in golang and you can see the introduction of that project in its README
+
+
As well as the specification, this repository contains the Go source code of a library for integrating CNI into applications and an example command-line tool for executing CNI plugins. A separate repository contains reference plugins and a template for making new plugins.
+
+
The other project, ContainerNetworking/Plugins, provides some basic network functions for your CNI, and it can be divided into two types.
+
Basic CNI
It provides some basic CNIs, such as Bridge, MacVlan, Host Device and so on. You can chain those CNIs with your own and combine them into a more powerful CNI.
+
IPAM
IPAM (IP Address Management) provides methods to handle IP/route management. It currently offers three methods: host-local, dhcp and static.
+
With host-local, you just provide a configuration file describing which subnet/gateway you want to use, and it allocates an unused IP address from that subnet for your CNI. With dhcp, a DHCP client runs for each container and sends a dhcp request to get an IP address from the dhcp server.
+
In this tutorial, we will implement a bridge CNI and explain those functions step by step.
+
Before We Start
Before we start to implement the CNI, we must know the interface/specification of the CNI.
+
+
Your CNI will be invoked when the container is about to be created or has been terminated.
+
+
+
Allocate resources for the container, including the IP address and the network connectivity.
+
Remove all resources you allocated before when a container has been terminated.
+
+
+
The caller will pass the following information into your CNI program
+
+
+
Command (What kind of the event you should care)
+
ADD
+
DELETE
+
VERSION
+
+
+
ContainerID (The target ContainerID)
+
NetNS (the network namespace path of the container)
+
IFNAME (The interface name should be created in the container)
+
PATH (The current working PATH, you should use it to execute other CNI)
+
STDIN (The configuration file of your CNI)
+
+
Step By Step
For each step, you can find a corresponding folder in my github repo with all the golang files for that step.
+
Step1
First, we need to provide two functions for the ADD and DELETE events, which are used to allocate/recycle resources when the container is started/terminated.
Use go build to build the binary and assume our executable is called example; then we should provide a basic configuration file containing the information our CNI needs. Let's call the file configuration; its contents look like the fields of the SimpleBridge struct used throughout this tutorial.
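The contents of that configuration file are not reproduced here, but judging from the SimpleBridge struct used later in this tutorial it presumably carries a bridgeName and an ip field. A minimal skeleton of such a plugin might look like the sketch below, assuming an older release of the containernetworking/cni library in which skel.PluginMain takes the ADD handler, the DELETE handler and the supported versions:

```go
package main

import (
	"encoding/json"
	"fmt"
	"os"

	"github.com/containernetworking/cni/pkg/skel"
	"github.com/containernetworking/cni/pkg/version"
)

// SimpleBridge mirrors the configuration file, presumably something like
// {"bridgeName": "test", "ip": "192.0.2.15/24"} given the values used later.
type SimpleBridge struct {
	BridgeName string `json:"bridgeName"`
	IP         string `json:"ip"`
}

func cmdAdd(args *skel.CmdArgs) error {
	sb := SimpleBridge{}
	// The configuration file is delivered on stdin by the caller.
	if err := json.Unmarshal(args.StdinData, &sb); err != nil {
		return err
	}
	// Later steps create the bridge, the veth pair and the IP address here.
	fmt.Fprintf(os.Stderr, "ADD: %+v\n", sb)
	return nil
}

func cmdDel(args *skel.CmdArgs) error {
	// Resource cleanup would go here.
	return nil
}

func main() {
	// Newer releases of the library also take a CHECK handler and an
	// "about" string; the three-argument form matches older releases.
	skel.PluginMain(cmdAdd, cmdDel, version.All)
}
```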
br := &netlink.Bridge{
	LinkAttrs: netlink.LinkAttrs{
		Name: sb.BridgeName,
		MTU:  1500,
		// Let kernel use default txqueuelen; leaving it unset
		// means 0, and a zero-length TX queue messes up FIFO
		// traffic shapers which use TX queue length as the
		// default packet limit
		TxQLen: -1,
	},
}
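The struct above only describes the bridge; to actually create the device in the kernel and bring it up, the ADD handler still has to call into netlink. A rough sketch of the missing calls (continuing inside cmdAdd, assuming the github.com/vishvananda/netlink package plus fmt and syscall are imported):

```go
// Create the bridge device if it does not exist yet, then bring it up.
// The real containernetworking bridge plugin handles the "already exists"
// case more carefully; this is only a sketch.
if err := netlink.LinkAdd(br); err != nil && err != syscall.EEXIST {
	return fmt.Errorf("could not add bridge %q: %v", sb.BridgeName, err)
}
if err := netlink.LinkSetUp(br); err != nil {
	return fmt.Errorf("could not set bridge %q up: %v", sb.BridgeName, err)
}
```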
Use the aforementioned command to call the binary again and you should see the linux bridge test has been created.
+
If you don't have the brctl command, use apt-get install bridge-utils to install the bridge tools.
+
Step3
In the next step, we will create a veth to connect the linux bridge and the target container.
+
The logical flow is:
+
+
Get the bridge object from the Bridge we created before
+
Get the namespace of the container
+
Create a veth on the container and move the host-end veth to host ns.
+
Attach a host-end veth to linux bridge
+
+
This step is more complicated than the previous ones, since we handle the network namespace here. Fortunately, the CNI project provides convenience functions to handle the veth, and they cover action (3) in the list above.
+
First, we use the netlink.LinkByName method to lookup the netlink object.
+
1 2 3 4
l, err := netlink.LinkByName(sb.BridgeName) if err != nil { return fmt.Errorf("could not lookup %q: %v", sb.BridgeName, err) }
+
and then we need to make sure that the object is a *netlink.Bridge, so we do a type assertion.
1 2 3 4
newBr, ok := l.(*netlink.Bridge) if !ok { return fmt.Errorf("%q already exists but is not a bridge", sb.BridgeName) }
+
Second, since the CmdArgs already provides the network namespace path of the container, we can use a method from the ns package to load the network namespace object.
Every NetNS object implements a function Do, which takes a function as its parameter; that function in turn receives the caller's network namespace as its own parameter.
+
The Do function switches into the NetNS object's network namespace, calls the function you passed in, and feeds it the original network namespace as a parameter.
+
See the following example to learn more about the Do function.
First, we create a function handler which calls ip.SetupVeth to create a veth pair in the caller's network namespace and move one side of the pair into its third parameter (hostNS).
+
When we call netns.Do(handler), it calls handler inside netns's network namespace and passes the caller's (host's) network namespace to it. The result is a veth pair between the host's network namespace and netns's network namespace.
+
In order to store the information about that veth pair, we can use the current.Interface{} object to store the data.
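Putting those pieces together, the handler might look like the following sketch (a fragment from inside cmdAdd, assuming the ns and ip helper packages from containernetworking/plugins and the current types package; exact signatures vary a little between releases):

```go
hostIface := &current.Interface{}
contIface := &current.Interface{}

// args.Netns is the namespace path passed in through CmdArgs.
netns, err := ns.GetNS(args.Netns)
if err != nil {
	return fmt.Errorf("failed to open netns %q: %v", args.Netns, err)
}
defer netns.Close()

// The body runs inside the container's network namespace; hostNS is the
// namespace we were called from (the host side).
err = netns.Do(func(hostNS ns.NetNS) error {
	// Creates args.IfName (eth10) here and moves the peer into hostNS.
	hostVeth, containerVeth, err := ip.SetupVeth(args.IfName, 1500, hostNS)
	if err != nil {
		return err
	}
	hostIface.Name = hostVeth.Name
	contIface.Name = containerVeth.Name
	contIface.Sandbox = netns.Path()
	return nil
})
if err != nil {
	return err
}
```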
Now, we can get the interface name of veth pair in the host side by hostIface.Name and then we will attach that link to the Linux Bridge we created before.
+
+
Get the link object from the interface name by function call netlink.LinkByName
+
Connect the link to the bridge by calling netlink.LinkSetMaster (see the sketch below)
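A sketch of those two calls, again continuing inside cmdAdd and assuming the vishvananda/netlink package, with newBr being the bridge object obtained from the type assertion earlier:

```go
// Look up the host-side veth by the name SetupVeth reported...
hostVethLink, err := netlink.LinkByName(hostIface.Name)
if err != nil {
	return fmt.Errorf("failed to lookup %q: %v", hostIface.Name, err)
}
// ...and enslave it to our bridge, the equivalent of `brctl addif`.
if err := netlink.LinkSetMaster(hostVethLink, newBr); err != nil {
	return fmt.Errorf("failed to connect %q to bridge %q: %v", hostIface.Name, sb.BridgeName, err)
}
```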
One important thing we need to take care of is the OS thread: since we switch netns to perform the namespace operations, we must make sure the Go runtime does not move our goroutine to another OS thread in the middle of those operations.
+
Use the function runtime.LockOSThread() in the golang predefined function init().
+
1 2 3 4 5 6
func init() {
	// this ensures that main runs only on main thread (thread group leader).
	// since namespace ops (unshare, setns) are done for a single thread, we
	// must ensure that the goroutine does not jump from OS thread to thread
	runtime.LockOSThread()
}
Ready to call the step3 example {test 192.0.2.1/24} The CNI has been called, see the following results The bridge and the veth has been attatch to bridge name bridge id STP enabled interfaces test 8000.aa6e12faa09b no vethff65a064 The interface in the netns eth10 Link encap:Ethernet HWaddr 7e:23:e2:e5:8f:c4 inet6 addr: fe80::7c23:e2ff:fee5:8fc4/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:1 errors:0 dropped:0 overruns:0 frame:0 TX packets:1 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:0 RX bytes:90 (90.0 B) TX bytes:90 (90.0 B)
lo Link encap:Local Loopback LOOPBACK MTU:65536 Metric:1 RX packets:0 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1 RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
+
We have successfully created a linux bridge and connected it to the other network namespace via the veth pair, and the interface inside that namespace is eth10, as defined in the config file.
+
Step4
In this step, we will set the IP address inside the target network namespace. To keep the problem easy, we put the target IP address in the config and read it via sb.IP.
+
1 2 3 4
type SimpleBridge struct {
	BridgeName string `json:"bridgeName"`
	IP         string `json:"ip"`
}
+
The function we use to assign the IP address is netlink.AddrAdd, so the workflow is:
+
+
Generate a IP object from the config.
+
Call the nelink.AddrAdd in the target network namespace.
+
+
The parameter of netlink.AddrAdd is netlink.Addr and see its structure below.
1 2 3 4 5 6 7 8 9 10
type Addr struct { *net.IPNet Label string Flags int Scope int Peer *net.IPNet Broadcast net.IP PreferedLft int ValidLft int }
+
We can use the net package provided by the golang standard library to generate the net.IPNet type, which is in CIDR form (an IP address and a mask).
+
Since the IP address in our config is a string, 192.0.2.15/24, we use net.ParseCIDR to parse it, which returns a pointer to a net.IPNet.
+
So, modify the previous handler to assign the IP address when we create a veth.
+
Since the net.IPNet object we get from net.ParseCIDR describes the subnet, not the real IP address, we reassign the host address to its IP field.
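Putting that together, a sketch of the address assignment as it might appear inside the netns.Do handler (assuming the Go net standard library and the vishvananda/netlink package):

```go
// Parse "192.0.2.15/24" from the config: ipAddr is the host address,
// ipNet carries the mask but its IP field is the network address.
ipAddr, ipNet, err := net.ParseCIDR(sb.IP)
if err != nil {
	return err
}
ipNet.IP = ipAddr // put the real address back while keeping the /24 mask

// args.IfName is eth10 inside the target network namespace.
link, err := netlink.LinkByName(args.IfName)
if err != nil {
	return err
}
// Equivalent to `ip addr add 192.0.2.15/24 dev eth10`.
if err := netlink.AddrAdd(link, &netlink.Addr{IPNet: ipNet}); err != nil {
	return err
}
return netlink.LinkSetUp(link)
```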
Ready to call the step4 example {test 192.0.2.15/24} The CNI has been called, see the following results The bridge and the veth has been attatch to bridge name bridge id STP enabled interfaces test 8000.a6f55b2927c0 no vethd611bb3b The interface in the netns eth10 Link encap:Ethernet HWaddr aa:a0:96:45:65:c5 inet addr:192.0.2.15 Bcast:192.0.2.255 Mask:255.255.255.0 inet6 addr: fe80::a8a0:96ff:fe45:65c5/64 Scope:Link UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:2 errors:0 dropped:0 overruns:0 frame:0 TX packets:1 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:0 RX bytes:168 (168.0 B) TX bytes:90 (90.0 B)
lo Link encap:Local Loopback LOOPBACK MTU:65536 Metric:1 RX packets:0 errors:0 dropped:0 overruns:0 frame:0 TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1 RX bytes:0 (0.0 B) TX bytes:0 (0.0 B)
+
And you can see we have already set the IP address on the interface eth10. You can use the following commands to manually set an IP address on the linux bridge and then use ping to check the network connectivity between the host and the target network namespace.
1 2
sudo ifconfig test 192.0.2.1 sudo ip netns exec ns1 ping 192.0.2.1
+
Summary
In this tutorial, we have implemented a simple Linux Bridge CNI (only Add function) in golang.
+
We created the linux bridge and used a veth pair to connect it with the target network namespace. Besides, we also fetched the information we need from the pre-defined config file, which makes it more flexible to change the behaviour of your own CNI implementation.
+
To keep things simple, we did not use any sophisticated method to acquire a unique address from the config, but you can design your own algorithm to do that. If you want to learn more about IP-related operations, take a look at the host-local plugin.
class NbConvertApp(JupyterApp):
    """Application used to convert from notebook file type (``*.ipynb``)"""
    version = __version__
    name = 'jupyter-nbconvert'
    aliases = nbconvert_aliases
    flags = nbconvert_flags
    ....
+
1 2 3 4
def start(self):
    """Run start after initialization process has completed"""
    super(NbConvertApp, self).start()
    self.convert_notebooks()
defconvert_notebooks(self): """Convert the notebooks in the self.notebook traitlet """ # check that the output base isn't specified if there is more than # one notebook to convert if self.output_base != ''and len(self.notebooks) > 1: self.log.error( """ UsageError: --output flag or `NbConvertApp.output_base` config option cannot be used when converting multiple notebooks. """ ) self.exit(1) # initialize the exporter cls = get_exporter(self.export_format) self.exporter = cls(config=self.config)
# no notebooks to convert! if len(self.notebooks) == 0andnot self.from_stdin: self.print_help() sys.exit(-1)
# convert each notebook ifnot self.from_stdin: for notebook_filename in self.notebooks: self.convert_single_notebook(notebook_filename) else: input_buffer = unicode_stdin_stream() # default name when conversion from stdin self.convert_single_notebook("notebook.ipynb", input_buffer=input_buffer)
defconvert_single_notebook(self, notebook_filename, input_buffer=None): """Convert a single notebook. Performs the following steps: 1. Initialize notebook resources 2. Export the notebook to a particular format 3. Write the exported notebook to file 4. (Maybe) postprocess the written file Parameters ---------- notebook_filename : str input_buffer : If input_buffer is not None, conversion is done and the buffer is used as source into a file basenamed by the notebook_filename argument. """ if input_buffer isNone: self.log.info("Converting notebook %s to %s", notebook_filename, self.export_format) else: self.log.info("Converting notebook into %s", self.export_format) resources = self.init_single_notebook_resources(notebook_filename) output, resources = self.export_single_notebook(notebook_filename, resources, input_buffer=input_buffer) write_results = self.write_single_notebook(output, resources) self.postprocess_single_notebook(write_results)
definit_single_notebook_resources(self, notebook_filename): """Step 1: Initialize resources This initializes the resources dictionary for a single notebook. Returns ------- dict resources dictionary for a single notebook that MUST include the following keys: - config_dir: the location of the Jupyter config directory - unique_key: the notebook name - output_files_dir: a directory where output files (not including the notebook itself) should be saved """ basename = os.path.basename(notebook_filename) notebook_name = basename[:basename.rfind('.')] if self.output_base: # strip duplicate extension from output_base, to avoid Basename.ext.ext if getattr(self.exporter, 'file_extension', False): base, ext = os.path.splitext(self.output_base) if ext == self.exporter.file_extension: self.output_base = base notebook_name = self.output_base
self.log.debug("Notebook name is '%s'", notebook_name)
# first initialize the resources we want to use resources = {} resources['config_dir'] = self.config_dir resources['unique_key'] = notebook_name
return resources
```
# Export the notebook to a particular format
Next, the source file is handed to the **Exporter** to deal with the format conversion. However, if we pass in the original **.ipynb** file directly, we will not be able to modify the **IPython** part.
```python defexport_single_notebook(self, notebook_filename, resources, input_buffer=None): """Step 2: Export the notebook Exports the notebook to a particular format according to the specified exporter. This function returns the output and (possibly modified) resources from the exporter. Parameters ---------- notebook_filename : str name of notebook file. resources : dict input_buffer : readable file-like object returning unicode. if not None, notebook_filename is ignored Returns ------- output dict resources (possibly modified) """ try: if input_buffer isnotNone: output, resources = self.exporter.from_file(input_buffer, resources=resources) else: output, resources = self.exporter.from_filename(notebook_filename, resources=resources) except ConversionException: self.log.error("Error while converting '%s'", notebook_filename, exc_info=True) self.exit(1)
defwrite_single_notebook(self, output, resources): """Step 3: Write the notebook to file This writes output from the exporter to file using the specified writer. It returns the results from the writer. Parameters ---------- output : resources : dict resources for a single notebook including name, config directory and directory to save output Returns ------- file results from the specified writer output of exporter """ if'unique_key'notin resources: raise KeyError("unique_key MUST be specified in the resources, but it is not")
#get the exporter for script format cls = get_exporter("script")
#initial the resources init_single_notebook_resources(...) #export the data to python script export_single_notebook(...,data) #writhe the output to file write_single_notebook(...)
In the previous article we focused on security management around the Control Plane; in this article we turn to the remaining two topics, Workloads and The Future.
+
Workloads
Compared with The Control Plane, which focuses on the security of the cluster architecture itself, Workloads focuses on the workloads running inside kubernetes, such as Pods, Deployments and Jobs. Although kubernetes validates these workloads at deployment time before letting them run, they face the users directly, so if the containers run with excessive privileges and are hit by an unexpected attack, further security problems follow.
apiVersion: v1
kind: Pod
metadata:
  name: hello-apparmor
  annotations:
    # Tell Kubernetes to apply the AppArmor profile "k8s-apparmor-example-deny-write".
    # Note that this is ignored if the Kubernetes node is not running version 1.4 or greater.
    container.apparmor.security.beta.kubernetes.io/hello: localhost/k8s-apparmor-example-deny-write
spec:
  containers:
  - name: hello
    image: busybox
    command: ["sh", "-c", "echo 'Hello AppArmor!' && sleep 1h"]
kubernetes-dashboard kubesec.io score 7
-----------------
Advise
1. containers[] .securityContext .runAsNonRoot == true
   Force the running image to run as a non-root user to ensure least privilege
2. containers[] .securityContext .capabilities .drop
   Reducing kernel capabilities available to a container limits its attack surface
3. containers[] .securityContext .readOnlyRootFilesystem == true
   An immutable root filesystem can prevent malicious binaries being added to PATH and increase attack cost
4. containers[] .securityContext .runAsUser > 10000
   Run as a high-UID user to avoid conflicts with the host's user table
5. containers[] .securityContext .capabilities .drop | index("ALL")
   Drop all capabilities and add only those required to reduce syscall attack surface
+
Its advice is roughly as follows:
+
+
Do not run as root; the container should be adjusted so that it can run as a non-root user.
+
Unused Linux capabilities should be dropped.
+
The root file system (/) should be mounted read-only.
+
The UID of the running user should be above 10000, to avoid clashing with UIDs on the host (mainly because user namespaces are not yet fully implemented for k8s containers).
# Required to prevent escalations to root.
allowPrivilegeEscalation: false
runAsUser:
  # Require the container to run without root privileges.
  rule: 'MustRunAsNonRoot'
+
Finally, the author notes that until user-namespace isolation is complete, running containers as a non-root user remains a necessary and unavoidable choice; for the security of the cluster and its containers it is worth spending the time on this migration.
But what does "providing network capability" actually mean? I don't think this part is well defined. After all, who says the network has to be IPv4? Who says it has to reach at least Layer 3 (IP)? Can't it just be a point-to-point link? Mostly this is because most users have no need for such cases, and IPv4 + TCP/UDP is by far the most common transport, which is why most CNIs focus on exactly that.
dnsType, err := getPodDNSType(pod) if err != nil { glog.Errorf("Failed to get DNS type for pod %q: %v. Falling back to DNSClusterFirst policy.", format.Pod(pod), err) dnsType = podDNSCluster } switch dnsType { case podDNSNone: // DNSNone should use empty DNS settings as the base. dnsConfig = &runtimeapi.DNSConfig{} case podDNSCluster: ... case podDNSHost: // When the kubelet --resolv-conf flag is set to the empty string, use // DNS settings that override the docker default (which is to use // /etc/resolv.conf) and effectively disable DNS lookups. According to // the bind documentation, the behavior of the DNS client library when // "nameservers" are not specified is to "use the nameserver on the // local machine". A nameserver setting of localhost is equivalent to // this documented behavior. if c.ResolverConfig == "" { switch { case c.nodeIP == nil || c.nodeIP.To4() != nil: dnsConfig.Servers = []string{"127.0.0.1"} case c.nodeIP.To16() != nil: dnsConfig.Servers = []string{"::1"} } dnsConfig.Searches = []string{"."} } } ... return c.formDNSConfigFitsLimits(dnsConfig, pod), nil }
func(ds *dockerService)RunPodSandbox(ctx context.Context, r *runtimeapi.RunPodSandboxRequest)(*runtimeapi.RunPodSandboxResponse, error) { config := r.GetConfig() .... // Rewrite resolv.conf file generated by docker. // NOTE: cluster dns settings aren't passed anymore to docker api in all cases, // not only for pods with host network: the resolver conf will be overwritten // after sandbox creation to override docker's behaviour. This resolv.conf // file is shared by all containers of the same pod, and needs to be modified // only once per pod. if dnsConfig := config.GetDnsConfig(); dnsConfig != nil { containerInfo, err := ds.client.InspectContainer(createResp.ID) if err != nil { returnnil, fmt.Errorf("failed to inspect sandbox container for pod %q: %v", config.Metadata.Name, err) }
if err := rewriteResolvFile(containerInfo.ResolvConfPath, dnsConfig.Servers, dnsConfig.Searches, dnsConfig.Options); err != nil { returnnil, fmt.Errorf("rewrite resolv.conf failed for pod %q: %v", config.Metadata.Name, err) } } .... return resp, nil }
func(daemon *Daemon)ContainerStart(name string, hostConfig *containertypes.HostConfig, checkpoint string, checkpointDir string)error { ... // check if hostConfig is in line with the current system settings. // It may happen cgroups are umounted or the like. if _, err = daemon.verifyContainerSettings(container.OS, container.HostConfig, nil, false); err != nil { return errdefs.InvalidParameter(err) } // Adapt for old containers in case we have updates in this function and // old containers never have chance to call the new function in create stage. if hostConfig != nil { if err := daemon.adaptContainerSettings(container.HostConfig, false); err != nil { return errdefs.InvalidParameter(err) } } return daemon.containerStart(container, checkpoint, checkpointDir, true) }
// containerStart prepares the container to run by setting up everything the // container needs, such as storage and networking, as well as links // between containers. The container is left waiting for a signal to // begin running. func(daemon *Daemon)containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool)(err error) {
// containerStart prepares the container to run by setting up everything the // container needs, such as storage and networking, as well as links // between containers. The container is left waiting for a signal to // begin running.
+
+
Inside this function the container is created and the resources it uses (storage, networking, and so on) are prepared. Since we want to look at the DNS-related information, we keep following the code down into initializeNetworking.
// always connect default network first since only default // network mode support link and we need do some setting // on sandbox initialize for link, but the sandbox only be initialized // on first network connecting. defaultNetName := runconfig.DefaultDaemonNetworkMode().NetworkName() if nConf, ok := container.NetworkSettings.Networks[defaultNetName]; ok { cleanOperationalData(nConf) if err := daemon.connectToNetwork(container, defaultNetName, nConf.EndpointSettings, updateSettings); err != nil { return err }
func(sb *sandbox)setupDNS()error { var newRC *resolvconf.File ....
originResolvConfPath := sb.config.originResolvConfPath if originResolvConfPath == "" { // if not specified fallback to default /etc/resolv.conf originResolvConfPath = resolvconf.DefaultResolvConf } currRC, err := resolvconf.GetSpecific(originResolvConfPath) if err != nil { if !os.IsNotExist(err) { return err } // it's ok to continue if /etc/resolv.conf doesn't exist, default resolvers (Google's Public DNS) // will be used currRC = &resolvconf.File{} logrus.Infof("/etc/resolv.conf does not exist") }
iflen(sb.config.dnsList) > 0 || len(sb.config.dnsSearchList) > 0 || len(sb.config.dnsOptionsList) > 0 { var ( err error dnsList = resolvconf.GetNameservers(currRC.Content, types.IP) dnsSearchList = resolvconf.GetSearchDomains(currRC.Content) dnsOptionsList = resolvconf.GetOptions(currRC.Content) ) iflen(sb.config.dnsList) > 0 { dnsList = sb.config.dnsList } iflen(sb.config.dnsSearchList) > 0 { dnsSearchList = sb.config.dnsSearchList } iflen(sb.config.dnsOptionsList) > 0 { dnsOptionsList = sb.config.dnsOptionsList } newRC, err = resolvconf.Build(sb.config.resolvConfPath, dnsList, dnsSearchList, dnsOptionsList) if err != nil { return err } // After building the resolv.conf from the user config save the // external resolvers in the sandbox. Note that --dns 127.0.0.x // config refers to the loopback in the container namespace sb.setExternalResolvers(newRC.Content, types.IPv4, false) } else { // If the host resolv.conf file has 127.0.0.x container should // use the host resolver for queries. This is supported by the // docker embedded DNS server. Hence save the external resolvers // before filtering it out. sb.setExternalResolvers(currRC.Content, types.IPv4, true)
// Replace any localhost/127.* (at this point we have no info about ipv6, pass it as true) if newRC, err = resolvconf.FilterResolvDNS(currRC.Content, true); err != nil { return err } // No contention on container resolv.conf file at sandbox creation if err := ioutil.WriteFile(sb.config.resolvConfPath, newRC.Content, filePerm); err != nil { return types.InternalErrorf("failed to write unhaltered resolv.conf file content when setting up dns for sandbox %s: %v", sb.ID(), err) } }
// Write hash if err := ioutil.WriteFile(sb.config.resolvConfHashFile, []byte(newRC.Hash), filePerm); err != nil { return types.InternalErrorf("failed to write resolv.conf hash file when setting up dns for sandbox %s: %v", sb.ID(), err) }
returnnil }
+
This function processes several DNS-related parameters, including:
+
+
dnsServer
+
dnsSearch
+
dnsOptions
+
resolveConf
+
+
The logic here works as follows:
+
+
First, read the current DNS settings, using the path given by the resolveConf parameter.
+
If the user set any DNS parameters themselves, those settings are used exclusively and the settings loaded in (1) are ignored completely. 2.1 At the end, this path calls resolvconf.Build to write the parameters straight into the container's /etc/resolv.conf.
// FilterResolvDNS cleans up the config in resolvConf. It has two main jobs: // 1. It looks for localhost (127.*|::1) entries in the provided // resolv.conf, removing local nameserver entries, and, if the resulting // cleaned config has no defined nameservers left, adds default DNS entries // 2. Given the caller provides the enable/disable state of IPv6, the filter // code will remove all IPv6 nameservers if it is not enabled for containers // func FilterResolvDNS(resolvConf []byte, ipv6Enabled bool) (*File, error) { cleanedResolvConf := localhostNSRegexp.ReplaceAll(resolvConf, []byte{}) // if IPv6 is not enabled, also clean out any IPv6 address nameserver if !ipv6Enabled { cleanedResolvConf = nsIPv6Regexp.ReplaceAll(cleanedResolvConf, []byte{}) } // if the resulting resolvConf has no more nameservers defined, add appropriate // default DNS servers for IPv4 and (optionally) IPv6 if len(GetNameservers(cleanedResolvConf, types.IP)) == 0 { logrus.Infof("No non-localhost DNS nameservers are left in resolv.conf. Using default external servers: %v", defaultIPv4Dns) dns := defaultIPv4Dns if ipv6Enabled { logrus.Infof("IPv6 enabled; Adding default IPv6 external servers: %v", defaultIPv6Dns) dns = append(dns, defaultIPv6Dns...) } cleanedResolvConf = append(cleanedResolvConf, []byte("\n"+strings.Join(dns, "\n"))...) } hash, err := ioutils.HashData(bytes.NewReader(cleanedResolvConf)) if err != nil { return nil, err } return &File{Content: cleanedResolvConf, Hash: hash}, nil }
+
+
First, ReplaceAll is called to strip every localhost-related nameserver (127.0.0.0/8) from the file.
+
After stripping, if no nameserver is left, the defaults are added back via dns := defaultIPv4Dns (8.8.8.8 / 8.8.4.4).
DNS has always played an important role in traditional networks: it lets clients reach a target server by FQDN instead of hard-coding its IP address. In a kubernetes cluster, kubernetes sets up a DNS server by default for all created Pods to use. For ordinary users it is enough to reach the outside network and Kubernetes Services, but in some special scenarios, for example the long-established NFV architecture, our services (Pods) may need a more complex network setup, such as having a Data Network and a Control Network at the same time. In that case the Pod needs to handle the DNS servers used by the services inside it in a more deliberate way. This article introduces the DNS-related settings of a Pod and shows, by deploying actual YAML, how to use them.
When you see the word DNS in Kubernetes, many components and features come to mind: kube-dns, which serves the cluster's internal services, the unique FQDNs generated by kubernetes Services, and finally the component this article wants to talk about, the DNS settings inside a Pod.
+
In most scenarios nobody worries about this part; the kubernetes defaults handle everything for you. However, as kubernetes becomes more popular, the usage scenarios multiply. In NFV (Network Function Virtualization) related applications, for example, the applications we run (Pods) may have their own DNS requirements.
+
Next, we use the following architecture diagram to explain the possible scenarios.
+
Introduction
The Reason Why We Need This
In the common case, a kubernetes cluster is used like the purple/pink (Pod3) block in the figure: whenever a Pod needs DNS, the request and reply are handled by the in-cluster K8S-DNS.
DNSConfig lets the operator extend the DNS settings inside a Pod. Note that I deliberately say extend rather than set, because, as the next section on DNSPolicy explains, every Pod already comes with a default set of DNS settings; DNSConfig stacks additional DNS parameters on top of them. Three parameters are currently supported:
+
+
nameservers:
+
searches:
+
options:
+
+
These three parameters correspond to the three parameters you already know from /etc/resolv.conf; I won't explain DNS itself here, so if you are not familiar with them, go and look them up.
DNSConfig is very simple and intuitive: if you have DNS parameters of your own, you can set them through this field.
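For reference, if you build Pod objects programmatically (for example with client-go), these three parameters map onto the PodDNSConfig type in k8s.io/api/core/v1. A small sketch with made-up example values:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	ndots := "2"
	spec := corev1.PodSpec{
		// DNSNone clears the defaults; see the DNSPolicy section below.
		DNSPolicy: corev1.DNSNone,
		DNSConfig: &corev1.PodDNSConfig{
			// Example values only.
			Nameservers: []string{"1.2.3.4"},
			Searches:    []string{"ns1.svc.cluster.local", "my.dns.search.suffix"},
			Options: []corev1.PodDNSConfigOption{
				{Name: "ndots", Value: &ndots},
			},
		},
	}
	fmt.Printf("%+v\n", spec.DNSConfig)
}
```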
+
DNSPolicy
As mentioned above, DNSConfig extends the default DNS settings inside a Pod, while DNSPolicy decides what those default DNS settings are. There are currently four types to choose from:
+
+
None
+
Default
+
ClusterFirst
+
ClusterFirstHostNet
+
+
Next, each of the four options is introduced in turn.
+
None
None means exactly what it says: the Pod's default DNS settings are cleared, and kubernetes will not preload any DNS settings derived from its own logic into the Pod. To avoid ending up with a Pod that has no DNS settings at all, when you use None you must also provide a DNSConfig describing your custom DNS parameters.
First, let's look at the DNS settings on the host itself. Since my kubernetes cluster has only one node, I can be sure the Pod will run on this machine.
+
1 2 3 4
vagrant@vortex-dev:~/kubeDemo/dns/dnsSetting$ cat /etc/resolv.conf # Dynamic resolv.conf(5) file for glibc resolver(3) generated by resolvconf(8) # DO NOT EDIT THIS FILE BY HAND -- YOUR CHANGES WILL BE OVERWRITTEN nameserver 10.0.2.3
+
We can see that the machine's own DNS setting is very simple: just 10.0.2.3. Next, let's look at the DNS settings inside the Pod.
First, because ClusterFirst uses the clusterIP of kube-dns as the DNS address, we check the IP of kube-dns with the following command.
+
1 2 3
vagrant@vortex-dev:~/kubeDemo/dns/dnsSetting$ kubectl -n kube-system get svc kube-dns NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE kube-dns ClusterIP 10.96.0.10 <none> 53/UDP,53/TCP 19h
funcgetPodDNSType(pod *v1.Pod)(podDNSType, error) { dnsPolicy := pod.Spec.DNSPolicy switch dnsPolicy { case v1.DNSNone: if utilfeature.DefaultFeatureGate.Enabled(features.CustomPodDNS) { return podDNSNone, nil } // This should not happen as kube-apiserver should have rejected // setting dnsPolicy to DNSNone when feature gate is disabled. return podDNSCluster, fmt.Errorf(fmt.Sprintf("invalid DNSPolicy=%v: custom pod DNS is disabled", dnsPolicy)) case v1.DNSClusterFirstWithHostNet: return podDNSCluster, nil case v1.DNSClusterFirst: if !kubecontainer.IsHostNetworkPod(pod) { return podDNSCluster, nil } // Fallback to DNSDefault for pod on hostnetowrk. fallthrough case v1.DNSDefault: return podDNSHost, nil } // This should not happen as kube-apiserver should have rejected // invalid dnsPolicy. return podDNSCluster, fmt.Errorf(fmt.Sprintf("invalid DNSPolicy=%v", dnsPolicy)) }
+
Here we can see that in the DNSClusterFirst case, if HostNetwork is set, the function falls through and returns podDNSHost, i.e. the node's own DNS settings.
+
To solve this problem, a new type was introduced: ClusterFirstWithHostNet.
+
ClusterFirstWithHostNet
ClusterFirstWithHostNet has a very simple purpose: it lets a Pod use HostNetwork while still using kube-dns as its default DNS.
+
This can also be seen in the code above:
1 2
case v1.DNSClusterFirstWithHostNet: return podDNSCluster, nil
There are many approaches to setting up kubernetes, from the infrastructure it runs on (VM/container/native) to the tools used (kubeadm, kubespray, minikube). I chose kubeadm to install a native kubernetes on my laptop (Fedora), and this post walks through all the steps needed to install the requirements, including docker and kubernetes itself.
+
Preface
For various reasons I switched my laptop from a MacBook Pro to another laptop running Fedora 28.
+
In order to play with kubernetes however I like, I need to install kubernetes on Fedora 28.
+
There are two approaches for running kubernetes on my laptop.
+
+
Create an Ubuntu VM on Fedora 28 and install kubernetes inside that Ubuntu, just like how I used kubernetes in my previous MAC OSX environment.
+
Install kubernetes natively on Fedora 28; this seems fine, since there are many posts about Fedora and kubernetes on the internet.
+
+
So I chose the second approach, because it is new to me and I wanted to give it a try. That is why this post exists, and I will note every command I used to install kubernetes.
+
Installation Choices
There’re so many ways to install the kubernetes into a target system.
+
+
Use the kubernetes hard-way to install all components, including etcd,kubelet, kube-apiserver.
+
Use the minikube to install a single-node kubernetes(by default).
+
In fact, minikube launches a VM to run kubernetes, but you can switch to a container-only method.
+
+
+
Use an ansible-like method to install all the necessary processes/daemons for us.
+
Use the very simple official tool kubeadm to setup a kubernetes.
+
+
Considering what I actually want to study in kubernetes, kubeadm is enough, so I will use it for the rest of this tutorial.
+
Environment
First, we need to prepare the container environment for kubernetes, and I chose docker for my cluster.
+
If you search for docker in the default repository, the version is about 13.1, and that is not what we want.
+
You can use the following command to check the latest docker version in the remote repository. Besides, the command sudo dnf list --showduplicates docker\* shows all available versions of docker.
For the latest docker version we should install docker-ce (docker community edition), and you can find the detailed tutorial in the official document.
+
Remember, you need to remove any installed docker packages before you install docker-ce, and you can use the following command to remove them.
If you want to install a specific version of the kubernetes tools, use sudo dnf list --showduplicates kube\* to see all versions. Otherwise, use the following command to install the latest version of kubernetes.
+
We need to use the --disableexcludes to disable the exclude list of kubernetes, since all tools are in that kubernetes repo. After installing that, we also need to enable the systemd service for kubelet.
Now I will use kubeadm to set up the kubernetes environment. The important thing is to take care of the parameters passed to kubeadm: if you choose flannel as your CNI (Container Network Interface), you should pass --pod-network-cidr=10.244.0.0/16 so flannel can use it to choose the IP address range for each node.
+
This init procedure takes a while to install the whole of kubernetes; most of the time is spent downloading the docker images for all the kubernetes components.
sudo kubeadm init --pod-network-cidr=10.244.0.0/16 [init] using Kubernetes version: v1.11.3 [preflight] running pre-flight checks I0923 23:45:54.958338 16222 kernel_validator.go:81] Validating kernel version I0923 23:45:54.958582 16222 kernel_validator.go:96] Validating kernel config [WARNING SystemVerification]: docker version is greater than the most recently validated version. Docker version: 18.06.1-ce. Max validated version: 1 7.03 [preflight/images] Pulling images required for setting up a Kubernetes cluster [preflight/images] This might take a minute or two, depending on the speed of your internet connection [preflight/images] You can also perform this action in beforehand using 'kubeadm config images pull' [kubelet] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env" [kubelet] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml" [preflight] Activating the kubelet service [certificates] Generated ca certificate and key. [certificates] Generated apiserver certificate and key. [certificates] apiserver serving cert is signed for DNS names [localhost.localdomain kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.s vc.cluster.local] and IPs [10.96.0.1 10.0.4.63] [certificates] Generated apiserver-kubelet-client certificate and key. [certificates] Generated sa key and public key. [certificates] Generated front-proxy-ca certificate and key. [certificates] Generated front-proxy-client certificate and key. [certificates] Generated etcd/ca certificate and key. [certificates] Generated etcd/server certificate and key. [certificates] etcd/server serving cert is signed for DNS names [localhost.localdomain localhost] and IPs [127.0.0.1 ::1] [certificates] Generated etcd/peer certificate and key. [certificates] etcd/peer serving cert is signed for DNS names [localhost.localdomain localhost] and IPs [10.0.4.63 127.0.0.1 ::1] [certificates] Generated etcd/healthcheck-client certificate and key. [certificates] Generated apiserver-etcd-client certificate and key. 
[certificates] valid certificates and keys now exist in"/etc/kubernetes/pki" [kubeconfig] Wrote KubeConfig file to disk: "/etc/kubernetes/admin.conf" [kubeconfig] Wrote KubeConfig file to disk: "/etc/kubernetes/kubelet.conf" [kubeconfig] Wrote KubeConfig file to disk: "/etc/kubernetes/controller-manager.conf" [kubeconfig] Wrote KubeConfig file to disk: "/etc/kubernetes/scheduler.conf" [controlplane] wrote Static Pod manifest for component kube-apiserver to "/etc/kubernetes/manifests/kube-apiserver.yaml" [controlplane] wrote Static Pod manifest for component kube-controller-manager to "/etc/kubernetes/manifests/kube-controller-manager.yaml" [controlplane] wrote Static Pod manifest for component kube-scheduler to "/etc/kubernetes/manifests/kube-scheduler.yaml" [etcd] Wrote Static Pod manifest for a local etcd instance to "/etc/kubernetes/manifests/etcd.yaml" [init] waiting for the kubelet to boot up the control plane as Static Pods from directory "/etc/kubernetes/manifests" [init] this might take a minute or longer if the control plane images have to be pulled [apiclient] All control plane components are healthy after 38.003505 seconds [uploadconfig] storing the configuration used in ConfigMap "kubeadm-config"in the "kube-system" Namespace [kubelet] Creating a ConfigMap "kubelet-config-1.11"in namespace kube-system with the configuration for the kubelets in the cluster [markmaster] Marking the node localhost.localdomain as master by adding the label "node-role.kubernetes.io/master=''" [markmaster] Marking the node localhost.localdomain as master by adding the taints [node-role.kubernetes.io/master:NoSchedule] [patchnode] Uploading the CRI Socket information "/var/run/dockershim.sock" to the Node API object "localhost.localdomain" as an annotation [bootstraptoken] using token: 9s11hf.6ooyk6587vqeirn7
[bootstraptoken] configured RBAC rules to allow Node Bootstrap tokens to post CSRs in order for nodes to get long term certificate credentials [bootstraptoken] configured RBAC rules to allow the csrapprover controller automatically approve CSRs from a Node Bootstrap Token [bootstraptoken] configured RBAC rules to allow certificate rotation for all node client certificates in the cluster [bootstraptoken] creating the "cluster-info" ConfigMap in the "kube-public" namespace [addons] Applied essential addon: CoreDNS [addons] Applied essential addon: kube-proxy
Your Kubernetes master has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
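The commands themselves are omitted above; for reference, kubeadm normally prints the following steps to copy the admin kubeconfig into your home directory:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config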
You should now deploy a pod network to the cluster. Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at: https://kubernetes.io/docs/concepts/cluster-administration/addons/
You can now join any number of machines by running the following on each node as root:
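The actual join command, with its token and CA-cert hash, is omitted above; its generic form for this kubeadm version looks like the sketch below, with the placeholders replaced by the values printed by your own kubeadm init run:
kubeadm join <master-ip>:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>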
With that kubeconfig in place, we can use kubectl to control the kubernetes cluster, and kubectl get nodes shows the status of each node.
+
Since we have not installed a CNI in the cluster yet, the node status is NotReady; it will become Ready once any CNI is installed.
hwchiu➜~» kubectl get nodes [23:50:27] NAME STATUS ROLES AGE VERSION localhost.localdomain NotReady master 3m v1.11.3
hwchiu➜~» kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/v0.9.1/Documentation/kube-flannel.yml [23:50:39] clusterrole.rbac.authorization.k8s.io/flannel created clusterrolebinding.rbac.authorization.k8s.io/flannel created serviceaccount/flannel created configmap/kube-flannel-cfg created daemonset.extensions/kube-flannel-ds created
hwchiu➜~» kubectl get nodes [23:51:30] NAME STATUS ROLES AGE VERSION localhost.localdomain Ready master 5m v1.11.3
+
Deploy Pod
Unfortunately, we still can't run a Pod successfully. The reason is that the master node is not allowed to schedule any Pods when the cluster is set up with kubeadm. In my case I have only one node and it has to be the master node, which is why any user-defined Pod would stay Pending forever.
+
The mechanism behind that limitation is the taint, and we can use the following command to remove the taint from all master nodes.
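The command itself is not shown above; the usual way to do it is the following, where the taint key matches the node-role.kubernetes.io/master:NoSchedule taint added by kubeadm in the init output earlier:
kubectl taint nodes --all node-role.kubernetes.io/master-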
Now, we can deploy a Pod and enjoy your kubernetes cluster.
kubectl run test --image=hwchiu/netutils kubectl get pods -o wide -w
+
Helm Chart (Optional)
There's a package system in the kubernetes ecosystem, similar to deb packages, and it's called a Helm Chart. You can use it to install predefined packages that contain all the resources you need, such as the deployment, service, configmap, RBAC and others, into your kubernetes cluster.
+
Again, it's an optional tool; install it only if you need it.
+
You can follow the official document to install the helm chart, but you will run into a few problems.
+
I will note those problems and share how I solved them.
+
Install
Since we are using Fedora and helm is not packaged in any dnf repository, we need to download the binary from the official website; you can use the following script to download the binary automatically.
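The script is not included above; a minimal way to fetch and run it, assuming the get script is still published under the Helm GitHub repository (the URL may have moved over time, so check the official install document), is:
# Download and run the Helm 2 installer script (URL is an assumption; verify against the Helm docs)
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get
chmod +x get_helm.sh
./get_helm.sh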
You will be told to run helm init to initialize the helm environment in your kubernetes cluster after executing get_helm.sh, but don't do that yet.
+
Since we used kubeadm to install kubernetes, RBAC mode is enabled by default, so we need to pass an extra parameter for RBAC.
+
You need to prepare the RBAC config for your helm (tiller) service account before using it to install any packages, and you can find the whole tutorial here
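The manifest itself is not reproduced here; a commonly used minimal version, following the Helm RBAC guide, creates a tiller ServiceAccount in kube-system and binds it to the cluster-admin role:
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: ServiceAccount
metadata:
  name: tiller
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: tiller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: tiller
    namespace: kube-system
EOF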
+
After creating the RBAC config, use the following command to init your helm
helm init --service-account tiller
+
Test
Now, use the following command to install the nginx ingress package with helm.
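The command is missing above; with Helm 2 and the stable chart repository of that era (the chart and release names here are assumptions), it looked roughly like this:
helm install --name my-nginx stable/nginx-ingress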
In a kubernetes cluster we deploy a large number of containerized applications. Some of them face end users, while others only talk to other containers. For example, an administrator may deploy database containers in the cluster, and other application services have to reach that database over the network. Reaching it over the network means addressing the corresponding container by IP address, yet in a kubernetes cluster Pods get a different IP address by default every time they restart. Client applications therefore cannot hard-code those IP addresses; they need a more dynamic way to obtain the current IP address of the target container (the database). To solve this problem we can use the kubernetes Service architecture. This article introduces what a kubernetes Service is and shows, through concrete examples, how to use it.
+
Preface
This article is part of a series on kubernetes Services. The series aims to discuss the following two ideas:
+
+
What is a Kubernetes Service, why do we need it, and what problems does it solve?
+
How is a Kubernetes Service implemented? Let's use iptables to understand it thoroughly.
vortex-dev:04:48:50 [~/go/src/github.com/hwchiu/kubeDemo](master)vagrant $kubectlexec ubuntu curl -- -s k8s-nginx-cluster.default <!DOCTYPE html> <html> <head> <title>Welcome to nginx!</title> <style> body { width: 35em; margin: 0 auto; font-family: Tahoma, Verdana, Arial, sans-serif; } </style> </head> <body> <h1>Welcome to nginx!</h1> <p>If you see this page, the nginx web server is successfully installed and working. Further configuration is required.</p>
<p>For online documentation and support please refer to <a href="http://nginx.org/">nginx.org</a>.<br/> Commercial support is available at <a href="http://nginx.com/">nginx.com</a>.</p>
<p><em>Thank you for using nginx.</em></p> </body> </html>
+
The page is reached without any problem. What if we now try to access it from the node itself (i.e. not from an application inside the cluster)?
vortex-dev:04:54:37 [~/go/src/github.com/hwchiu/kubeDemo](master)vagrant $curl k8s-nginx-cluster.default curl: (6) Could not resolve host: k8s-nginx-cluster.default
vortex-dev:05:03:44 [~/go/src/github.com/hwchiu/kubeDemo](master)vagrant $curl 172.17.8.100:32293 <!DOCTYPE html> <html> <head> <title>Welcome to nginx!</title> <style> body { width: 35em; margin: 0 auto; font-family: Tahoma, Verdana, Arial, sans-serif; } </style> </head> <body> <h1>Welcome to nginx!</h1> <p>If you see this page, the nginx web server is successfully installed and working. Further configuration is required.</p>
<p>For online documentation and support please refer to <a href="http://nginx.org/">nginx.org</a>.<br/> Commercial support is available at <a href="http://nginx.com/">nginx.com</a>.</p>
<p><em>Thank you for using nginx.</em></p> </body> </html>
+
Summary
In this chapter we introduced Kubernetes Services: why we need a Service and how a Service solves our problem. We also covered the two commonly used types, ClusterIP and NodePort, and the differences between them. Finally, a few simple examples showed how ClusterIP/NodePort let us reach our backend services more conveniently through a Service.
vortex-dev:05:43:54 [~/go/src/github.com/hwchiu/kubeDemo](master)vagrant $kubectl get endpoints k8s-nginx-cluster NAME ENDPOINTS AGE k8s-nginx-cluster 10.244.0.88:80,10.244.0.89:80,10.244.0.90:80 1d
vortex-dev:03:34:14 [~]vagrant $kubectl get svc NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE k8s-nginx-node NodePort 10.98.128.179 <none> 80:30136/TCP 1d
vortex-dev:03:43:42 [~]vagrant $sudo iptables-save | grep "\-j KUBE-NODEPORTS" -A KUBE-SERVICES -m comment --comment "kubernetes service nodeports; NOTE: this must be the last rule in this chain" -m addrtype --dst-type LOCAL -j KUBE-NODEPORTS
Let's use the following command to confirm that the kubernetes Service we just deployed really has sessionAffinity configured.
vortex-dev:01:40:58 [~/go/src/github.com/hwchiu/kubeDemo/services](master)vagrant $kubectl get service k8s-nginx-affinity -o jsonpath='{.spec.sessionAffinity}' ClientIP
type PersistentVolumeSource struct { // GCEPersistentDisk represents a GCE Disk resource that is attached to a // kubelet's host machine and then exposed to the pod. Provisioned by an admin. // More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk // +optional GCEPersistentDisk *GCEPersistentDiskVolumeSource `json:"gcePersistentDisk,omitempty" protobuf:"bytes,1,opt,name=gcePersistentDisk"` // AWSElasticBlockStore represents an AWS Disk resource that is attached to a // kubelet's host machine and then exposed to the pod. // More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore // +optional AWSElasticBlockStore *AWSElasticBlockStoreVolumeSource `json:"awsElasticBlockStore,omitempty" protobuf:"bytes,2,opt,name=awsElasticBlockStore"` // HostPath represents a directory on the host. // Provisioned by a developer or tester. // This is useful for single-node development and testing only! // On-host storage is not supported in any way and WILL NOT WORK in a multi-node cluster. // More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath // +optional HostPath *HostPathVolumeSource `json:"hostPath,omitempty" protobuf:"bytes,3,opt,name=hostPath"` // Glusterfs represents a Glusterfs volume that is attached to a host and // exposed to the pod. Provisioned by an admin. // More info: https://releases.k8s.io/HEAD/examples/volumes/glusterfs/README.md // +optional Glusterfs *GlusterfsPersistentVolumeSource `json:"glusterfs,omitempty" protobuf:"bytes,4,opt,name=glusterfs"` // NFS represents an NFS mount on the host. Provisioned by an admin. // More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs // +optional NFS *NFSVolumeSource `json:"nfs,omitempty" protobuf:"bytes,5,opt,name=nfs"` // RBD represents a Rados Block Device mount on the host that shares a pod's lifetime. // More info: https://releases.k8s.io/HEAD/examples/volumes/rbd/README.md // +optional RBD *RBDPersistentVolumeSource `json:"rbd,omitempty" protobuf:"bytes,6,opt,name=rbd"` // ISCSI represents an ISCSI Disk resource that is attached to a // kubelet's host machine and then exposed to the pod. Provisioned by an admin. // +optional ISCSI *ISCSIPersistentVolumeSource `json:"iscsi,omitempty" protobuf:"bytes,7,opt,name=iscsi"` // Cinder represents a cinder volume attached and mounted on kubelets host machine // More info: https://releases.k8s.io/HEAD/examples/mysql-cinder-pd/README.md // +optional Cinder *CinderPersistentVolumeSource `json:"cinder,omitempty" protobuf:"bytes,8,opt,name=cinder"` // CephFS represents a Ceph FS mount on the host that shares a pod's lifetime // +optional CephFS *CephFSPersistentVolumeSource `json:"cephfs,omitempty" protobuf:"bytes,9,opt,name=cephfs"` // FC represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. // +optional FC *FCVolumeSource `json:"fc,omitempty" protobuf:"bytes,10,opt,name=fc"` // Flocker represents a Flocker volume attached to a kubelet's host machine and exposed to the pod for its usage. This depends on the Flocker control service being running // +optional Flocker *FlockerVolumeSource `json:"flocker,omitempty" protobuf:"bytes,11,opt,name=flocker"` // FlexVolume represents a generic volume resource that is // provisioned/attached using an exec based plugin. // +optional FlexVolume *FlexPersistentVolumeSource `json:"flexVolume,omitempty" protobuf:"bytes,12,opt,name=flexVolume"` // AzureFile represents an Azure File Service mount on the host and bind mount to the pod. 
// +optional AzureFile *AzureFilePersistentVolumeSource `json:"azureFile,omitempty" protobuf:"bytes,13,opt,name=azureFile"` // VsphereVolume represents a vSphere volume attached and mounted on kubelets host machine // +optional VsphereVolume *VsphereVirtualDiskVolumeSource `json:"vsphereVolume,omitempty" protobuf:"bytes,14,opt,name=vsphereVolume"` // Quobyte represents a Quobyte mount on the host that shares a pod's lifetime // +optional Quobyte *QuobyteVolumeSource `json:"quobyte,omitempty" protobuf:"bytes,15,opt,name=quobyte"` // AzureDisk represents an Azure Data Disk mount on the host and bind mount to the pod. // +optional AzureDisk *AzureDiskVolumeSource `json:"azureDisk,omitempty" protobuf:"bytes,16,opt,name=azureDisk"` // PhotonPersistentDisk represents a PhotonController persistent disk attached and mounted on kubelets host machine PhotonPersistentDisk *PhotonPersistentDiskVolumeSource `json:"photonPersistentDisk,omitempty" protobuf:"bytes,17,opt,name=photonPersistentDisk"` // PortworxVolume represents a portworx volume attached and mounted on kubelets host machine // +optional PortworxVolume *PortworxVolumeSource `json:"portworxVolume,omitempty" protobuf:"bytes,18,opt,name=portworxVolume"` // ScaleIO represents a ScaleIO persistent volume attached and mounted on Kubernetes nodes. // +optional ScaleIO *ScaleIOPersistentVolumeSource `json:"scaleIO,omitempty" protobuf:"bytes,19,opt,name=scaleIO"` // Local represents directly-attached storage with node affinity // +optional Local *LocalVolumeSource `json:"local,omitempty" protobuf:"bytes,20,opt,name=local"` // StorageOS represents a StorageOS volume that is attached to the kubelet's host machine and mounted into the pod // More info: https://releases.k8s.io/HEAD/examples/volumes/storageos/README.md // +optional StorageOS *StorageOSPersistentVolumeSource `json:"storageos,omitempty" protobuf:"bytes,21,opt,name=storageos"` // CSI represents storage that handled by an external CSI driver (Beta feature). // +optional CSI *CSIPersistentVolumeSource `json:"csi,omitempty" protobuf:"bytes,22,opt,name=csi"` }
+
PersistentVolumeClaim(PVC)
A PersistentVolume is a cluster resource connected to the backing storage, while a PersistentVolumeClaim (PVC) is the intermediate abstraction that ties a Pod to a PV; it can be seen as the container's own request for storage resources.
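As a small sketch of the claim side (the name and size below are made up for illustration), a PVC requesting 1Gi of storage can be created like this; a Pod then mounts it by referencing the claim name in its volumes section:
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: demo-pvc
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
EOF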
In the OpenFlow protocol we sometimes want to know the capacity of each link, so that the current rate can tell us whether the link is congested. This article uses the Floodlight controller as an example to show how to obtain each port's statistics through the default API and, from them, determine whether a link is congested or idle. This kind of mechanism and information is very important for anyone doing Traffic Engineering, since it is one way to assign a weight to every link.
/* Description of a physical port */ struct ofp_phy_port { uint16_t port_no; uint8_t hw_addr[OFP_ETH_ALEN]; char name[OFP_MAX_PORT_NAME_LEN]; /* Null-terminated */ uint32_t config; /* Bitmap of OFPPC_* flags. */ uint32_t state; /* Bitmap of OFPPS_* flags. */ /* Bitmaps of OFPPF_* that describe features. All bits zeroed if * unsupported or unavailable. */ uint32_t curr; /* Current features. */ uint32_t advertised; /* Features being advertised by the port. */ uint32_t supported; /* Features supported by the port. */ uint32_t peer; /* Features advertised by peer. */ };
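One way to pull those per-port counters is Floodlight's REST statistics interface. As a rough sketch, assuming a controller with its REST API on the default port 8080 and the /wm/core/switch/<dpid>/port/json endpoint of the Floodlight version in use, the counters can be polled like this; sampling twice and dividing the byte-count delta by the interval and the port speed gives a rough utilization figure.
# List connected switches to find a DPID (endpoint assumed from the Floodlight REST API)
curl -s http://127.0.0.1:8080/wm/core/controller/switches/json
# Poll per-port counters for one switch; the DPID below is a placeholder
curl -s http://127.0.0.1:8080/wm/core/switch/00:00:00:00:00:00:00:01/port/json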
In this article, I will write down an example describing how to set up LXR (Linux Cross Reference) with multiple-project support. In this configuration, you can browse the references of multiple projects in one LXR service. For this article, I use Ceph and DRBD as my LXR projects.
Decompress it into the /opt directory (or choose a path you like) and rename it to lxr
+
+
+
Check the environment
+
+
cd /opt/lxr ./genxref --checkonly
+
The result will look like below.
+
ERROR: could not open configuration file lxr.conf [ OK ] Perl version ... 5.22.1 Parameter 'ectagsbin' not defined - trying to find ctags ctags found at /usr/bin/ctags [ OK ] ctags version ... 5.9 Parameter 'glimpsebin' not defined - trying to find glimpse glimpse found at /usr/local/bin/glimpse Checked: glimpse version ... 4.18.7 Parameter 'glimpseindex' not defined - trying to find glimpseindex glimpseindex found at /usr/local/bin/glimpseindex Checked: glimpseindex version ... 4.18.7 Parameter 'swishbin' not defined - trying to find swish-e swish-e not found, `command -v swish-e` returned a null string genxref stopped without indexing by --checkonly option
+
Since we have not configured LXR yet, we won't have an lxr.conf. We choose glimpse as our search engine, so we can ignore the warning about swish-e.
+
Configure LXR
+
Since the GIT storage type in LXR doesn't support submodule references, we use FILES instead. Before generating the code reference, we have to update the code ourselves.
+
Prepare the source of DRBD and Ceph. I put them in /opt/lxr/source_code
+
We reference the master branch of Ceph; for DRBD, version 9.0.
LXR root directory is /opt/lxr Configuration will be stored in custom.d/
Configure for single/multiple trees? [S/m] > m
*** LXR web server configuration ***
Many different configurations are possible, they are related to the way LXR service is accessed, i.e. to the structure of the URL. Refer to the User's Manual for a description of the variants.
LXR can be located at the server-root (so called dedicated) or lower in the server hierarchy (shared because there are usually other pages or sections). Server type? [dedicated/SHARED] > SHARED
Selecting which tree to display can be done in various ways: 1. from the host name (all names are different), 2. from a prefix to a common host name (similar to previous) 3. from the site section name (all different) 4. from interpretation of a section name part (similar to previous) 5. from the head of script arguments Method 5 is highly recommended because it has no impact on webserver configuration. Method 3 is second choice but involves manually setting up many symbolic links (one per source-tree). Method 1 & 2 do not involve symbolic links but need populating webserver configuration with virtual hosts. Note that method 2 does not work well on //localhost. Method 4 is deprecated because it has proved not easily portable under alternate webservers (other than Apache).
Tree designation?: ARGUMENT section name prefix in hos hostname embedded in section > ARGUMENT
The computer hosting the server is described by an URL. The form is scheme://host_name:port where: - scheme is either http or https (http: can be omitted), - host_name can be given as an IP address such as 123.45.67.89 or a domain name like localhost or lxr.url.example, - port may be omitted if standard for the scheme. --- Host name or IP? [//localhost] > //127.0.0.1 --- Alias name or IP? > URL section name for LXR in your server? [/lxr] > /lxr Will it be shared by all trees? [YES/no] >
*** LXR database configuration ***
The choice of the database engine can make a difference in indexing performance, but resource consumption is also an important factor. * For a small personal project, try SQLite which do not need a server and is free from configuration burden. * For medium to large projects, choice is between MySQL, PostgreSQL and Oracle. Oracle is not a free software, its interface has not been tested for a long time. * PostgreSQL databases are smaller than MySQL's and performance is roughly equivalent. * MySQL is at its best with large-sized projects (such as kernel cross-referencing) where it is fastest at the cost of bigger databases. * Take also in consideration the number of connected users. Database engine? [MYSQL/oracle/postgres/sqlite] > The safest option is to create one database per tree. You can however create a single database for all your trees with a specific set of tables for each tree (though this is not recommended). How do you setup the databases? [PER TREE/global] > All databases can be accessed with the same username and can also be described under the same names. Will you share database characteristics? [YES/no] > Will you use the same username and password for all DBs? [YES/no] > --- DB user name? [lxr] > lxr --- DB password ? [lxrpw] > lxrpw Will you give the same prefix to all tables? [YES/no] > --- Common table prefix? [lxr_] > --- Directory for glimpse databases? > /opt/lxr/glimpse_db
file .htaccess written into LXR root directory file apache2-require.pl written into configuration directory file apache-lxrserver.conf written into configuration directory file lighttpd-lxrserver.conf written into configuration directory file nginx-lxrserver.conf written into configuration directory file thttpd-lxrserver.conf written into configuration directory Mercurial support files written into configuration directory
*** LXR master configuration file setup *** Global section part
*** Configuring auxiliary tool paths *** Host name previously defined as http://104.154.246.9 *** Configuring HTML parameters *** 'Buttons-and-menus' interface is recommended for the kernel *** to avoid screen cluttering. --- Use 'buttons-and-menus' instead of 'link' interface? [YES/no] > *** Configuring file subsection *** Configuring "common factors" *** Marking tree section
*** LXR master configuration file setup *** Tree section part SQL script for database initialisation
*** Configuring LXR server parameters *** The virtual root is the fixed URL part after the hostname. *** You previously defined the virtual root as /lxr --- Caption in page header? (e.g. Project XYZZY displayed by LXR) > drbd Do you want a speed switch button for this tree ? [YES/no] > --- Short title for button? (e.g. XYZZY) > drbd --- Tree identification in URL? (e.g. the-tree) > drbd Do you need a specific encoding for this tree ? [yes/NO] > *** Describing tree location How is your tree stored? [FILES/cvs/git/svn/hg/bk] > *** A source directory contains one sub-directory for every version. --- Source directory? (e.g. /home/myself/project-tree) > /opt/lxr/source_code/drbd Name to display for the path root? (e.g. Project or $v for version) [$v] > *** Enumerating versions Label for version selection menu? [Version] > *** Versions can be explicitly enumerated, be read from a file or computed *** by a function. The latter case is recommended for VCS-stored trees. Version enumeration method? [LIST/file/function] > --- Version name? > No default choice, try again... --- Version name? > 0.9 --- Version name? (hit return to stop) > *** By default, first version in list is displayed. You may also indicate *** a prefered version. --- Default displayed version is first in 'range'? [YES/no] > *** Setting directory lists *** Some directories may contain non-public project data (binaries, *** compilers caches, SCM control data, ...). They can be hidden from LXR. --- Directory to ignore, e.g. CVSROOT or CVS? (hit return to stop) > *** If your source code uses "include" statements (#include, require, ...) *** LXR needs hints to resolve the destination file. --- Include directory, e.g. /include? (hit return to stop) > *** Configuring data storage --- Database name? > drbd Do you want to override the global 'lxr' user name? [yes/NO] > Do you want to override the global 'lxr_' table prefix? [yes/NO] >
*** Configure another tree? [YES/no] > , 'shortcaption' => 'drbd' *** Configuring LXR server parameters *** The virtual root is the fixed URL part after the hostname. *** You previously defined the virtual root as /lxr --- Caption in page header? (e.g. Project XYZZY displayed by LXR) > Ceph Do you want a speed switch button for this tree ? [YES/no] > --- Short title for button? (e.g. XYZZY) > Ceph --- Tree identification in URL? (e.g. the-tree) > Ceph Do you need a specific encoding for this tree ? [yes/NO] > *** Describing tree location How is your tree stored? [FILES/cvs/git/svn/hg/bk] > *** A source directory contains one sub-directory for every version. --- Source directory? (e.g. /home/myself/project-tree) > /opt/lxr/source_code/ceph/ Name to display for the path root? (e.g. Project or $v for version) [$v] > *** Enumerating versions Label for version selection menu? [Version] > *** Versions can be explicitly enumerated, be read from a file or computed *** by a function. The latter case is recommended for VCS-stored trees. Version enumeration method? [LIST/file/function] > --- Version name? > master --- Version name? (hit return to stop) > *** By default, first version in list is displayed. You may also indicate *** a prefered version. --- Default displayed version is first in 'range'? [YES/no] > *** Setting directory lists *** Some directories may contain non-public project data (binaries, *** compilers caches, SCM control data, ...). They can be hidden from LXR. --- Directory to ignore, e.g. CVSROOT or CVS? (hit return to stop) > *** If your source code uses "include" statements (#include, require, ...) *** LXR needs hints to resolve the destination file. --- Include directory, e.g. /include? (hit return to stop) > *** Configuring data storage --- Database name? > ceph Do you want to override the global 'lxr' user name? [yes/NO] > Do you want to override the global 'lxr_' table prefix? [yes/NO] >
Before getting into the whole topic, let's design a simple schema for this discussion. Assume the database has two main entities, User and Pod. User is straightforward: it describes a user. Don't worry about what a Pod actually is; it is simply a resource created by a User, so every Pod has a field recording which User created it.
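A minimal sketch of that schema (the table and column names here are made up for illustration, not taken from the original post) could look like this with SQLite:
sqlite3 demo.db <<'EOF'
CREATE TABLE users (
    id   INTEGER PRIMARY KEY,
    name TEXT NOT NULL
);
CREATE TABLE pods (
    id       INTEGER PRIMARY KEY,
    name     TEXT NOT NULL,
    owner_id INTEGER NOT NULL REFERENCES users(id)  -- records which User created the Pod
);
EOF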
class Node( object ): """A virtual network node is simply a shell in a network namespace. We communicate with it using pipes."""
portBase = 0  # Nodes always start with eth0/port0, even in OF 1.0
def__init__( self, name, inNamespace=True, **params ): """name: name of node inNamespace: in network namespace? params: Node parameters (see config() for details)"""
# Make sure class actually works self.checkSetup()
self.name = name self.inNamespace = inNamespace
# Stash configuration parameters for future reference self.params = params
self.intfs = {} # dict of port numbers to interfaces self.ports = {} # dict of interfaces to port numbers # replace with Port objects, eventually ? self.nameToIntf = {} # dict of interface names to Intfs
defstart( self, controllers ): "Start up a new OVS OpenFlow switch using ovs-vsctl" if self.inNamespace: raise Exception( 'OVS kernel switch does not work in a namespace' ) # We should probably call config instead, but this # requires some rethinking... self.cmd( 'ifconfig lo up' ) # Annoyingly, --if-exists option seems not to work self.cmd( 'ovs-vsctl del-br', self ) self.cmd( 'ovs-vsctl add-br', self ) if self.datapath == 'user': self.cmd( 'ovs-vsctl set bridge', self,'datapath_type=netdev' ) int( self.dpid, 16 ) # DPID must be a hex string self.cmd( 'ovs-vsctl -- set Bridge', self, 'other_config:datapath-id=' + self.dpid ) self.cmd( 'ovs-vsctl set-fail-mode', self, self.failMode ) for intf in self.intfList(): ifnot intf.IP(): self.attach( intf ) # Add controllers clist = ' '.join( [ 'tcp:%s:%d' % ( c.IP(), c.port ) for c in controllers ] ) if self.listenPort: clist += ' ptcp:%s' % self.listenPort self.cmd( 'ovs-vsctl set-controller', self, clist ) # Reconnect quickly to controllers (1s vs. 15s max_backoff) for uuid in self.controllerUUIDs(): if uuid.count( '-' ) != 4: # Doesn't look like a UUID continue uuid = uuid.strip() self.cmd( 'ovs-vsctl set Controller', uuid, 'max_backoff=1000' )
defmakeIntfPair( intf1, intf2 ): """Make a veth pair connecting intf1 and intf2. intf1: string, interface intf2: string, interface returns: success boolean""" # Delete any old interfaces with the same names quietRun( 'ip link del ' + intf1 ) quietRun( 'ip link del ' + intf2 ) # Create new pair cmd = 'ip link add name ' + intf1 + ' type veth peer name ' + intf2 return quietRun( cmd )
"Basic interface object that can configure itself."
def __init__( self, name, node=None, port=None, link=None, **params ): """name: interface name (e.g. h1-eth0) node: owning node (where this intf most likely lives) link: parent link if we're part of a link other arguments are passed to config()""" self.node = node self.name = name self.link = link self.mac, self.ip, self.prefixLen = None, None, None # Add to node (and move ourselves if necessary ) node.addIntf( self, port=port ) # Save params for future reference self.params = params self.config( **params )
defconfig( self, mac=None, ip=None, defaultRoute=None, lo='up', **_params ): """Configure Node according to (optional) parameters: mac: MAC address for default interface ip: IP address for default interface ifconfig: arbitrary interface configuration Subclasses should override this method and call the parent class's config(**params)""" # If we were overriding this method, we would call # the superclass config method here as follows: # r = Parent.config( **_params ) r = {} self.setParam( r, 'setMAC', mac=mac ) self.setParam( r, 'setIP', ip=ip ) self.setParam( r, 'setDefaultRoute', defaultRoute=defaultRoute ) # This should be examined self.cmd( 'ifconfig lo ' + lo ) return r
A VPN server is a very useful tool for your network connectivity. Although there are many online VPN services around the world, they tend to be slow, cost money, and you cannot be sure they do not collect your connection data. That is why we sometimes want to build a VPN server ourselves, and this post introduces a way to set up a VPN server on your FreeBSD server.
+
Preface
Install a VPN server on FreeBSD 10.0-Release on Amazon EC2.
set user hwchiu 123456 configures the admin account and password of the web page.
+
set web self 0.0.0.0 5006 sets the listen IP address and port of the web page.
+
+
startup: # configure mpd users set user hwchiu 123456 # configure the console set console self 127.0.0.1 5005 set console open # configure the web server set web self 0.0.0.0 5006 set web open
+
+
Comment out the dialup entry and add pptp_server; we will configure the options of pptp_server later.
+
+
default: #load dialup load pptp_server
+
+
set ippool add pool1 ip_start ip_end sets the private IP range for VPN users. The pool name pool1 must match the one used in set ipcp ranges 192.168.1.1/32 ippool pool1.
+
set ipcp ranges 192.168.1.1/32 ippool pool1 sets the IP address of the server side of the link.
+
set ipcp dns 172.31.0.2 sets the DNS server. In my case, since my machine runs behind EC2, I used the same DNS configuration on my FreeBSD host.
+
set ipcp nbns 172.31.0.2 is used for Windows clients.
+
set pptp self 172.31.18.110: set this to the IP address shown on your network interface.
# Define dynamic IP address pool. set ippool add pool1 192.168.1.50 192.168.1.99 # Create clonable bundle template named B create bundle template B set iface enable proxy-arp set iface idle 1800 set iface enable tcpmssfix set ipcp yes vjcomp # Specify IP address pool for dynamic assigment. set ipcp ranges 192.168.1.1/32 ippool pool1 set ipcp dns 172.31.0.2 set ipcp nbns 172.31.0.2 # The five lines below enable Microsoft Point-to-Point encryption # (MPPE) using the ng_mppc(8) netgraph node type. set bundle enable compression set ccp yes mppc set mppc yes e40 set mppc yes e128 set mppc yes stateless # Create clonable link template named L create link template L pptp # Set bundle template to use set link action bundle B # Multilink adds some overhead, but gives full 1500 MTU. set link enable multilink set link yes acfcomp protocomp set link no pap chap eap set link enable chap # We can use use RADIUS authentication/accounting by including # another config section with label 'radius'. # load radius set link keep-alive 10 60 # We reducing link mtu to avoid GRE packet fragmentation. set link mtu 1460 # Configure PPTP set pptp self 172.31.18.110 # Allow to accept calls set link enable incoming
+
+
+
User configuration
+
cp /usr/local/etc/mpd5/mpd.secret.sample /usr/local/etc/mpd5/mpd.secret The format of mpd.secret is username password ip_address per line.
+
Example
+
+
fred "fred-pw"
joe "foobar" 192.168.1.1
+
System configuration
+
sysctl net.inet.ip.forwarding=1
+
PF configuration
+
Use NAT for the internal private network.
+
Skip the lo interface.
+
Block all traffic and log all packets by default.
+
Pass in TCP traffic on port 1723 (PPTP).
+
Pass in the GRE protocol.
+
Pass in traffic from any to the internal private network and vice versa.
+
Use pfctl -f <file> to reload PF instead of /etc/rc.d/pf restart; the latter will disconnect all existing connections.
my_int = "xn0"
internal_net = "192.168.0.0/16"
external_addr = "172.31.18.110"
nat on $my_int from $internal_net to any -> $external_addr
set skip on lo
block in log all
pass in on $my_int proto tcp from any to any port 1723 keep state
pass in on $my_int proto tcp from any to any port 443 keep state
pass in quick on $my_int proto icmp all keep state
pass in proto gre all keep state
pass in from any to $internal_net
pass in from $internal_net to any
pass out proto { gre, tcp, udp, icmp } all keep state
This article covers a common and well-known algorithm problem, N-Queens. Given an NxN board, we want to count how many ways there are to place N queens so that none of them attacks another, where two queens attack each other if they share a row, a column, or a diagonal. This classic problem has many solutions. The simplest is recursion with pruning, but when N gets large the number of board states explodes and each step becomes too slow. This article therefore uses a bit-based representation for the computation, which finds the answer much faster.
First, assume the following network environment: behind a Router we have two machines, one a Web Server and the other an ordinary PC. Since the PC and the Web Server belong to the same subnet under the same Router, the two machines can reach each other by IP address without any real problem.
+
+
However, if a machine on the external network wants to reach the Web Server, it cannot do so directly, because the Web Server's IP address belongs to a private network, for example the 192.168.0.0/16 range. But if we can deliver the packet to the Router in front, and somehow tell the Router that this packet is not for it and should be forwarded down to the Web Server, the packet reaches the Web Server and the connection completes.
+
The key part of the behaviour above is how the Router knows which packets should be forwarded to the Web Server below it. The usual approach is DNAT (Destination NAT): the Router designates a port number, and whenever it sees a packet destined to that port it forwards the packet to the Web Server, rewriting the packet so that the Web Server can process it.
+
For example, suppose we install the following DNAT rule on the Router:
1.2.3.4:8001 ---> 192.168.1.5:80
+
For the Router, when it sees a packet whose ip:port is 1.2.3.4:8001, it rewrites the packet header to 192.168.1.5:80 and then forwards it to the Web Server below according to the local route rules.
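On a Linux-based router this kind of rule is typically written as a DNAT rule in the nat table's PREROUTING chain. A sketch matching the illustrative addresses above:
# Rewrite packets arriving for 1.2.3.4:8001 so they go to the internal web server
iptables -t nat -A PREROUTING -d 1.2.3.4 -p tcp --dport 8001 -j DNAT --to-destination 192.168.1.5:80
# Forwarding must also be enabled so the rewritten packets are routed onward
sysctl -w net.ipv4.ip_forward=1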
check() { count=`ebtables-save | grep ctc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the ebtables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
check() { count=`iptables-save | grep ctc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the iptables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
check() { count=`ebtables-save | grep ctc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the ebtables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
check() { count=`iptables-save | grep ctc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the iptables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
check() { count=`ebtables-save | grep ctc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the ebtables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
+
Setup iptables
Unlike ebtables, more changes are needed on the iptables side, for the following reasons:
+
+
This scenario is WAN to Container, which means traffic crosses different subnets.
+
Because the environment I am working in is fairly clean, I simply match packets on the WAN IP and the Container IP.
check() { count=`iptables-save | grep wtc| wc -l` if [ "$count" == "0" ]; then echo"Delete Success" else echo"Delete Fail, Use the iptables-save to check what rules still exist" fi }
if [ "$1" == "d" ]; then delete check else insert fi
+
Test
For testing I originally used curl to connect to the nginx container, but curl actually does too much: besides the initial TCP three-way handshake it also performs an HTTP GET. Since we only want to observe the WAN-to-Container round trip, that is more than we need.
If you want to avoid worry, do what Sir William Osler did: live in “DAY-TIGHT COMPARTMENTS”. Don’t stew about the future, just live each day until bedtime.
+
Rule 2
Try the magic formula of Willis H. Carrier:
+
+
Ask yourself, “What is the worst that can possibly happen if I can’t solve my problem?”
+
Prepare yourself mentally to accept the worst - if necessary.
+
Then calmly try to improve upon the worst - which you have already mentally agreed to accept.
+
+
Rule 3
Remind yourself of the exorbitant price you can pay for worry in terms of your health. “Those who do not know how to fight worry die young”
+
+
Basic Techniques in Analyzing Worry
Rule 1
Get the facts. Remember that Dean Hawkes of Columbia University said that “Half the worry in the world is caused by people trying to make decisions before they have sufficient knowledge on which to base a decision”
+
Rule 2
After carefully weighing all the facts, come to a decision.
+
Rule 3
Once a decision is carefully reached, Act! Get busy carrying out your decision and dismiss all anxiety about the outcome.
+
Rule 4
When you or any of your associates, are tempted to worry about a problem, write out and answer the following questions:
+
+
What is the problem?
+
What is the cause of the problem?
+
What are all possible solutions?
+
What is the best solution?
+
+
How to Break the Worry Habit Before It Breaks You
Rule 1
Crowd worry out of your mind by keeping busy. Plenty of action is one of the best therapies ever devised for curing the “wibber gibbers”.
+
Rule 2
Don’t fuss about trifles. Don’t permit little things, the mere termites of life, to ruin your happiness.
+
Rule 3
Use the law of averages to outlaw your worries. Ask yourself, “What are the odds against this thing’s happening at all?”
+
Rule 4
Co-operate with the inevitable. If you know a circumstance is beyond your power to change or revise, say to yourself: “It is so, it cannot be otherwise”.
+
Rule 5
Put a “STOP-LOSS” order on your worries. Decide just how much anxiety a thing may be worth, and refuse to give it any more.
+
Rule 6
Let the past bury its dead. Don’t saw sawdust.
+
Seven Ways to Cultivate a Mental Attitude That Will Bring You Peace and Happiness
Rule 1
Let’s fill our minds with thoughts of peace, courage, health, and hope, for “our life is what our thoughts make it”
+
Rule 2
Let’s never try to get even with our enemies, because if we do we will hurt ourselves far more than we hurt them. Let’s do as General Eisenhower does: let’s never waste a minute thinking about people we don’t like.
+
Rule 3
+
Instead of worrying about ingratitude, let’s expect it. Let’s remember that Jesus healed ten lepers in one day and only one thanked him. Why should we expect more gratitude than Jesus got?
+
Let’s remember that the only way to find happiness is not to expect gratitude, but to give for the joy of giving.
+
Let’s remember that gratitude is a “cultivated” trait; so if we want our children to be grateful, we must train them to be grateful.
+
+
Rule 4
Count your blessing, not your troubles.
+
Rule 5
Let’s not imitate others, let’s find ourselves and be ourselves, for “envy is ignorance” and “imitation is suicide”
+
Rule 6
When fate hands us a lemon, let’s try to make a lemonade.
+
Rule 7
Let’s forget our own unhappiness by trying to create a little happiness for others. “When you are good to others, you are best to yourself.”
+
How to Keep From Worrying About Criticism
+
Rule 1
Unjust criticism is often a disguised compliment. It often means that you have aroused jealousy and envy. Remember that no one ever kicks a dead dog.
+
Rule 2
Do the very best you can; and then put up your old umbrella and keep the rain of criticism from running down the back of your neck.
+
Rule 3
Let’s keep a record of the fool things we have done and criticize ourselves. Since we can’t hope to be perfect, let’s do what E. H. Little did: let’s ask for unbiased, helpful, constructive criticism.
+
Six Ways to Prevent Fatigue and Worry and Keep Your Energy and Spirits High
Rule 1
Rest before you get tired.
+
Rule 2
Learn to relax at your work.
+
Rule 3
Learn to relax at home.
+
Rule 4
Apply these four good working habits.
+
+
Clear your desk of all papers except those relating to the immediate problem at hand.
+
Do things in the order of their importance.
+
When you face a problem, solve it then and there if you have the facts necessary to make a decision.
+
Learn to organize, deputize, and supervise.
+
+
Rule 5
To prevent worry and fatigue, put enthusiasm into your work.
+
Rule 6
Remember, no one was ever killed by lack of sleep. It is worrying about insomnia that does the damage, not the insomnia.
Check the destination: the Ethernet type is IPv4, the IP version is 4, and the IP header length is 20 bytes (5*4).
+
+
if not packet.parsed: if packet.type == ethernet.LLDP_TYPE: return CONTINUE
+
try: dst_mac = (struct.unpack('!I', packet.arr[0:4])[0] << 16) + struct.unpack('!H', packet.arr[4:6])[0] if dst_mac in self.mac_bypass: return CONTINUE
type = struct.unpack('!H', packet.arr[12:14])[0] ipver = struct.unpack('!b', packet.arr[14:15])[0] if type == 0x800 and ipver == 0x45: dst_ip = struct.unpack('!I', packet.arr[30:34])[0] if dst_ip in self.ip_bypass: return CONTINUE except: pass
git clone -b onos-1.11 https://gerrit.opencord.org/quagga cd quagga ./bootstrap.sh ./configure --enable-fpm --sbindir=/usr/lib/quagga make sudo make install cd ..
spinlock_t lock; /* Lock for values below. */ unsignedlong used; /* Last used time (in jiffies). */ u64 packet_count; /* Number of packets matched. */ u64 byte_count; /* Number of bytes matched. */ u8 tcp_flags; /* Union of seen TCP flags. */ };
+
+
This struct stores the info of each flow, including the counters, flow_key and flow_mask.
+
+
ovs_vport_init
vport.c
/** * ovs_vport_init - initialize vport subsystem * * Called at module load time to initialize the vport subsystem. */ intovs_vport_init(void) { dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(base_dpif_classes); i++) { dp_register_provider(base_dpif_classes[i]); } ovsthread_once_done(&once); } }
+
+
Here each of the base_dpif_classes gets registered.
+
+
structdpif_class { .... /* Adds 'netdev' as a new port in 'dpif'. If '*port_no' is not * UINT32_MAX, attempts to use that as the port's port number. * * If port is successfully added, sets '*port_no' to the new port's * port number. Returns EBUSY if caller attempted to choose a port * number, and it was in use. */ int (*port_add)(struct dpif *dpif, struct netdev *netdev, odp_port_t *port_no); ...
In an environment with multiple VMs, balance-slb and balance-tcp are the modes to consider (assuming the goal is to speed things up). The main difference is: with balance-slb, all traffic of a given VM leaves through the same interface, so even when every other VM is idle that VM can still only use one link's worth of bandwidth. With balance-tcp, traffic is spread per connection, so the capacity of every link can be used as much as possible in any situation.
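For reference, the bond and its balancing policy are configured per port with ovs-vsctl; the bridge, bond and interface names below are assumptions for illustration:
# Create a two-interface bond and pick the balancing policy
ovs-vsctl add-bond br0 bond0 eth1 eth2
ovs-vsctl set port bond0 bond_mode=balance-slb   # or balance-tcp, which needs LACP on the peer switch
ovs-vsctl set port bond0 lacp=active             # required when using balance-tcp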
+
+
Other notes
+
Traffic was observed on the Linux PC by reading /proc/net/dev once per second and computing the TX/RX counters of the two slave interfaces.
This post shows what happens in the system when we install Open vSwitch. The Open vSwitch architecture covers both user space and kernel space, and we will see the functions of each part in this post.
+
Environment
+
Three PCs.
+
One for openvswitch (with a 4-port ethernet card).
+
Two for hosts.
+
OVS version 1.9
+
+
+
+
kernel module
insmod datapath/linux/openvswitch.ko
+
When we load the openvswitch kernel module, it registers four generic netlink families: datapath, vport, flow and packet.
+
+
In datapath.c, we can see those four generic netlink types.
+
+
Take vport for example: there are four commands we can execute via this netlink type. If we want the kernel to create a new port, we send a vport-type netlink message with the command OVS_VPORT_CMD_NEW, and the command handler (doit) ovs_vport_cmd_new will be executed to create the new vport.
+
+
ovs-vswitchd
ovsdb-server ... ovs-vswitchd --pidfile --detach
+
+
First, ovsdb-server starts the database daemon. In addition, some user-space tools work with it, such as ovs-vsctl, ovs-ofctl, etc.
+
The user-space process ovs-vswitchd plays an important role for OpenFlow in Open vSwitch. It parses the OpenFlow protocol and handles it (you can use the keyword ofproto to find the related source).
+
+
ovs-vswitchd:
+
+
Process openflow messages
+
Manage the datapath (which actually lives in kernel space)
+
Maintain two flow tables (exact-match flows & wildcard flows)
+
+
+
Adding bridge
ovs-vsctl add-br br0
+
When we execute ovs-vsctl, it sends a command to ovsdb and the DB stores this information. After that, ovsdb passes the command to ovs-vswitchd, and ovs-vswitchd sends a datapath-type netlink message to the kernel. Since we installed the kernel module before, the datapath module receives the netlink message and executes the corresponding command handler; in this case it executes ovs_dp_cmd_new. Finally, the datapath is created and managed by ovs-vswitchd.
+
datapath:
+
+
Maintain one flow table (exact-match flows). This study is based on OVS v1.9.
+
Act as the software switch (look up flow, forward the packet)
+
+
+
Adding vports
ovs-vsctl add-port br0 eth1
+
As in the discussion about the datapath above, ovs-vswitchd sends a netlink message to the kernel, which ends up in the command handler ovs_vport_cmd_new.
+
1. Find the struct net_device object in the kernel by the interface name the user typed (eth1). 2. Change the receive handler of that net_device to Open vSwitch's packet handler.
Set up the controller configuration; this is handled in ovs-vswitchd.
+
+
+
In the following example, we use a simple case to explain how ping works
+
Target command
hostA ping hostB
+
We divide the picture into two parts with the red line.
+
Upper Part
+
+
This part shows the physical view of this case.
+
The middle PC has Ubuntu 12.04 and OVS 1.9 installed.
+
The left PC connects to the OVS NIC eth1
+
The right PC connects to the OVS NIC eth2
+
+
Lower Part
+
+
This part shows the system view of the switch PC (the middle one)
+
We use the dash-line to separate the user-space and kernel-space.
+
+
Analysis
+
After the OVS receives the ICMP packet from the left PC, what happens inside OVS?
+
+
+
The NIC eth1 receives the ICMP packet.
+
Call the receive handler to handle this ICMP packet.
+
Do a flow lookup in the flow table maintained in kernel space. Every flow entry in this table is an exact-match entry, meaning there are no wildcards at all. This speeds up the lookup since we do not need to consider wildcard fields. Open vSwitch uses struct sw_flow_key to represent an exact-match flow.
+
If we find the flow entry in the flow table, execute its flow actions.
+
Otherwise, we need help from the controller, so datapath.ko sends this flow to user space via the upcall function (actually, it is a netlink message).
+
+
What happens when ovs-vswitchd receives the flow from kernel space?
+
+
+
Both exact-match flows and wildcard-match flows are stored in user space (via the OpenFlow protocol).
+
Since exact matching has higher priority than wildcard matching, we look up the exact-match flow table first.
+
Look up the flow entry in user space by exact matching; if we find it, send two netlink messages to the kernel (we will discuss these two netlink messages later).
+
Otherwise, look up the flow entry by wildcard matching; if we find it, generate a corresponding exact-match flow entry and send two netlink messages to the kernel.
+
If we can’t find any flow entry in the flow-table, we issue a Packet_In to the controller.
+
+
After kernel space receives those two netlink messages sent from user space:
+
+
Execute the flow actions of that flow entry.
+
Insert that exact-match flow into the kernel's flow table. That creates a cache for the connection and decreases the processing time for the next packets.
+
+
+
Summary
+
+
There is a limit on the size of the flow table in the kernel; it serves as a cache (exact matching) to speed up lookups, so packets of recently active connections can be handled quickly.
+
The flow table in user space is the same as what the controller sees. It supports wildcard matching; wildcards reduce the number of flow entries, but they add lookup overhead.
+
+
MISC
+
+
You can use ovs-dpctl dump-flows to dump the kernel-space flow table
+
This article is based on OVS v1.9; the architecture has changed somewhat after v1.11
/* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the bottom of vport.h. */ staticconststructvport_ops *vport_ops_list[] = { &ovs_netdev_vport_ops, &ovs_internal_vport_ops, #if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) &ovs_gre_vport_ops, &ovs_gre64_vport_ops, #endif &ovs_vxlan_vport_ops, &ovs_lisp_vport_ops, }; ```
staticvoidnetdev_port_receive(struct vport *vport, struct sk_buff *skb) { if (unlikely(!vport)) goto error; if (unlikely(skb_warn_if_lro(skb))) goto error; /* Make our own copy of the packet. Otherwise we will mangle the * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). * (No one comes after us, since we tell handle_bridge() that we took * the packet.) */ skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return;
/* Link layer. We are guaranteed to have at least the 14 byte Ethernet * header in the linear data area. */ eth = eth_hdr(skb); memcpy(key->eth.src, eth->h_source, ETH_ALEN); memcpy(key->eth.dst, eth->h_dest, ETH_ALEN);
__skb_pull(skb, 2 * ETH_ALEN); /* We are going to push all headers that we pull, so no need to * update skb->csum here. */
if (vlan_tx_tag_present(skb)) key->eth.tci = htons(vlan_get_tci(skb)); elseif (eth->h_proto == htons(ETH_P_8021Q)) if (unlikely(parse_vlan(skb, key))) return -ENOMEM;
key->eth.type = parse_ethertype(skb); if (unlikely(key->eth.type == htons(0))) return -ENOMEM;
/* The memory outside of the 'mask->range' are not set since * further operations on 'dst' only uses contents within * 'mask->range'. */ for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) *d++ = *s++ & *m++; }
/* Execute a list of actions against 'skb'. */ intovs_execute_actions(struct datapath *dp, struct sk_buff *skb) { structsw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); structloop_counter *loop; int error;
/* Check whether we've looped too much. */ loop = &__get_cpu_var(loop_counters); if (unlikely(++loop->count > MAX_LOOPS)) loop->looping = true; if (unlikely(loop->looping)) { error = loop_suppress(dp, acts); kfree_skb(skb); goto out_loop; }
staticintdo_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct nlattr *attr, int len, bool keep_skb) { /* Every output action needs a separate clone of 'skb', but the common * case is just a single output action, so that doing a clone and * then freeing the original skbuff is wasteful. So the following code * is slightly obscure just to avoid that. */ int prev_port = -1; conststructnlattr *a; int rem;
for (a = attr, rem = len; rem > 0; a = nla_next(a, &rem)) { int err = 0;
+Restrictions: +------------- + + - This Support is for Physical NIC. I have tested with Intel NIC only. + - vswitchd userspace datapath does affine polling thread but it is + assumed that devices are on numa node 0. Therefore if device is + attached to non zero numa node switching performance would be + suboptimal. + - There are fixed number of polling thread and fixed number of per + device queues configured. + - Work with 1500 MTU, needs few changes in DPDK lib to fix this issue. + - Currently DPDK port does not make use any offload functionality.
+
The sentence "Currently DPDK port does not make use any offload functionality" caught my attention, but in the latest OVS 2.8 that statement is gone, so the limitation may already have been lifted. We still have no clue why we hit this problem; I will dig into it when I have time, or simply ask on ovs-discuss.
+
+
+
+
+
+
+ This post shows a way to install the cscope into your sublime text3 and then you can use the cscope for your existing programming project.
+
str.translate(table[, deletechars]);
+Parameters
+table -- You can use the maketrans() helper function in the string module to create a translation table.
+
+deletechars -- The list of characters to be removed from the source string.
+
str.maketrans(intab, outtab);
+Parameters
+intab -- This is the string having actual characters.
+
+outtab -- This is the string having corresponding mapping character.
+
+
+
+
+
+
+
+ To set up a kubernetes cluster there are many approaches, both in what infrastructure to run it on (VM/Container/Native) and what tools to use (kubeadm, kubespray, minikube). I chose kubeadm to install a native kubernetes on my laptop (Fedora), and you will see all the steps to install the requirements, including docker and kubernetes.
+
+ DNS has always played an important role in traditional network architectures: it lets a client reach the target server by FQDN instead of hard-coding its IP address. In a kubernetes cluster, kubernetes sets up a DNS server by default for every Pod it creates. For ordinary users, being able to reach the outside network and Kubernetes Services is enough, but in some special scenarios, such as the NFV architectures promoted for years now, our service (Pod) may need a more complex network setup, for example separate Data and Control networks. In that case the Pod has to take special care of the DNS server used by the services inside it. This article focuses on the DNS-related settings of a Pod and shows how to use them by deploying actual YAML files.
+
+ As we know, the kubernetes use the CNI to provide the network connectivity for its Pod unit and the cluster administrator can choose what kind of the CNI should be installed in the cluster. For example, if the only requirement is the overlay network, you can choose the flannel CNI and choose the calico CNI if you have the requirement of the BGP. In this post, we will learn how to write your own CNI in golang language. Actually, You can implement it with any language as you like.
+
+
+
This slide deck first introduces the limitations of the current Linux network stack, including how the kernel receives packets from the network interface card.
+
It then introduces two high-performance networking mechanisms, DPDK and RDMA: the concepts behind them and how they work.
+
In the end, it introduces a paper about using RDMA to accelerate training time in the TensorFlow framework.
These slides first introduce the basic concept of how docker pulls images from docker hub and a private registry, and then assume a simple scenario of doing CI/CD for applications in a kubernetes cluster.
+
In that scenario, you use a kubernetes Pod to build your application into a docker image, push it to the private registry, and then pull that image to run as another Pod in the kubernetes cluster.
+
The slides show every problem you will meet, including how to deploy your private registry, the network connectivity between the k8s node, the k8s Pod and the private registry, and a trusted SSL certificate.
+
In the last section, the author provides a simple solution for that scenario and uses a simple graph to explain how it works.
+
+
+
+
+
+
+ Container Network Interface (CNI) is a network interface between the network solution and the container runtime. Without CNI, a network solution developer would have to implement a plugin for every container environment, which would be a disaster. Fortunately, with CNI the developer only needs to target one interface and it works for every container runtime. In this post, we will see why we need CNI, what CNI is, and how kubernetes uses CNI to provide network connectivity for its computing unit, the Pod.
+
+ The most important feature of containers is resource isolation, including the mount, network, user, UTS and PID namespaces; that is why we cannot see those host resources from inside a container. Resource isolation is provided by the Linux kernel, and we will demonstrate the networking part with network namespaces, and also show how docker uses network namespaces and a Linux bridge to provide network connectivity for each container.
+
+
+
blktrace is a block layer IO tracing mechanism which provide detailed information about request queue operations up to user space.
+
blkparse combines the streams of events for the various devices on the various CPUs and produces formatted output of the event information. It takes the output of the blktrace tool above and converts it into a more readable form.
+
In the following, we will use blktrace and blkparse to observe the sector numbers written by fio requests. We will use fio to generate requests with two different IO patterns: sequential write and random write.
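A minimal sketch of that workflow (the device name and fio parameters are illustrative; the trace should run while fio is issuing IO):
# Terminal 1: trace the block device and parse the events live
blktrace -d /dev/sdb -o - | blkparse -i -
# Terminal 2: generate the two write patterns with fio
fio --name=seq-write  --filename=/dev/sdb --rw=write     --bs=4k --size=64M --direct=1
fio --name=rand-write --filename=/dev/sdb --rw=randwrite --bs=4k --size=64M --direct=1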
RDMA (Remote Direct Memory Access) is a mechanism that allows a host to access (read, write) memory on a remote host without interrupting the remote CPU.
+
The advantage of RDMA
+
Zero-copy
+
Kernel bypass
+
No CPU involvement
+
+
+
+
+
With RDMA, our data can be transferred without involving the Linux kernel network stack, providing high performance, low latency and low CPU consumption.
+
This article focuses on how to enable Ceph with RDMA, including how to install Ceph and enable the RDMA function.
BlueZ is official Linux Bluetooth protocol stack. It is an Open Source project distributed under GNU General Public License (GPL). BlueZ kernel is part of the official Linux kernel since version 2.4.6.
+
+
+
+
+
+
Google's own services, such as Gmail and Google Search, are each fronted by one or more IP addresses called Virtual IPs (VIPs). They are "virtual" because these addresses do not live on any physical NIC; they exist only so that routers on the Internet can steer packets toward Google's servers. The packets are then delivered to the Maglev machines, which look up the service the VIP belongs to and forward the packets to the real backend servers.
+
Take Gmail as an example. When a user connects to gmail.com, the client first asks DNS for the IP address of gmail.com. From Google's point of view, the address DNS returns is a VIP, and the user's machine sends its request packets toward the router responsible for that VIP. When the router receives a packet destined for the VIP, it forwards it to the Maglev machines below it, using Equal Cost Multi-Path (ECMP) routing on the Router -> Maglev hop so that the Maglev machines receive requests as evenly as possible. In my view this is itself a simple form of load balancing, except that the targets are the Maglev machines rather than the real backend servers.
This post is part of my paper-reading series; this time the subject is a data-center networking paper from SIGCOMM 2017. SIGCOMM consistently publishes high-quality networking papers, and besides academia, industry often submits its research and product designs there, so it is a great resource for learning networking concepts. The paper discussed here is Re-architecting datacenter networks and stacks for low latency and high performance. Its goal is to build a data center with genuinely high transfer performance, and it touches many layers, from the switch implementation up to modifications of TCP, examining why the traditional protocols fail to reach truly high performance. It is an excellent paper from which you can learn a great deal, and well worth reading.
The scenario of this paper is the data center, not the general Internet, mainly because the Internet is full of unknowns and uncontrollable devices, such as the bandwidth of each link and the configuration of each switch. With so many unknowns, it is impossible to design a scheme that satisfies both low latency and high throughput, so only the data-center environment is considered.
+
The paper proposes NDP (New Datacenter Protocol Architecture); its concrete goals, the systems it is compared against, and its implementations are as follows.
+
The requirements the paper aims to satisfy are:
+
+
Latency as low as possible for short flows
+
Throughput as high as possible for long flows
+
The ability, under heavy load, to use every link in the topology to carry traffic (high network capacity)
+
Minimal impact on the network when incast occurs
+
+
The systems it is compared against are:
+
+
DCTCP (Data Center TCP)
+
MPTCP (Multi Path TCP)
+
DCQCN (Data Center QCN)
+
DCQCN is a congestion control protocol for large scale RDMA networks, developed jointly by Microsoft and Mellanox.
Data centers have grown rapidly in recent years. To provide better internal network performance, whether low latency or high throughput, their network architecture has gradually shifted from the early three-tier design toward Clos networks; however, traditional TCP was never designed for the data center, so it cannot meet these requirements. The paper therefore proposes NDP (a new data-center transport architecture), which delivers near-optimal completion times for short transfers and high performance for a wide range of workloads, including incast. In NDP, switches use very small buffers; when a buffer fills up, instead of dropping the packet as is traditionally done, the switch trims the payload, keeps only the header, and forwards that header with high priority. This gives the receiver much more complete information about the sender and allows it to adjust the sending rate dynamically. The authors implement NDP in a software switch, in a hardware switch, and as a regular NDP application, using DPDK, P4, and NetFPGA respectively. Finally, a series of evaluations and large-scale simulations shows that NDP provides low latency and high throughput across the whole data center.
+
Introduction
With the explosive growth of data centers, all kinds of new mechanisms and improvements to existing ones have appeared to satisfy the internal transfer requirements (low latency / high throughput): DCTCP as an improvement on TCP, the long-established MPTCP, and even RDMA (RoCE). For more on RDMA/RoCE, see RDMA Introduction (一). In RDMA networks, to reduce the impact of packet loss on transfers, people try to make the whole fabric lossless, using Ethernet Flow Control, Explicit Congestion Notification (ECN) or Priority Flow Control (PFC). However, at SIGCOMM 2016 Microsoft published a paper on the problems of RDMA + PFC: although PFC can throttle the sending rate to achieve a lossless network, once the network is heavily congested the contention between data packets and PFC control packets means the network can no longer deliver low latency, and the paper concludes that "how to achieve low network latency and high network throughput at the same time for RDMA is still an open problem."
What distinguishes a data center from the Internet is that every switch and link in the topology is under your own control, so TCP slow start is actually inefficient there: the available bandwidth is known from the beginning, so there is no need to be as pessimistic as on the Internet and grow the window slowly. Instead, the sender can optimistically start at the maximum rate and then fine-tune based on conditions. With such a mechanism, applications can push packets out much faster.
Assuming we already have the Zero-RTT and per-packet ECMP properties, the packets of a new connection may arrive out of order. For TCP, out-of-order delivery triggers the congestion-control machinery and slows the connection down, so NDP must be designed to handle this situation without relying on packets arriving in order.
+
Optimized for Incast
Even when the data-center network environment, such as link bandwidth, is known up front, incast remains hard to handle: the network changes constantly, and some applications may suddenly generate a burst of packets that arrive at a switch at the same time and get dropped. NDP is therefore also designed to address this problem.
+
Switch Service Model
Inside a data center, besides the applications themselves and the transport protocol, there is another property that matters just as much and interacts closely with the other two in shaping how the whole network behaves: the switch service model. The authors argue that the key question in this model is what the switch does when a port becomes congested; the answer completely shapes the design of the transport protocol and the congestion-control algorithms, and even affects forwarding concepts such as per-packet ECMP. They note that there are many existing congestion-handling mechanisms, such as treating loss as a congestion signal and duplicate or selective ACKs, where duplicate or selective ACKs actively trigger retransmissions. That technique works well for long-lived connections but poorly for short, latency-sensitive ones, because every retransmission has to wait out an RTO (retransmission timeout), and however that timeout is tuned it has negative side effects, so it is not a universal solution.
One paper that discusses the problem above: C. Raiciu, S. Barre, C. Pluntke, A. Greenhalgh, D. Wischik, and M. Handley. Improving datacenter performance and robustness with Multipath TCP. In Proc. ACM SIGCOMM, Aug. 2011.
This post is also part of the paper-reading series, this time on an application of high-speed networking (RDMA), from SIGCOMM 2017. What makes this paper interesting is that instead of merely presenting an architecture, it takes a real application, shows how pairing it with RDMA yields zero-copy behaviour, and measures how much performance the application gains. The paper is Towards Zero Copy Dataflows using RDMA. It concerns AI training: it uses Google's open-source training framework, TensorFlow, and analyses how, in the original distributed training model, data is copied heavily between CPU/GPU, user space/kernel space, and even across nodes. With the help of RDMA those copies are reduced, and the paper shows that the total distributed training time drops significantly. It is a very interesting short paper.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Preface
An interesting paper appeared at SIGCOMM 2017: Towards Zero Copy Dataflows using RDMA; you can read the paper here.
Remote Direct Memory Access (RDMA) offers ultra-low latency and CPU bypass networking to application programmers. Existing applications are often designed around socket based software stack that manages application buffers separately from networking buffers and do memory copies between them when sending/receiving data.
With large sized (up to hundreds MB) application buffers, the cost of such copies adds non trivial overhead to the end-to-end communication pipeline.
+
+
This is the crux of the problem: the overhead introduced by copying large buffers around, which is exactly what this paper wants to eliminate.
+
+
In this work, we made an attempt to design a zero copy transport for distributed dataflow frameworks that unifies application and networking buffer management and completely eliminates unnecessary memory copies.
+
+
This states how the paper tackles the problem above, and it says it plainly: a zero-copy transport for distributed dataflow frameworks.
+
+
Our prototype on top of TensorFlow shows 2.43x performance improvement over gRPC based transport and 1.21x performance improvement over an alternative RDMA transport with private buffers and memory copies.
str.translate(table[, deletechars]);
+Parameters
+table -- You can use the maketrans() helper function in the string module to create a translation table.
+
+deletechars -- The list of characters to be removed from the source string.
+
str.maketrans(intab, outtab);
+Parameters
+intab -- This is the string having actual characters.
+
+outtab -- This is the string having the corresponding mapping characters.
+
Now that the switch is ready, the next step is to create two separate network namespaces; here we use the ip netns command to do it. First run ip netns help to see which subcommands are available.
+
> ip netns help
Usage: ip netns list
       ip netns add NAME
       ip netns delete NAME
       ip netns identify PID
       ip netns pids NAME
       ip netns exec NAME cmd ...
       ip netns monitor
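The two namespaces themselves have to exist before we attach anything to them; assuming the names ns1 and ns2 used in the rest of this post, that is simply:
> ip netns add ns1
> ip netns add ns2
> ip netns list          # should now show ns1 and ns2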
After running the following commands, you can see the newly created interfaces with ifconfig or ip link.
+
ip link add name vet-n1 type veth peer name ovs-1
ip link add name vet-n2 type veth peer name ovs-2
ifconfig vet-n1 up
ifconfig vet-n2 up
ifconfig ovs-1 up
ifconfig ovs-2 up
+
At this point the whole setup looks like the figure below; it is starting to take shape, and we are only one small step away from the goal.
+
Step4
With all the preparation above, everything we need is in place. What remains is to move the interfaces we created into the right places and give them IP addresses in the same subnet; then Open vSwitch can forward packets between them as an L2 bridge.
+
Here we keep using the ip command. First we move the vet-n1/vet-n2 interfaces we just created into ns1/ns2 with the following commands.
+
The general form is ip link set $interface netns $ns; applied to our environment, it becomes:
+
ip link set vet-n1 netns ns1
ip link set vet-n2 netns ns2
+
After running these commands, ip link will show that the vet-n1/vet-n2 interfaces have completely disappeared: they have been moved from the Ubuntu host into the network namespaces ns1/ns2 created above.
+
Next we use ip netns exec to enter ns1/ns2 and configure the network there. We need to do the following:
+
+
Rename the vet-n1/vet-n2 interfaces we just moved in to eth0 (purely for readability)
+
Bring up eth0 and lo
+
Assign eth0 an IP address and subnet.
+
+
So the commands look roughly like this:
+
> ip netns exec ns1 bash
> ip link set vet-n1 name eth0
> ip addr add 10.0.0.101/24 dev eth0
> ip link set eth0 up
> ip link set lo up
> exit
+
The commands above take care of everything for ns1. Repeat the same steps for ns2 (shown below for completeness); the only thing to remember is not to reuse the same IP address. Once that is done, the system looks like the figure below.
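For reference, here is what the ns2 side would look like, mirroring the ns1 commands above with a different address (10.0.0.102 is just an example in the same subnet), plus a quick connectivity check:
> ip netns exec ns2 bash
> ip link set vet-n2 name eth0
> ip addr add 10.0.0.102/24 dev eth0
> ip link set eth0 up
> ip link set lo up
> exit
> ip netns exec ns1 ping -c 3 10.0.0.102     # the two namespaces should now reach each other through the switch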
This slide deck first introduces the limitations of the current Linux network stack, including how the kernel receives packets from the network interface card.
+
It then introduces two high-performance networking mechanisms, DPDK and RDMA: what they are and how they work.
+
Finally, it introduces a paper about using RDMA to accelerate training in the TensorFlow framework.
These slides first introduce the basic concept of how Docker pulls images from Docker Hub and from a private registry, and then walk through a simple scenario of doing CI/CD for applications in a Kubernetes cluster.
+
In that scenario, you use a Kubernetes Pod to build your application into a Docker image, push it to the private registry, and then pull that image to run it as another Pod in the Kubernetes cluster.
+
The slides cover every problem you will meet along the way, including how to deploy the private registry, the network connectivity between the k8s Nodes/Pods and the registry, and obtaining a trusted SSL certificate.
+
In the last section, the author provides a simple solution for that scenario and uses a simple diagram to explain how it works.
if (msg->msg_name) {
	struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
	if (msg->msg_namelen < sizeof(*usin))
		return -EINVAL;
	if (usin->sin_family != AF_INET) {
		if (usin->sin_family != AF_UNSPEC)
			return -EAFNOSUPPORT;
	}
	daddr = usin->sin_addr.s_addr;
	dport = usin->sin_port;
	if (dport == 0)
		return -EINVAL;
} else {
	if (sk->sk_state != TCP_ESTABLISHED)
		return -EDESTADDRREQ;
	daddr = inet->inet_daddr;
	dport = inet->inet_dport;
	/* Open fast path for connected socket.
	   Route will not be used, if at least one option is set. */
	connected = 1;
}
NCurses Disk Usage (ncdu) is a powerful tool for viewing file sizes across different directories in a simple and friendly text UI. Besides reading, it also lets you perform some operations, such as deleting a file or directory. In this post, I will introduce what ncdu is and how to use it to replace the legacy command du.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Preface
We usually use the command df to see the current capacity and usage of each mount point, and the command du to see the file sizes under the current directory. With du we can pass options to limit the depth of the directory traversal and thereby get the total size of a directory. However, it is not convenient for an administrator to compare file sizes across different directories: you have to run du many times and record the results somewhere for later use.
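For reference, the traditional workflow described above looks roughly like this (the /var path and the depth are arbitrary examples):
df -h                        # capacity and usage of every mount point
du -h -d 1 /var              # total size of each directory one level under /var
du -k -d 1 /var | sort -n    # numeric sizes so that sort works, at the cost of readability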
+
Fortunately, we have NCurses Disk Usage (ncdu), which provides a friendly interface for administrators to inspect files and directories; you can find more details on its official website.
+
Install
You can download the source code from the official website, compile it, and then install it on your system.
+
If you’re prfer to download the pre-configured software from some package system, you can use the following command to install the ncdu but it’s depend on what package management system you use.
+
MacOS (Brew)
brew install ncdu
+
Ubuntu (apt system)
apt-get install ncdu
+
Usage
Change to any directory whose file sizes you want to inspect and then execute the command ncdu.
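A few invocations I find useful (the -x/-o/-f flags are standard ncdu options, but check the man page of your version):
ncdu                         # scan and browse the current directory
ncdu -x /                    # scan / but do not cross filesystem boundaries
ncdu -o scan.out /home       # export the scan result to a file ...
ncdu -f scan.out             # ... and browse it later without rescanning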
+
First, ncdu recursively collects the file information under the current directory. This takes a while, depending on how large your directory tree is.
+
You will see the following picture in your terminal.
+
Once the collection is done, it displays the size of each file and directory (the total size, in the case of a directory) under the current directory.
+
Format
The output format is straightforward.
+
+
First column:
+
The file size, automatically converted to a human-readable unit.
+
+
+
Second column
+
The percentage of the file/directory relative to the whole scanned directory. By default the ratio is shown with a bar of # symbols; press g to toggle a numeric display instead.
+
+
+
Third column
+
The file/directory name
+
+
+
+
+
Operation
Navigation
The basic operation is navigation: use the arrow keys (up/down) or k/j to move the cursor up and down.
+
Open
The feature I find better than the legacy command du is that ncdu supports an open operation. Use the right/left arrow keys to open a directory and make it the new root, or to go back to the previous root directory. With this feature, we no longer need to run du many times to see the whole picture.
+
Delete
Besides, ncdu also provides a delete operation that lets you delete a file or a whole directory from the current window.
+
You can see the help screen by pressing the ? key.
+
+
Summary
I used to use the command du to inspect file and directory sizes and pipe its output into sort. That workflow has problems that bothered me for a long time: if du prints human-readable sizes, the output is hard to sort, but if it prints numeric sizes, it sorts well and reads poorly. ncdu has neither problem, and it also supports deleting entries and changing the current root directory, which is why I switched to ncdu as soon as I found this powerful tool.
As mentioned at the beginning of this post, some applications need to interact closely with Kubernetes. The project demonstrated here uses the client-go library to build a very simple application that creates Pods on its own. A simple test is written for that function: it calls the function to create a Kubernetes Pod, verifies that the Pod was actually created, and finally deletes it.
before_install runs before the install step, which is meant to install any required packages or dependencies. You can prepare things before you run this step, or you can e.g. run sudo apt-get update to refresh the apt indexes.
before_script runs before the actual test/build script runs. It’s commonly used to run any preparation steps required to get the build running, for instance copy database configurations, set up any additional environment configuration, and so on.
set encoding=utf-8
set fileencodings=ucs-bom,utf-8,big5,latin1
set fileencoding=utf-8
set termencoding=utf-8
set number            " line numbers
set statusline=%<\ %n:%f\ %m%r%y%=%-35.(line:\ %l\ of\ %L,\ col:\ %c%V\ (%P)%)
set ai                " auto indent
syntax on             " syntax highlighting

set tabstop=4         " a tab is displayed as four spaces
set shiftwidth=4      " indent width, used together with set cin
set cin
set cursorline        " highlight the current line
set t_Co=256          " 256-color support
set textwidth=0
set backspace=2       " backspace deletes across indentation and line breaks, back into the previous line
set showmatch         " highlight matching brackets
set nocompatible      " use vim features instead of vi compatibility

" Pathogen
call pathogen#infect()
call pathogen#helptags()
filetype plugin indent on

" Nerdtree
autocmd VimEnter * NERDTree
autocmd VimEnter * wincmd p
let NERDTreeShowBookmarks=1
let NERDTreeChDirMode=0
let NERDTreeQuitOnOpen=0
let NERDTreeMouseMode=2
let NERDTreeShowHidden=1
let NERDTreeIgnore=['\.pyc','\~$','\.swo$','\.swp$','\.git','\.hg','\.svn','\.bzr']
let NERDTreeKeepTreeInNewTab=1
let g:nerdtree_tabs_open_on_gui_startup=0
private void CreateGraph( ZedGraphControl zgc )
{
    // get a reference to the GraphPane
    GraphPane myPane = zgc.GraphPane;

    // Set the Titles
    myPane.Title.Text = "My Test Graph\n(For CodeProject Sample)";
    myPane.XAxis.Title.Text = "My X Axis";
    myPane.YAxis.Title.Text = "My Y Axis";

    // Make up some data arrays based on the Sine function
    double x, y1, y2;
    PointPairList list1 = new PointPairList();
    PointPairList list2 = new PointPairList();
    for ( int i = 0; i < 36; i++ )
    {
        x = (double)i + 5;
        y1 = 1.5 + Math.Sin( (double)i * 0.2 );
        y2 = 3.0 * ( 1.5 + Math.Sin( (double)i * 0.2 ) );
        list1.Add( x, y1 );
        list2.Add( x, y2 );
    }

    // Generate a red curve with diamond symbols, and "Porsche" in the legend
    LineItem myCurve = myPane.AddCurve( "Porsche", list1, Color.Red, SymbolType.Diamond );

    // Generate a blue curve with circle symbols, and "Piper" in the legend
    LineItem myCurve2 = myPane.AddCurve( "Piper", list2, Color.Blue, SymbolType.Circle );

    // Tell ZedGraph to refigure the axes since the data have changed
    zgc.AxisChange();
}