forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
repair.go
120 lines (106 loc) · 4.35 KB
/
repair.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
Copyright 2015 The Kubernetes Authors All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"fmt"
"net"
"time"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/registry/service"
"github.com/GoogleCloudPlatform/kubernetes/pkg/registry/service/ipallocator"
"github.com/GoogleCloudPlatform/kubernetes/pkg/util"
)
// Repair is a controller loop that periodically examines all service PortalIP allocations
// and logs any errors, and then sets the compacted and accurate list of all allocated IPs.
//
// Handles:
// * Duplicate PortalIP assignments caused by operator action or undetected race conditions
// * PortalIPs that do not match the current portal network
// * Allocations to services that were not actually created due to a crash or power loss
// * Migrates old versions of Kubernetes services into the atomic ipallocator model automatically
//
// Can be run at infrequent intervals, and is best performed on startup of the master.
// Is level driven and idempotent - all valid PortalIPs will be updated into the ipallocator
// map at the end of a single execution loop if no race is encountered.
//
// TODO: allocate new IPs if necessary
// TODO: perform repair?
type Repair struct {
// interval is the period handed to util.Until by RunUntil.
interval time.Duration
// registry is the service registry whose services are listed on each pass.
registry service.Registry
// network is the CIDR from which the allocation range is rebuilt on each pass.
network *net.IPNet
// alloc is used to Get the stored allocation record and to CreateOrUpdate it afterwards.
alloc service.IPRegistry
}
// NewRepair builds a Repair controller from its four collaborators: the period
// between passes, the service registry to list from, the portal network CIDR,
// and the IP registry holding the stored allocation record. The returned
// controller periodically ensures that all portalIPs are uniquely allocated
// across the cluster and generates informational warnings for a cluster that
// is not in sync.
func NewRepair(interval time.Duration, registry service.Registry, network *net.IPNet, alloc service.IPRegistry) *Repair {
	repair := new(Repair)
	repair.interval = interval
	repair.registry = registry
	repair.network = network
	repair.alloc = alloc
	return repair
}
// RunUntil starts the controller and keeps it running until the provided ch is
// closed. Each pass calls RunOnce; a non-nil error from a pass is forwarded to
// util.HandleError rather than stopping the loop here.
func (c *Repair) RunUntil(ch chan struct{}) {
	pass := func() {
		err := c.RunOnce()
		if err == nil {
			return
		}
		util.HandleError(err)
	}
	util.Until(pass, c.interval, ch)
}
// RunOnce verifies the state of the portal IP allocations and returns an error if an unrecoverable problem occurs.
//
// A single pass does the following:
//  1. Reads the stored allocation record via c.alloc.Get.
//  2. Lists services using a context built with api.NamespaceAll.
//  3. Creates a new range with ipallocator.NewCIDRRange(c.network) and
//     allocates every service's PortalIP into it.
//  4. Hands the record and the rebuilt range to service.SnapshotRange and
//     writes the record back via c.alloc.CreateOrUpdate.
//
// Per-service problems (an unparseable PortalIP, an IP already claimed by an
// earlier service in the list, an IP outside the CIDR) are reported through
// util.HandleError and the loop moves on to the next service. A full range or
// an unrecognized allocator error aborts the pass and is returned, as are
// failures to read the record, list services, or persist the result.
//
// NOTE(review): the record is fetched before the services are listed,
// presumably so that a concurrent modification is detected when it is written
// back — confirm against the IPRegistry implementation.
func (c *Repair) RunOnce() error {
	latest, err := c.alloc.Get()
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}

	ctx := api.WithNamespace(api.NewDefaultContext(), api.NamespaceAll)
	list, err := c.registry.ListServices(ctx)
	if err != nil {
		return fmt.Errorf("unable to refresh the service IP block: %v", err)
	}

	r := ipallocator.NewCIDRRange(c.network)
	for _, svc := range list.Items {
		if !api.IsServiceIPSet(&svc) {
			continue
		}
		ip := net.ParseIP(svc.Spec.PortalIP)
		if ip == nil {
			// The stored portal IP cannot be parsed; report it and skip the
			// service (automatic reallocation is not implemented, see TODOs
			// on Repair).
			util.HandleError(fmt.Errorf("the portal IP %s for service %s/%s is not a valid IP; please recreate", svc.Spec.PortalIP, svc.Name, svc.Namespace))
			continue
		}
		switch err := r.Allocate(ip); err {
		case nil:
			// Recorded in the rebuilt range.
		case ipallocator.ErrAllocated:
			// TODO: send event
			// An earlier service in the list already holds this IP; report
			// the duplicate and keep going.
			util.HandleError(fmt.Errorf("the portal IP %s for service %s/%s was assigned to multiple services; please recreate", ip, svc.Name, svc.Namespace))
		case ipallocator.ErrNotInRange:
			// TODO: send event
			// The IP lies outside c.network; report it and keep going.
			util.HandleError(fmt.Errorf("the portal IP %s for service %s/%s is not within the service CIDR %s; please recreate", ip, svc.Name, svc.Namespace, c.network))
		case ipallocator.ErrFull:
			// TODO: send event
			// The format string needs the CIDR as its argument; without it
			// the message rendered as "%!s(MISSING)".
			return fmt.Errorf("the service CIDR %s is full; you must widen the CIDR in order to create new services", c.network)
		default:
			return fmt.Errorf("unable to allocate portal IP %s for service %s/%s due to an unknown error, exiting: %v", ip, svc.Name, svc.Namespace, err)
		}
	}

	service.SnapshotRange(latest, r)

	if err := c.alloc.CreateOrUpdate(latest); err != nil {
		return fmt.Errorf("unable to persist the updated service IP allocations: %v", err)
	}
	return nil
}