In [1]:
from numpy import *


def loadDataSet(fileName):  #general function to parse tab -delimited floats
    dataMat = []  #assume last column is target value
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        for i in range(len(curLine)):
            curLine[i] = float(curLine[i])
        dataMat.append(curLine)
    return dataMat


def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2)))


def randCent(dataSet, k):
    n = shape(dataSet)[1]
    centroids = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(dataSet[:, j]) - minJ)
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))
    return centroids


datMat = mat(loadDataSet('testSet.txt'))
randCent(datMat, 2)

matrix([[-0.41027118, -3.38196001],
        [ 3.62454065,  2.75254609]])

In [2]:
datMat[:, 0].min(), datMat[:, 0].max(), datMat[:, 1].min(), datMat[:, 1].max()

(-5.379713, 4.838138, -4.232586, 5.1904)

In [3]:
distEclud(datMat[0], datMat[1])

5.184632816681332

In [4]:
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))  #create mat to assign data points
    #to a centroid, also holds SE of each point
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex: clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        print(centroids)
        for cent in range(k):  #recalculate centroids
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)
                                 [0]]  #get all the point in this cluster
            centroids[cent, :] = mean(ptsInClust,
                                      axis=0)  #assign centroid to mean
    return centroids, clusterAssment


kMeans(datMat, 4)

[[-3.09607558 -4.16880927]
 [ 3.80978619 -2.45850965]
 [ 1.58581126  4.35758987]
 [-4.49084032 -3.74599714]]
[[-2.5660635  -3.0692904 ]
 [ 2.8692781  -2.54779119]
 [ 0.14460654  3.09399208]
 [-4.01947533 -2.16142433]]
[[-2.38267313 -3.20383625]
 [ 2.8692781  -2.54779119]
 [ 0.14460654  3.09399208]
 [-3.91663957 -2.21423614]]
[[-2.32402057 -3.35442629]
 [ 2.8692781  -2.54779119]
 [ 0.14460654  3.09399208]
 [-3.84174633 -2.20993413]]
[[-2.28373583 -3.578145  ]
 [ 2.8692781  -2.54779119]
 [ 0.14460654  3.09399208]
 [-3.76199525 -2.19757038]]


(matrix([[-2.28373583, -3.578145  ],
         [ 2.8692781 , -2.54779119],
         [ 0.14460654,  3.09399208],
         [-3.76199525, -2.19757038]]),
 matrix([[2.00000000e+00, 3.71216595e+00],
         [2.00000000e+00, 1.30568336e+01],
         [1.00000000e+00, 5.82592950e+00],
         [3.00000000e+00, 3.97314928e+00],
         [2.00000000e+00, 7.14381631e-01],
         [3.00000000e+00, 1.39444593e+01],
         [1.00000000e+00, 6.41909733e+00],
         [3.00000000e+00, 2.99424571e-01],
         [2.00000000e+00, 8.61879660e+00],
         [2.00000000e+00, 1.09066425e+01],
         [1.00000000e+00, 2.19619091e+00],
         [0.00000000e+00, 4.82351606e-01],
         [2.00000000e+00, 1.65245813e+01],
         [2.00000000e+00, 5.16625612e+00],
         [1.00000000e+00, 8.96547453e+00],
         [0.00000000e+00, 3.72592046e+00],
         [2.00000000e+00, 9.53061512e+00],
         [2.00000000e+00, 9.35852077e-01],
         [1.00000000e+00, 6.60448582e-01],
         [3.00000000e+00, 3.27940

In [6]:
def biKmeans(dataSet, k, distMeas=distEclud):
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m, 2)))
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]  #create a list with one centroid
    for j in range(m):  #calc initial Error
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :])**2
    while (len(centList) < k):
        lowestSSE = inf
        for i in range(len(centList)):
            ptsInCurrCluster = dataSet[nonzero(
                clusterAssment[:, 0].A ==
                i)[0], :]  #get the data points currently in cluster i
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            sseSplit = sum(
                splitClustAss[:, 1])  #compare the SSE to the currrent minimum
            sseNotSplit = sum(
                clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0],
                     0] = len(centList)  #change 1 to 3,4, or whatever
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0],
                     0] = bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[
            0]  #replace a centroid with two best centroids
        centList.append(bestNewCents[1, :].tolist()[0])
        clusterAssment[nonzero(
            clusterAssment[:, 0].A == bestCentToSplit
        )[0], :] = bestClustAss  #reassign new clusters, and SSE
    return mat(centList), clusterAssment


biKmeans(datMat, 4)

[[ 2.80217871  4.4455403 ]
 [ 1.89196769 -1.81039396]]
[[ 0.34421986  3.07632743]
 [-0.48895795 -2.54604695]]
[[ 0.08249337  2.94802785]
 [-0.2897198  -2.83942545]]
sseSplit, and notSplit:  792.9168565373268 0.0
the bestCentToSplit is:  0
the len of bestClustAss is:  80
[[3.74843532 3.29378414]
 [0.1190672  1.90485586]]
[[ 2.71358074  3.11839563]
 [-2.29801424  2.79388557]]
[[ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]]
sseSplit, and notSplit:  66.36683512000786 466.63278133614426
[[-3.00689692 -0.81281695]
 [ 0.56446681 -1.4708354 ]]
[[-3.53973889 -2.89384326]
 [ 2.65077367 -2.79019029]]
sseSplit, and notSplit:  84.25921395268443 326.2840752011824
the bestCentToSplit is:  1
the len of bestClustAss is:  40
[[ 3.91126049  4.22239662]
 [-2.54451044  3.29644494]]
[[ 2.6265299   3.10868015]
 [-2.46154315  2.78737555]]
sseSplit, and notSplit:  66.36683512000786 84.25921395268443
[[-4.08032343 -2.2534253 ]
 [-3.2328623  -2.44447403]]
[[-4.332724   -2.90944687]
 [-2.96302245 -2.882495

(matrix([[ 2.6265299 ,  3.10868015],
         [-3.53973889, -2.89384326],
         [ 2.65077367, -2.79019029],
         [-2.46154315,  2.78737555]]),
 matrix([[ 0.        ,  2.3201915 ],
         [ 3.        ,  1.39004893],
         [ 2.        ,  7.46974076],
         [ 1.        ,  3.60477283],
         [ 0.        ,  2.7696782 ],
         [ 3.        ,  2.80101213],
         [ 2.        ,  5.10287596],
         [ 1.        ,  1.37029303],
         [ 0.        ,  2.29348924],
         [ 3.        ,  0.64596748],
         [ 2.        ,  1.72819697],
         [ 1.        ,  0.60909593],
         [ 0.        ,  2.51695402],
         [ 3.        ,  0.13871642],
         [ 2.        ,  9.12853034],
         [ 2.        , 10.63785781],
         [ 0.        ,  2.39726914],
         [ 3.        ,  3.1024236 ],
         [ 2.        ,  0.40704464],
         [ 1.        ,  0.49023594],
         [ 0.        ,  0.13870613],
         [ 3.        ,  0.510241  ],
         [ 2.        ,  0.9939764 ],

In [17]:
def biKmeans(x, k):
    #第一列是结果,第二列是距离
    pred = zeros((N, 2))

    #所有x的均值
    mean_x = mean(x, axis=0).tolist()[0]

    #中心点,初始化为1个,就是所有x的中心点
    cents = [mean_x]

    #计算所有点和这个中心点的距离,初始化pred
    for i in range(N):
        pred[i, 1] = distance(mean_x, x[i])**2

    #只要不是每个点1个中心,循环就不停止
    while (len(cents) < k):

        min_d = inf

        #遍历所有中心点,第一次的时候只有1个中心点
        for ki in range(len(cents)):

            #取属于这个中心点的x,第一次的时候是所有x
            x_ki = x[pred[:, 0] == ki]

            #调用k均值分为两类
            #cents_ki=2个新的中心点
            #pred_ki 第一列是分类,第二列是距离
            cents_ki, pred_ki = kMeans(x_ki, 2)

            #2分类后的距离求和,不论是分到哪一类的
            distance_ki = sum(pred_ki[:, 1])

            #不属于这个中心点的x,距离求和,第一次应该是0
            distance_not_ki = sum(pred[pred[:, 0] != ki, 1])

            #尝试把1个中心点,拆分为2个中心点,并且求哪一个中心点拆分后的总体距离是最小的
            if (distance_ki + distance_not_ki) < min_d:
                min_ki = ki
                min_cents_ki = cents_ki
                min_pred_ki = pred_ki.copy()
                min_d = distance_ki + distance_not_ki

        #第一列是分类,第二列是距离
        #被分为第二类的,设置为新的中心点
        #被分为第一类的,代替原来被拆分的中心点
        min_pred_ki[min_pred_ki[:, 0] == 1, 0] = len(cents)
        min_pred_ki[min_pred_ki[:, 0] == 0, 0] = min_ki

        #min_cents_ki=2个新的中心点
        cents[min_ki] = min_cents_ki[0].tolist()[0]
        cents.append(min_cents_ki[1].tolist()[0])
        pred[pred[:, 0] == min_ki] = min_pred_ki

    return mat(cents), pred


x = datMat
N = len(x)
distance = distEclud
biKmeans(x, 4)

[[ 0.38994031  4.11754519]
 [-4.88010218 -3.50275644]]
[[ 0.71260382  1.97554384]
 [-1.46397493 -3.14776987]]
[[ 0.49695427  2.46978598]
 [-0.8757714  -3.05132209]]
[[ 0.30731902  2.68529874]
 [-0.58118311 -3.00334459]]
[[ 0.18713124  2.8560699 ]
 [-0.40926764 -2.89114795]]
[[ 0.08249337  2.94802785]
 [-0.2897198  -2.83942545]]


IndexError: too many indices for array: array is 2-dimensional, but 3 were indexed

In [24]:
import urllib
import json


def geoGrab(stAddress, city):
    apiStem = 'http://where.yahooapis.com/geocode?'
    params = {}
    params['flags'] = 'J'
    params['appid'] = 'aaa0VN6k'
    params['location'] = '%s %s' % (stAddress, city)
    url_params = urllib.parse.urlencode(params)
    yahooApi = apiStem + url_params
    print(yahooApi)
    c = urllib.request.urlopen(yahooApi)
    return json.loads(c.read())


from time import sleep


def massPlaceFind(fileName):
    fw = open('places.txt', 'w')
    for line in open(fileName).readlines():
        line = line.strip()
        lineArr = line.split('\t')
        retDict = geoGrab(lineArr[1], lineArr[2])
        if retDict['ResultSet']['Error'] == 0:
            lat = float(retDict['ResultSet']['Results'][0]['latitude'])
            lng = float(retDict['ResultSet']['Results'][0]['longitude'])
            print("%s\t%f\t%f" % (lineArr[0], lat, lng))
            fw.write('%s\t%f\t%f\n' % (line, lat, lng))
        else:
            print("error fetching")
        sleep(1)
    fw.close()


geoGrab('l VA Center', 'Augusta, ME')

http://where.yahooapis.com/geocode?flags=J&appid=aaa0VN6k&location=l+VA+Center+Augusta%2C+ME


URLError: <urlopen error [Errno -2] Name or service not known>