In [1]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import metrics

In [2]:
# Load the tab-separated score table as a float array.
X = np.genfromtxt('./tables/svdtable.txt', delimiter='\t')
# Truncated SVD with 21 components; the fixed random_state keeps the
# randomized solver repeatable between runs.
svd = TruncatedSVD(n_components=21, n_iter=7, random_state=42)
# fit() returns the estimator itself, so its repr is displayed as the cell output.
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=21, n_iter=7,
       random_state=42, tol=0.0)

In [3]:
# Keep the first 20 of the 21 fitted components; the last one is dropped as noise.
V = svd.components_[:20]
# Project every row of X onto the kept components (np.inner contracts the last
# axis of both operands, i.e. this equals X @ V.T).
user_pre = np.inner(X, V)
# Show the 20 projected coordinates of the first row.
user_pre[0]

array([ 5.5107933 , -2.71790497, -2.68114297,  1.86296766,  0.05759208,
       -1.97361137, -1.92632456, -0.8316969 ,  0.98801495,  0.08046625,
       -0.72573739, -0.167905  ,  0.64687978, -0.02839384,  0.01019411,
        0.03955321, -0.03060934, -0.53381717,  0.18704668, -0.14261981])

In [4]:
# Cluster the rows of the reduced representation into 81 groups.
# n_init is pinned to 10 (the long-standing scikit-learn default) because the
# default became 'auto' in scikit-learn 1.4, which runs a single k-means++
# initialisation and can therefore produce different labels for the same seed.
kmeans_model = KMeans(n_clusters=81, random_state=1, n_init=10).fit(user_pre)
# labels[i] is the cluster id assigned to row i of user_pre (same row order as X).
labels = kmeans_model.labels_

In [5]:
# Peek at the cluster ids assigned to the first ten rows.
labels[:10]

array([36, 46, 47, 46,  3, 45, 11, 16, 31,  7], dtype=int32)

In [6]:
# NOTE(review): at the top level of a cell `global` is a no-op, because X is
# already a module-level name. This cell has no effect and can be removed.
global X

In [7]:
# List the distinct cluster ids that were assigned (0..80 in the recorded run).
np.unique(labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80], dtype=int32)

In [8]:
# NOTE(review): at the top level of a cell `global` is a no-op, because labels
# is already a module-level name. This cell has no effect and can be removed.
global labels

## Fill Zeros
### Fill by the mean score of each cluster

In [9]:
def SelCluster(cluster_n):
    """Return the positions in the global ``labels`` array equal to ``cluster_n``.

    The result is a 1-D integer index array (empty when no entry matches),
    suitable for selecting the matching rows with ``X[...]``.
    """
    membership = labels == cluster_n
    return np.nonzero(membership)[0]

In [10]:
# Row indices of the members of cluster 36, used as a worked example in the cells below.
select = SelCluster(36)

In [11]:
# Indexing with an index array copies out just the rows of that cluster.
temp_X = X[select]

In [12]:
# First column of the cluster's rows.
temp_X[:,0]

array([ 4.25      ,  4.18181818,  3.41584158, ...,  4.99610136,
        3.73684211,  3.58823529])

In [13]:
# Positions within the first column whose value is non-zero.
notzero = np.where(temp_X[:,0]!=0)[0]

In [14]:
# The non-zero values of the first column.
temp_X[:,0][notzero]

array([ 4.25      ,  4.18181818,  3.41584158, ...,  4.99610136,
        3.73684211,  3.58823529])

In [15]:
# Sum of the non-zero values (numerator of the mean).
np.sum(temp_X[:,0][notzero])

14971.964350298636

In [16]:
# Number of non-zero values (denominator of the mean).
len(temp_X[:,0][notzero])

3807

In [17]:
# Mean over non-zero values only; the same computation CalMean performs.
np.sum(temp_X[:,0][notzero])/len(temp_X[:,0][notzero])

3.932746086235523

In [18]:
# Total number of rows in the cluster, to compare with the non-zero count
# (both are 3807 in the recorded run).
len(temp_X[:,0])

3807

In [19]:
# (rows in the cluster, columns per row); (3807, 216) in the recorded run.
temp_X.shape

(3807, 216)

In [20]:
# Last column (index 215 of 216); the entries displayed in the recorded run are all zero.
temp_X[:,215]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [21]:
def CalMean(temp_ind):
    """Return the mean of the non-zero entries of ``temp_ind``.

    Zeros are excluded from both the sum and the count, so they do not pull
    the average down.

    Parameters
    ----------
    temp_ind : array-like
        One-dimensional collection of scores, e.g. one column of a cluster's
        rows. Plain lists/tuples are accepted and converted with
        ``np.asarray``.

    Returns
    -------
    float or int
        Sum of the non-zero entries divided by how many there are, or the
        integer ``0`` when there is no non-zero entry (including empty input).
    """
    values = np.asarray(temp_ind)
    # Select the non-zero entries once instead of re-indexing for every use.
    nonzero_values = values[values != 0]
    if nonzero_values.size == 0:
        return 0
    return np.sum(nonzero_values) / nonzero_values.size

In [22]:
#calculate the mean non-zero score of each column over the rows of one cluster
def MeanWOZero(select):
    """Return the per-column mean of non-zero scores for the rows ``X[select]``.

    Parameters
    ----------
    select : array-like of int
        Row indices into the global ``X`` (e.g. the output of ``SelCluster``).

    Returns
    -------
    list
        One entry per column of ``X`` (a genre, per the original notes), each
        computed by ``CalMean`` on that column of the selected rows; ``0``
        where the column has no non-zero score among those rows.
    """
    cluster_rows = X[select]
    # Take the column count from the data rather than hard-coding 216, so a
    # table with a different width is neither truncated nor indexed out of range.
    n_columns = cluster_rows.shape[1]
    return [CalMean(cluster_rows[:, col]) for col in range(n_columns)]

In [23]:
# Rows of X for the currently selected cluster, displayed before any filling.
X[select]

array([[ 4.25      ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 4.18181818,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.41584158,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 4.99610136,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.73684211,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.58823529,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [24]:
# First row of the selected cluster; among the entries displayed in the
# recorded run only the first one is non-zero.
X[select][0]

array([ 4.25,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,

In [25]:
def FillZeroByMean(mean, select, ind):
    """Return row ``select[ind]`` of the global ``X`` with its zeros replaced.

    Parameters
    ----------
    mean : sequence of numbers
        Replacement value for each column; must have one entry per column of
        the row (e.g. the output of ``MeanWOZero`` for the same cluster).
    select : sequence of int
        Row indices into ``X`` for one cluster.
    ind : int
        Position within ``select`` of the row to fill.

    Returns
    -------
    numpy.ndarray
        A new array: the original value where it is non-zero, otherwise the
        corresponding entry of ``mean``. ``X`` itself is not modified.
    """
    row = X[select[ind]]
    # Vectorised element-wise choice instead of a Python loop over every column.
    return np.where(row != 0, row, np.asarray(mean))

In [26]:
# Recompute the example inputs for cluster 36: its row indices and its
# per-column means over non-zero scores.
select = SelCluster(36)
mean = MeanWOZero(select)

In [27]:
# Try the fill on the first member of cluster 36 (returns a new array; X is unchanged here).
FillZeroByMean(mean, select, 0)

array([ 4.25      ,  2.90942029,  0.        ,  0.        ,  3.91071429,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  2.54054054,
        2.33333333,  0.        ,  0.        ,  4.30952381,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.84027778,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.41084011,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  3.81164966,  0.        ,  0.        ,
        4.18484848,  4.33333333,  0.        ,  0.        ,  1.2359127 ,
        4.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        3.52571318,  0.        ,  1.        ,  0.        ,  2.76

In [28]:
# Shape of the cluster's block of rows.
X[select].shape

(3807, 216)

In [29]:
# Number of rows in the cluster.
X[select].shape[0]

3807

In [30]:
#X[select] = list(map(lambda ind: FillZeroByMean(mean, select, ind), np.arange(X[select].shape[0])))

In [31]:
# Same row count, read from the index array instead.
len(select)

3807

In [32]:
# Fill zeros with the per-cluster column means, one cluster at a time.
# The cluster count is read from the fitted model instead of repeating the
# literal 81, so this loop stays in sync with the KMeans cell.
# NOTE: rows of X are overwritten in place.
for i in range(kmeans_model.n_clusters):
    select = SelCluster(i)
    # Means are computed once, before any row of this cluster is modified.
    mean = MeanWOZero(select)
    for j in range(len(select)):
        X[select[j]] = FillZeroByMean(mean, select, j)
    print("Cluster: ", i)

Cluster:  0
Cluster:  1
Cluster:  2
Cluster:  3
Cluster:  4
Cluster:  5
Cluster:  6
Cluster:  7
Cluster:  8
Cluster:  9
Cluster:  10
Cluster:  11
Cluster:  12
Cluster:  13
Cluster:  14
Cluster:  15
Cluster:  16
Cluster:  17
Cluster:  18
Cluster:  19
Cluster:  20
Cluster:  21
Cluster:  22
Cluster:  23
Cluster:  24
Cluster:  25
Cluster:  26
Cluster:  27
Cluster:  28
Cluster:  29
Cluster:  30
Cluster:  31
Cluster:  32
Cluster:  33
Cluster:  34
Cluster:  35
Cluster:  36
Cluster:  37
Cluster:  38
Cluster:  39
Cluster:  40
Cluster:  41
Cluster:  42
Cluster:  43
Cluster:  44
Cluster:  45
Cluster:  46
Cluster:  47
Cluster:  48
Cluster:  49
Cluster:  50
Cluster:  51
Cluster:  52
Cluster:  53
Cluster:  54
Cluster:  55
Cluster:  56
Cluster:  57
Cluster:  58
Cluster:  59
Cluster:  60
Cluster:  61
Cluster:  62
Cluster:  63
Cluster:  64
Cluster:  65
Cluster:  66
Cluster:  67
Cluster:  68
Cluster:  69
Cluster:  70
Cluster:  71
Cluster:  72
Cluster:  73
Cluster:  74
Cluster:  75
Cluster:  76
Cluster: 

In [33]:
# Inspect row 0 after the in-place fill; a zero remains wherever the cluster's
# mean for that column is itself 0.
X[0]

array([ 4.25      ,  2.90942029,  0.        ,  0.        ,  3.91071429,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  2.54054054,
        2.33333333,  0.        ,  0.        ,  4.30952381,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.84027778,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.41084011,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  3.81164966,  0.        ,  0.        ,
        4.18484848,  4.33333333,  0.        ,  0.        ,  1.2359127 ,
        4.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        3.52571318,  0.        ,  1.        ,  0.        ,  2.76

In [34]:
# Save the mean-filled table (X as modified in place by the fill loop) as tab-separated text.
np.savetxt("./tables/svdKmeansMean1.txt", X, delimiter="\t")

### Fill zeros by unknown score

In [37]:
#fill zeros with the row's "unknown" score
def FillZeroByUnknown(row):
    """Return a copy of ``row`` with every zero replaced by its first element.

    Per the original note, the first element of a row is the score of the
    "unknown" genre, which is used as the fallback for columns without a score.

    Parameters
    ----------
    row : array-like
        One-dimensional sequence of scores.

    Returns
    -------
    numpy.ndarray
        New array with the same length as ``row``. If the first element is
        itself 0 the zeros stay 0. Empty input yields an empty array.
    """
    scores = np.asarray(row)
    if scores.size == 0:
        # Nothing to fill, and there is no first element to read.
        return scores.copy()
    unknown = scores[0]
    # Vectorised element-wise choice instead of a Python loop over every column.
    return np.where(scores != 0, scores, unknown)

In [38]:
# Try the fill on row 0: each remaining zero becomes the row's first value
# (4.25 in the recorded run).
FillZeroByUnknown(X[0])

array([ 4.25      ,  2.90942029,  4.25      ,  4.25      ,  3.91071429,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  2.54054054,
        2.33333333,  4.25      ,  4.25      ,  4.30952381,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  1.84027778,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        1.41084011,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  3.81164966,  4.25      ,  4.25      ,
        4.18484848,  4.33333333,  4.25      ,  4.25      ,  1.2359127 ,
        4.        ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        3.52571318,  4.25      ,  1.        ,  4.25      ,  2.76

In [40]:
# Apply the "unknown"-score fill to every row of X, in row order.
# Y is a plain list holding one filled 1-D array per row.
Y = [FillZeroByUnknown(X[row_idx]) for row_idx in range(len(X))]

In [41]:
# First filled row.
Y[0]

array([ 4.25      ,  2.90942029,  4.25      ,  4.25      ,  3.91071429,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  2.54054054,
        2.33333333,  4.25      ,  4.25      ,  4.30952381,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  1.84027778,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        1.41084011,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  3.81164966,  4.25      ,  4.25      ,
        4.18484848,  4.33333333,  4.25      ,  4.25      ,  1.2359127 ,
        4.        ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        3.52571318,  4.25      ,  1.        ,  4.25      ,  2.76

In [43]:
# Save the table whose remaining zeros were filled with each row's first ("unknown") score.
np.savetxt("./tables/svdKmeansMean2.txt", Y, delimiter="\t")

In [44]:
# Read the first saved table back to check the round trip and show its first row.
Z = np.genfromtxt('./tables/svdKmeansMean1.txt', delimiter='\t')
Z[0]

array([ 4.25      ,  2.90942029,  0.        ,  0.        ,  3.91071429,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  2.54054054,
        2.33333333,  0.        ,  0.        ,  4.30952381,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.84027778,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.41084011,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  3.81164966,  0.        ,  0.        ,
        4.18484848,  4.33333333,  0.        ,  0.        ,  1.2359127 ,
        4.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        3.52571318,  0.        ,  1.        ,  0.        ,  2.76

In [45]:
# Read the second saved table back (Z is reused, replacing the previous
# contents) and show its first row.
Z = np.genfromtxt('./tables/svdKmeansMean2.txt', delimiter='\t')
Z[0]

array([ 4.25      ,  2.90942029,  4.25      ,  4.25      ,  3.91071429,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  2.54054054,
        2.33333333,  4.25      ,  4.25      ,  4.30952381,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  1.84027778,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        1.41084011,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  3.81164966,  4.25      ,  4.25      ,
        4.18484848,  4.33333333,  4.25      ,  4.25      ,  1.2359127 ,
        4.        ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        4.25      ,  4.25      ,  4.25      ,  4.25      ,  4.25      ,
        3.52571318,  4.25      ,  1.        ,  4.25      ,  2.76