# Clustering Testing Data

### read data from csv

In [1]:
import pandas as pd
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.externals import joblib

In [2]:
df = pd.read_csv('persona_clustering.csv', skiprows=100)

In [3]:
df.shape

(899, 9)

In [4]:
df.head()

Unnamed: 0,49,885779,396124,78799,433192,4,33,42,34
0,15,481550,192804,372743,455864,14,11,18,15
1,1,322903,144543,371111,312442,0,41,27,10
2,28,272502,484961,469271,393664,24,39,47,48
3,63,35813,177734,11623,498100,47,32,20,24
4,21,344523,111139,442844,461454,3,23,44,22


### check are there any null value

In [5]:
print(df.isna().sum())

49        0
885779    0
396124    0
78799     0
433192    0
4         0
33        0
42        0
34        0
dtype: int64


### replace null value with 0

In [6]:
df.fillna(0, inplace=True)

### identify min max data

In [7]:
df.describe()

Unnamed: 0,49,885779,396124,78799,433192,4,33,42,34
count,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0,899.0
mean,48.177976,483494.46941,247527.017798,249751.424917,255495.844271,25.330367,25.064516,25.85762,25.026696
std,28.258856,288617.08727,142269.493565,148528.527991,146071.771472,14.770813,14.610239,14.73197,14.786354
min,1.0,54.0,1300.0,713.0,2114.0,0.0,0.0,0.0,0.0
25%,23.0,226578.0,126838.5,115104.0,126419.5,12.0,13.0,13.0,12.0
50%,47.0,491487.0,244220.0,251357.0,254235.0,25.0,25.0,27.0,24.0
75%,71.0,733718.0,365692.5,382152.5,383652.5,38.0,37.0,37.5,38.0
max,100.0,999537.0,499514.0,499553.0,499362.0,50.0,50.0,50.0,50.0


### remove outlier using z-score

In [8]:
z = np.abs(stats.zscore(df))

In [9]:
z[:5]

array([[1.17472709, 0.00674094, 0.38485748, 0.82852795, 1.37247386,
        0.76750506, 0.96318376, 0.53366886, 0.67848218],
       [1.67042277, 0.55672678, 0.72426871, 0.81753405, 0.39006752,
        1.71584775, 1.09131371, 0.07758747, 1.01682004],
       [0.71443824, 0.73145328, 1.66983196, 1.47878506, 0.94641876,
        0.09011742, 0.95434721, 1.43593488, 1.55454765],
       [0.52480097, 1.55198942, 0.49084217, 1.60414285, 1.66178034,
        1.46787415, 0.47496447, 0.39783412, 0.06947405],
       [0.96228608, 0.48177614, 0.95919324, 1.3007606 , 1.41076402,
        1.51263146, 0.14138477, 1.23218277, 0.20480919]])

In [10]:
z_upper_3 = np.where(z>3)

In [11]:
row_drop = z_upper_3[0]

In [12]:
len(row_drop)

0

In [13]:
len(set(row_drop))

0

In [14]:
df_no_outlier = df.drop(row_drop)

In [15]:
df_no_outlier.shape

(899, 9)

In [16]:
df_no_outlier.head()

Unnamed: 0,49,885779,396124,78799,433192,4,33,42,34
0,15,481550,192804,372743,455864,14,11,18,15
1,1,322903,144543,371111,312442,0,41,27,10
2,28,272502,484961,469271,393664,24,39,47,48
3,63,35813,177734,11623,498100,47,32,20,24
4,21,344523,111139,442844,461454,3,23,44,22


### Normalize the data (0-1)

In [17]:
df_scaled = MinMaxScaler().fit_transform(df_no_outlier)

  return self.partial_fit(X, y)


In [18]:
df_scaled[:5]

array([[0.14141414, 0.48174506, 0.38438101, 0.74579023, 0.91252252,
        0.28      , 0.22      , 0.36      , 0.3       ],
       [0.        , 0.323016  , 0.287513  , 0.74251864, 0.624091  ,
        0.        , 0.82      , 0.54      , 0.2       ],
       [0.27272727, 0.27258893, 0.97078966, 0.93929516, 0.78743404,
        0.48      , 0.78      , 0.94      , 0.96      ],
       [0.62626263, 0.0357775 , 0.35413296, 0.02187074, 0.99746203,
        0.94      , 0.64      , 0.4       , 0.48      ],
       [0.2020202 , 0.34464718, 0.2204655 , 0.88631826, 0.9237644 ,
        0.06      , 0.46      , 0.88      , 0.44      ]])

### testing using kmeans model

#### load model

In [19]:
kmeans = joblib.load('model_cluster.pkl')

In [23]:
predict_clusters = kmeans.predict(df_scaled)

In [24]:
predict_clusters

array([0, 6, 3, 0, 6, 1, 8, 0, 2, 6, 2, 5, 4, 1, 8, 6, 9, 1, 9, 0, 3, 9,
       1, 6, 5, 6, 1, 3, 5, 4, 2, 0, 2, 4, 2, 1, 5, 6, 3, 3, 5, 6, 8, 8,
       9, 5, 9, 7, 3, 4, 4, 6, 0, 5, 4, 1, 0, 6, 7, 4, 6, 2, 5, 6, 4, 1,
       7, 4, 5, 0, 3, 5, 1, 1, 5, 5, 3, 0, 7, 3, 9, 6, 9, 0, 3, 4, 8, 5,
       3, 2, 9, 7, 5, 5, 4, 5, 5, 0, 2, 5, 9, 0, 5, 3, 1, 0, 5, 6, 6, 5,
       2, 3, 8, 5, 4, 1, 4, 4, 4, 5, 1, 2, 1, 7, 5, 2, 2, 3, 6, 8, 6, 8,
       7, 6, 6, 9, 9, 3, 6, 9, 2, 4, 3, 3, 8, 6, 9, 7, 3, 3, 3, 5, 5, 7,
       2, 3, 5, 5, 9, 0, 8, 3, 1, 3, 3, 2, 2, 5, 4, 8, 5, 3, 3, 0, 9, 3,
       5, 6, 3, 0, 6, 5, 5, 3, 9, 6, 3, 2, 4, 1, 4, 8, 5, 0, 6, 4, 6, 5,
       5, 2, 3, 1, 5, 2, 3, 2, 4, 9, 4, 8, 2, 1, 2, 9, 4, 5, 4, 2, 3, 8,
       9, 6, 6, 6, 5, 3, 5, 5, 4, 1, 5, 7, 2, 6, 5, 2, 2, 0, 5, 5, 8, 6,
       0, 5, 0, 9, 8, 6, 0, 3, 2, 3, 5, 3, 5, 3, 5, 2, 0, 8, 5, 2, 0, 9,
       4, 4, 9, 1, 1, 4, 2, 5, 8, 2, 3, 0, 2, 5, 4, 5, 5, 7, 4, 5, 5, 1,
       3, 0, 0, 7, 6, 0, 1, 5, 5, 9, 5, 0, 5, 7, 8,

In [25]:
unique_elements, counts_elements = np.unique(predict_clusters, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[  0   1   2   3   4   5   6   7   8   9]
 [ 79  91 101 130  82 148  85  42  59  82]]
