In [1]:
import numpy as np
import pyodbc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display # Allows the use of display() for DataFrames
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from scipy.special import boxcox1p

# Import supplementary visualizations code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

pd.options.display.max_columns = 300
pd.options.display.max_colwidth = 30
pd.options.display.max_rows = 100

In [6]:
box_transform = pd.read_csv('box_transformed.csv', sep='\t', encoding='utf-8')
reduced_data = pd.read_csv('reduced_data.csv', sep='\t', encoding='utf-8')

In [4]:
type(box_transform)

pandas.core.frame.DataFrame

In [9]:
box_transform.head(5)

Unnamed: 0.1,Unnamed: 0,DTC,ROOFSYSTEM,DED,AUTOLIM,BUILDLIM,OSTRUCLIM,CONTLIM,TELIM,TOTLIM,TIV,AUTOVAL,BUILDVAL,OSTRUCVAL,CONTVAL,TEVAL,PREMIUM,GROUNDUPLOSS,GROSSLOSS,PRECATLOSS
0,0,0.0,0.0,14.924283,0.0,110.003842,0.0,0.0,0.0,110.003842,110.003842,0.0,110.003842,0.0,0.0,0.0,28.357254,9.806826,9.537913,9.579764
1,1,0.0,0.0,36.001,0.0,193.231826,0.0,87.372251,0.0,195.464864,195.464864,0.0,193.231826,0.0,87.372251,0.0,30.919179,20.180409,19.615099,17.939086
2,2,0.0,0.0,29.637538,0.0,91.854996,0.0,39.378386,0.0,92.844548,92.844548,0.0,91.854996,0.0,39.378386,0.0,15.335622,7.288878,6.229872,6.253107
3,3,0.0,0.0,0.0,0.0,0.0,0.0,52.919176,0.0,52.919176,52.919176,0.0,0.0,0.0,52.919176,0.0,14.895889,0.665014,0.665014,0.665014
4,4,0.0,0.0,0.0,0.0,0.0,0.0,22.752955,0.0,22.752955,22.752955,0.0,0.0,0.0,22.752955,0.0,9.401476,0.105733,0.105733,0.105733


In [10]:
# Apply Principal Component Analysis (PCA) on transformed data and select only 2 principal components
pca_2d = PCA(n_components=2).fit(box_transform)
# Apply PCA transform on the transformed data fit
reduced_data = pca_2d.transform(box_transform)
# Create dataframe for the transformed data
reduced_data = pd.DataFrame(reduced_data, columns=['Dimension 1', 'Dimension 2'])

In [11]:
reduced_data.head(5)

Unnamed: 0,Dimension 1,Dimension 2
0,49999.691449,80.095131
1,49998.648079,263.912232
2,49997.728443,79.141887
3,49996.822097,-11.318129
4,49995.811742,-52.671921


In [12]:
# Apply your clustering algorithm of choice to the reduced data 
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

score_table = {}
for num_clusters in range(2,16):
    clusterer = GaussianMixture(n_components=num_clusters).fit(reduced_data)
    # Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)
    # Find the cluster centers
    centers = clusterer.means_
    # Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(reduced_data, preds)
    score_table[num_clusters] = score
    print "Number of clusters: {}, Silhouette coefficient: {}".format(num_clusters, score)

MemoryError: 

In [20]:
preds[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [17]:
reduced_data[:5]

Unnamed: 0,Dimension 1,Dimension 2
0,49999.691449,80.095131
1,49998.648079,263.912232
2,49997.728443,79.141887
3,49996.822097,-11.318129
4,49995.811742,-52.671921


In [13]:
box_transform.describe()

Unnamed: 0.1,Unnamed: 0,DTC,ROOFSYSTEM,DED,AUTOLIM,BUILDLIM,OSTRUCLIM,CONTLIM,TELIM,TOTLIM,TIV,AUTOVAL,BUILDVAL,OSTRUCVAL,CONTVAL,TEVAL,PREMIUM,GROUNDUPLOSS,GROSSLOSS,PRECATLOSS
count,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0
mean,49999.790978,1.570511,0.932452,6.090671,12.330106,26.473871,13.424657,41.753645,17.117612,63.432586,65.062272,12.330106,27.959872,14.260273,43.08477,17.117612,14.073034,6.481848,5.953908,5.951544
std,28867.655488,3.186724,1.288144,8.575767,17.084131,41.772725,21.78069,36.878701,26.068529,33.720254,35.744443,17.084131,44.02623,23.131338,38.246364,26.068529,6.914899,5.000714,4.868068,4.858779
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.532493,9.532493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25000.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.001,36.001,0.0,0.0,0.0,0.0,0.0,8.553156,2.643641,2.299104,2.299104
50%,50000.0,0.0,0.0,0.0,0.0,0.0,0.0,42.962813,0.0,46.871387,47.016605,0.0,0.0,0.0,42.962813,0.0,12.661021,5.122247,4.378527,4.378527
75%,74999.5,0.0,2.727171,11.921296,36.001,75.032723,39.225756,73.700355,46.651199,93.87061,97.860973,36.001,79.512015,41.705505,76.981899,46.651199,18.571971,9.857128,9.403337,9.4044
max,99999.0,15.901653,2.727171,112.504823,36.001,294.578794,105.633945,175.78317,126.377439,301.005089,301.005089,36.001,294.578794,111.923773,180.501434,126.377439,79.969749,47.398082,44.420123,43.713304


In [14]:
reduced_data.describe()

Unnamed: 0,Dimension 1,Dimension 2
count,99999.0,99999.0
mean,-6.913742e-11,1.128112e-14
std,28867.67,97.55478
min,-49999.25,-111.791
25%,-24999.73,-90.43538
50%,-0.1935257,-28.39684
75%,24999.34,90.57442
max,49999.69,466.9925
