# Clustering

In [1]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn import preprocessing

### Lectura del dataset preprocesado

In [2]:
# Relative path of the CSV file that the next cell loads with pd.read_csv.
FILE_PATH = "../Data/numerical.csv"

In [3]:
# Load the dataset from FILE_PATH into a DataFrame.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0, where
# passing it raises TypeError. `on_bad_lines='warn'` (pandas >= 1.3) reproduces the
# previous behaviour: malformed rows are dropped and a warning is emitted for each.
df = pd.read_csv(FILE_PATH, encoding='utf-8', on_bad_lines='warn')

In [4]:
# Show the freshly loaded DataFrame (the notebook renders the cell's last expression).
df

Unnamed: 0,brand,model,OS,RAM,approx_price_EUR,battery_removable,battery_type,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb
0,Acer,Iconia Talk S,Android,2,170.0,False,Li-Ion,3400.0,4,1.30,13,2,32
1,Acer,Liquid Z6 Plus,Android,3,250.0,True,Li-Po,4080.0,8,1.30,13,5,32
2,Acer,Liquid Z6,Android,1,120.0,True,Li-Ion,2000.0,4,1.25,8,2,8
3,Acer,Iconia Tab 10 A3-A40,Android,2,230.0,False,Li-Ion,,4,1.30,5,2,64
4,Acer,Liquid X2,Android,3,230.0,True,Li-Po,4020.0,8,1.30,13,13,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2485,ZTE,Optik,Android,1,230.0,False,Li-Ion,4000.0,2,1.20,5,2,16
2486,ZTE,Light Tab 2 V9A,Android,512,250.0,False,Li-Ion,3400.0,1,1.40,3.2,VGA,4
2487,ZTE,Tania,Microsoft,512,210.0,True,Li-Ion,1400.0,1,1.00,5,No,4
2488,ZTE,Blade,Android,512,170.0,True,Li-Ion,1250.0,1,0.60,3.15,No,512


### Eliminación de valores NULL

In [5]:
# Discard every row that contains at least one missing value, then renumber the
# index from 0 so that the removed rows leave no gaps in it.
df = df.dropna().reset_index(drop=True)

In [6]:
# Show the DataFrame after the rows with missing values were removed.
df

Unnamed: 0,brand,model,OS,RAM,approx_price_EUR,battery_removable,battery_type,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb
0,Acer,Iconia Talk S,Android,2,170.0,False,Li-Ion,3400.0,4,1.30,13,2,32
1,Acer,Liquid Z6 Plus,Android,3,250.0,True,Li-Po,4080.0,8,1.30,13,5,32
2,Acer,Liquid Z6,Android,1,120.0,True,Li-Ion,2000.0,4,1.25,8,2,8
3,Acer,Liquid X2,Android,3,230.0,True,Li-Po,4020.0,8,1.30,13,13,32
4,Acer,Liquid Zest Plus,Android,2,200.0,False,Li-Ion,5000.0,4,1.30,13,5,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,ZTE,Optik,Android,1,230.0,False,Li-Ion,4000.0,2,1.20,5,2,16
2268,ZTE,Light Tab 2 V9A,Android,512,250.0,False,Li-Ion,3400.0,1,1.40,3.2,VGA,4
2269,ZTE,Tania,Microsoft,512,210.0,True,Li-Ion,1400.0,1,1.00,5,No,4
2270,ZTE,Blade,Android,512,170.0,True,Li-Ion,1250.0,1,0.60,3.15,No,512


### Conversión de columnas a valores numéricos

In [7]:
# Columns that are replaced by integer category codes.
columns = ['brand', 'OS', 'battery_removable', 'battery_type', 'primary_camera_mp']

# A single LabelEncoder is re-fitted on each column in turn, so once this cell
# has run it only holds the class mapping of the last column in `columns`.
encoder = preprocessing.LabelEncoder()

# NOTE(review): 'primary_camera_mp' looks numeric, but label encoding replaces the
# values with arbitrary category codes (e.g. 13 -> 8 and 8 -> 34 in the output
# below), so their magnitude and ordering are lost — confirm this is intended.
df_encoded = pd.DataFrame(
    {name: encoder.fit_transform(df[name]) for name in columns},
    index=df.index,
)
df_encoded

Unnamed: 0,brand,OS,battery_removable,battery_type,primary_camera_mp
0,0,1,0,0,8
1,0,1,1,1,8
2,0,1,1,0,34
3,0,1,1,1,8
4,0,1,0,0,8
...,...,...,...,...,...
2267,60,1,0,0,32
2268,60,1,0,0,29
2269,60,10,1,0,32
2270,60,1,1,0,28


In [8]:
# Remove the original (non-encoded) versions of the columns listed in `columns`;
# their encoded counterparts live in `df_encoded`.
df = df.drop(columns=columns)
df

Unnamed: 0,model,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,secondary_camera_mp,internal_memory_gb
0,Iconia Talk S,2,170.0,3400.0,4,1.30,2,32
1,Liquid Z6 Plus,3,250.0,4080.0,8,1.30,5,32
2,Liquid Z6,1,120.0,2000.0,4,1.25,2,8
3,Liquid X2,3,230.0,4020.0,8,1.30,13,32
4,Liquid Zest Plus,2,200.0,5000.0,4,1.30,5,16
...,...,...,...,...,...,...,...,...
2267,Optik,1,230.0,4000.0,2,1.20,2,16
2268,Light Tab 2 V9A,512,250.0,3400.0,1,1.40,VGA,4
2269,Tania,512,210.0,1400.0,1,1.00,No,4
2270,Blade,512,170.0,1250.0,1,0.60,No,512


In [9]:
# Append the encoded columns to the remaining ones, aligning rows on the index
# (left join, which is DataFrame.join's default).
df = df.join(other=df_encoded, how='left')
df

Unnamed: 0,model,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,secondary_camera_mp,internal_memory_gb,brand,OS,battery_removable,battery_type,primary_camera_mp
0,Iconia Talk S,2,170.0,3400.0,4,1.30,2,32,0,1,0,0,8
1,Liquid Z6 Plus,3,250.0,4080.0,8,1.30,5,32,0,1,1,1,8
2,Liquid Z6,1,120.0,2000.0,4,1.25,2,8,0,1,1,0,34
3,Liquid X2,3,230.0,4020.0,8,1.30,13,32,0,1,1,1,8
4,Liquid Zest Plus,2,200.0,5000.0,4,1.30,5,16,0,1,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,Optik,1,230.0,4000.0,2,1.20,2,16,60,1,0,0,32
2268,Light Tab 2 V9A,512,250.0,3400.0,1,1.40,VGA,4,60,1,0,0,29
2269,Tania,512,210.0,1400.0,1,1.00,No,4,60,10,1,0,32
2270,Blade,512,170.0,1250.0,1,0.60,No,512,60,1,1,0,28


### Obtención de características

In [None]:
# Names of the columns selected as features
# (presumably the inputs of the clustering step — confirm against later cells).
features = [
    'approx_price_EUR',
    'battery_mah',
    'CPU_cores',
]