# Configuración final del modelo y del dataset

In [11]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
import pandas as pd
from sklearn import preprocessing
from kmodes.kprototypes import KPrototypes
import numpy as np
import pickle
from sklearn.cluster import KMeans

#### Lectura del dataset

In [2]:
# Relative path of the CSV file that is loaded as the input dataset.
FILE_PATH = "../Data/numerical_2.csv"

In [3]:
# Load the dataset as UTF-8, skipping malformed rows instead of aborting.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is its replacement and keeps the same behaviour.
df = pd.read_csv(FILE_PATH, encoding='utf-8', on_bad_lines='skip')

In [4]:
# Show the freshly loaded DataFrame as the cell output.
df

Unnamed: 0,brand,model,OS,RAM,approx_price_EUR,battery_removable,battery_type,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb
0,Acer,Iconia Talk S,Android,2.000,170.0,False,Li-Ion,3400.0,4,1.30,13.0,2.0,32.0
1,Acer,Liquid Z6 Plus,Android,3.000,250.0,True,Li-Po,4080.0,8,1.30,13.0,5.0,32.0
2,Acer,Liquid Z6,Android,1.000,120.0,True,Li-Ion,2000.0,4,1.25,8.0,2.0,8.0
3,Acer,Liquid X2,Android,3.000,230.0,True,Li-Po,4020.0,8,1.30,13.0,13.0,32.0
4,Acer,Liquid Zest Plus,Android,2.000,200.0,False,Li-Ion,5000.0,4,1.30,13.0,5.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,ZTE,Grand X LTE T82,Android,1.000,340.0,True,Li-Ion,1900.0,2,1.50,8.0,1.3,4.0
1877,ZTE,Grand X V970,Android,1.000,230.0,True,Li-Ion,1600.0,1,1.00,5.0,0.6,4.0
1878,ZTE,Optik,Android,1.000,230.0,False,Li-Ion,4000.0,2,1.20,5.0,2.0,16.0
1879,ZTE,Light Tab 2 V9A,Android,0.512,250.0,False,Li-Ion,3400.0,1,1.40,3.2,0.6,4.0


#### Codificación de variables categóricas

In [5]:
# Categorical columns that get replaced by integer codes.
columns = ['brand', 'OS', 'battery_removable', 'battery_type']

# One independent LabelEncoder per column; each stays in its own variable
# because the fitted encoders are persisted at the end of the notebook.
(
    brand_encoder,
    os_encoder,
    battery_removable_encoder,
    battery_type_encoder,
) = (preprocessing.LabelEncoder() for _ in columns)

# Fit every encoder on its column. Each result is a one-column frame that
# keeps the source column name and the row index of df.
brand_df = df[['brand']].apply(brand_encoder.fit_transform)
os_df = df[['OS']].apply(os_encoder.fit_transform)
battery_removable_df = df[['battery_removable']].apply(
    battery_removable_encoder.fit_transform
)
battery_type_df = df[['battery_type']].apply(battery_type_encoder.fit_transform)

# Swap the raw categorical columns for their encoded versions; the encoded
# columns are appended after the remaining ones, in this order.
df = (
    df.drop(columns=columns)
    .join(brand_df)
    .join(os_df)
    .join(battery_removable_df)
    .join(battery_type_df)
)
df

Unnamed: 0,model,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb,brand,OS,battery_removable,battery_type
0,Iconia Talk S,2.000,170.0,3400.0,4,1.30,13.0,2.0,32.0,0,0,0,0
1,Liquid Z6 Plus,3.000,250.0,4080.0,8,1.30,13.0,5.0,32.0,0,0,1,1
2,Liquid Z6,1.000,120.0,2000.0,4,1.25,8.0,2.0,8.0,0,0,1,0
3,Liquid X2,3.000,230.0,4020.0,8,1.30,13.0,13.0,32.0,0,0,1,1
4,Liquid Zest Plus,2.000,200.0,5000.0,4,1.30,13.0,5.0,16.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,Grand X LTE T82,1.000,340.0,1900.0,2,1.50,8.0,1.3,4.0,56,0,1,0
1877,Grand X V970,1.000,230.0,1600.0,1,1.00,5.0,0.6,4.0,56,0,1,0
1878,Optik,1.000,230.0,4000.0,2,1.20,5.0,2.0,16.0,56,0,0,0
1879,Light Tab 2 V9A,0.512,250.0,3400.0,1,1.40,3.2,0.6,4.0,56,0,0,0


#### Obtención de datos para entrenamiento

In [6]:
# Columns used as model input, in the positional order the model receives them.
features = [
    'RAM',
    'approx_price_EUR',
    'battery_mah',
    'CPU_cores',
    'CPU_speed',
    'internal_memory_gb',
    'brand',
    'OS',
    'battery_removable',
    'battery_type',
    'primary_camera_mp',
    'secondary_camera_mp',
]

In [7]:
# Training matrix: only the feature columns, in the order given by `features`.
x = df[features]

In [8]:
# Show the training matrix as the cell output.
x

Unnamed: 0,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,internal_memory_gb,brand,OS,battery_removable,battery_type,primary_camera_mp,secondary_camera_mp
0,2.000,170.0,3400.0,4,1.30,32.0,0,0,0,0,13.0,2.0
1,3.000,250.0,4080.0,8,1.30,32.0,0,0,1,1,13.0,5.0
2,1.000,120.0,2000.0,4,1.25,8.0,0,0,1,0,8.0,2.0
3,3.000,230.0,4020.0,8,1.30,32.0,0,0,1,1,13.0,13.0
4,2.000,200.0,5000.0,4,1.30,16.0,0,0,0,0,13.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1876,1.000,340.0,1900.0,2,1.50,4.0,56,0,1,0,8.0,1.3
1877,1.000,230.0,1600.0,1,1.00,4.0,56,0,1,0,5.0,0.6
1878,1.000,230.0,4000.0,2,1.20,16.0,56,0,0,0,5.0,2.0
1879,0.512,250.0,3400.0,1,1.40,4.0,56,0,0,0,3.2,0.6


## categorical_features = [6,7,8,9]

In [9]:
#x = x.values

#### Creación del modelo

In [9]:
# Positions inside `features` of the label-encoded categorical columns
# (brand, OS, battery_removable, battery_type).
# NOTE(review): nothing in the visible cells reads this list; it looks like a
# leftover from a KPrototypes experiment - confirm before removing.
categorical_features = [6, 7, 8, 9]

In [12]:
# KMeans estimator with 4 clusters; the fixed random_state makes the
# centroid initialisation reproducible between runs.
kmeans = KMeans(
    n_clusters=4,
    random_state=42,
)

In [13]:
# Fit the KMeans model on the feature matrix (the fitted estimator is the
# cell output).
# NOTE(review): x is passed unscaled and contains label-encoded categorical
# columns. KMeans works on Euclidean distances, so large-magnitude columns
# such as battery_mah will likely dominate and the integer codes of `brand`
# are treated as ordered numbers - consider scaling / one-hot encoding.
kmeans.fit(x)

KMeans(n_clusters=4, random_state=42)

In [14]:
# Silhouette score of the fitted clustering, rounded to 4 decimals and
# reported on stdout.
score = silhouette_score(x, kmeans.labels_).round(4)
print(f"silhouette_score = {score}")

silhouette_score = 0.5899


In [15]:
# Cluster index predicted by the fitted model for every row of x.
clusters = kmeans.predict(x)

In [16]:
# Attach the predicted cluster index to the full dataset as a new column.
df['cluster'] = clusters

In [17]:
# Show the dataset with its new `cluster` column as the cell output.
df

Unnamed: 0,model,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb,brand,OS,battery_removable,battery_type,cluster
0,Iconia Talk S,2.000,170.0,3400.0,4,1.30,13.0,2.0,32.0,0,0,0,0,2
1,Liquid Z6 Plus,3.000,250.0,4080.0,8,1.30,13.0,5.0,32.0,0,0,1,1,2
2,Liquid Z6,1.000,120.0,2000.0,4,1.25,8.0,2.0,8.0,0,0,1,0,0
3,Liquid X2,3.000,230.0,4020.0,8,1.30,13.0,13.0,32.0,0,0,1,1,2
4,Liquid Zest Plus,2.000,200.0,5000.0,4,1.30,13.0,5.0,16.0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,Grand X LTE T82,1.000,340.0,1900.0,2,1.50,8.0,1.3,4.0,56,0,1,0,0
1877,Grand X V970,1.000,230.0,1600.0,1,1.00,5.0,0.6,4.0,56,0,1,0,0
1878,Optik,1.000,230.0,4000.0,2,1.20,5.0,2.0,16.0,56,0,0,0,2
1879,Light Tab 2 V9A,0.512,250.0,3400.0,1,1.40,3.2,0.6,4.0,56,0,0,0,2


#### Almacenamiento del dataset con clusters

In [18]:
# Write the encoded dataset, including the `cluster` column, to CSV.
# index=False keeps the pandas row index out of the file.
df.to_csv('../Data/final_datasetKmeans.csv', index=False)

In [18]:
# Show the dataset that was just written to disk.
# NOTE(review): the rendered output stored under this cell lists different
# cluster ids than the earlier preview of the same frame, so it appears to
# come from a different execution - re-run the notebook to refresh it.
df

Unnamed: 0,model,RAM,approx_price_EUR,battery_mah,CPU_cores,CPU_speed,primary_camera_mp,secondary_camera_mp,internal_memory_gb,brand,OS,battery_removable,battery_type,cluster
0,Iconia Talk S,2.000,170.0,3400.0,4,1.30,13.0,2.0,32.0,0,0,0,0,3
1,Liquid Z6 Plus,3.000,250.0,4080.0,8,1.30,13.0,5.0,32.0,0,0,1,1,3
2,Liquid Z6,1.000,120.0,2000.0,4,1.25,8.0,2.0,8.0,0,0,1,0,1
3,Liquid X2,3.000,230.0,4020.0,8,1.30,13.0,13.0,32.0,0,0,1,1,3
4,Liquid Zest Plus,2.000,200.0,5000.0,4,1.30,13.0,5.0,16.0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1876,Grand X LTE T82,1.000,340.0,1900.0,2,1.50,8.0,1.3,4.0,56,0,1,0,1
1877,Grand X V970,1.000,230.0,1600.0,1,1.00,5.0,0.6,4.0,56,0,1,0,1
1878,Optik,1.000,230.0,4000.0,2,1.20,5.0,2.0,16.0,56,0,0,0,3
1879,Light Tab 2 V9A,0.512,250.0,3400.0,1,1.40,3.2,0.6,4.0,56,0,0,0,3


#### Almacenamiento del modelo

In [19]:
# Destination file for the pickled clustering model.
CLUSTERING_MODEL_PATH = '../Models/clustering_model.pickle'

In [20]:
# Persist the fitted clustering model.
# The previous code dumped `kprototypes`, a name that is never defined in this
# notebook (NameError on a clean run); the estimator fitted above is `kmeans`.
# The `with` block guarantees the file is flushed and closed.
with open(CLUSTERING_MODEL_PATH, 'wb') as model_file:
    pickle.dump(kmeans, model_file)

#### Almacenamiento de encoders

In [21]:
# Persist every fitted label encoder twice: the whole encoder as a pickle and
# its `classes_` array as a .npy file, both under ../Models/.
# Files are opened through `with` so each handle is flushed and closed; the
# previous code left four open file objects behind.
encoders_to_save = {
    'brand_encoder': brand_encoder,
    'os_encoder': os_encoder,
    'battery_removable_encoder': battery_removable_encoder,
    'battery_type_encoder': battery_type_encoder,
}

for encoder_name, encoder in encoders_to_save.items():
    with open(f'../Models/{encoder_name}.pickle', 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)
    np.save(f'../Models/{encoder_name}.npy', encoder.classes_)