# CLASSIFIER

In [30]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from openai import OpenAI
from ast import literal_eval

import pandas as pd
import numpy as np
import os
from pathlib import Path


### Leemos los datos y separamos para obtener 2 set de datos un train-test y otro de validacion

In [31]:
df1=pd.read_csv("D:\Clustering_MeLi\src\data\processed\data_cluster_m1.csv", sep="|")
df2=pd.read_csv("D:\Clustering_MeLi\src\data\processed\data_cluster_m2.csv", sep="|")
dfEmb1=pd.read_csv("D:\Clustering_MeLi\src\data\processed\Embedding_test_meli.csv")
dfEmb2=pd.read_csv("D:\Clustering_MeLi\src\data\processed\Embedding_test_meli_2.csv")



In [32]:
dfEmb1["embedding"].shape

(2000,)

In [33]:
dfEmb1["embedding"] = dfEmb1.embedding.apply(literal_eval).apply(np.array)
dfEmb2["embedding"] = dfEmb2.embedding.apply(literal_eval).apply(np.array)

In [34]:
df1 = pd.merge(left=dfEmb1,right=df1[["seller_nickname","cluster"]],how="left",on='seller_nickname')
df2 = pd.merge(left=dfEmb2,right=df2[["seller_nickname","cluster"]],how="left",on='seller_nickname')
print("df1: ",df1.shape)
print("df2: ",df2.shape)

df1:  (2000, 16)
df2:  (2000, 17)


In [35]:
print(df1.columns)
print(df2.columns)

Index(['Unnamed: 0', 'seller_nickname', 'products', 'stock', 'price',
       'seller_reputation', 'Lt', 'condition', 'isrefu', 'category_id',
       'regular_price1', 'pxp', 'mix', 'tokens', 'embedding', 'cluster'],
      dtype='object')
Index(['Unnamed: 0', 'seller_nickname', 'products', 'stock', 'price',
       'seller_reputation', 'Lt', 'condition', 'isrefu', 'category_id',
       'regular_price1', 'pxp', 'mix', 'tokens', 'totalcat', 'embedding',
       'cluster'],
      dtype='object')


In [36]:
df1x = df1[:-200]
df2x = df2[:-200]
print("df1x: ",df1x.shape)
print("df2x: ",df2x.shape)

df1x:  (1800, 16)
df2x:  (1800, 17)


In [37]:
df1x.index.values

array([   0,    1,    2, ..., 1797, 1798, 1799], shape=(1800,))

In [38]:
m1x=np.vstack(df1x["embedding"].values)
m2x=np.vstack(df2x["embedding"].values)
print("m1x: ",m1x.shape)
print("m2x: ",m2x.shape)

m1x:  (1800, 1536)
m2x:  (1800, 1536)


In [39]:
df1v = df1[-200:]
df2v = df2[-200:]
print("df1x: ",df1v.shape)
print("df2x: ",df2v.shape)

df1x:  (200, 16)
df2x:  (200, 17)


In [40]:
df1v.index.values

array([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,
       1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821,
       1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832,
       1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843,
       1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854,
       1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865,
       1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876,
       1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887,
       1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898,
       1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909,
       1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
       1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 19

In [41]:
m1v=np.vstack(df1v["embedding"].values)
m2v=np.vstack(df2v["embedding"].values)
print("m1v: ",m1v.shape)
print("m2v: ",m2v.shape)

m1v:  (200, 1536)
m2v:  (200, 1536)


### Preprocesamiento de datos

In [43]:
#conjunto de datos para train test
#

x_train,x_test,y_train,y_test = train_test_split((m1x),df1x.cluster, train_size=0.8, random_state= 13)

### Training y Test: M1

In [44]:
# RandomForest

RDFC = RandomForestClassifier(random_state=13)
RDFC.fit(x_train,y_train)
predict = RDFC.predict(x_test)
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       0.94      0.99      0.97       162
           2       1.00      0.89      0.94        36
           3       0.99      0.94      0.96        87
           4       1.00      1.00      1.00        25

    accuracy                           0.97       360
   macro avg       0.99      0.96      0.97       360
weighted avg       0.97      0.97      0.97       360



In [None]:
#con conjunto de Validacion
predict_val = RDFC.predict(m1v)
print(classification_report(df1v.cluster,predict_val))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        40
           1       0.97      1.00      0.98        84
           2       1.00      0.95      0.98        21
           3       0.95      0.95      0.95        40
           4       1.00      1.00      1.00        15

    accuracy                           0.97       200
   macro avg       0.98      0.97      0.98       200
weighted avg       0.98      0.97      0.97       200



In [51]:
#cross validation

validation = cross_val_score(RDFC,m1x,df1x.cluster,cv=6, scoring='accuracy')
print((validation))
print(np.mean(validation))

[0.96333333 0.95666667 0.96333333 0.96333333 0.97333333 0.96      ]
0.9633333333333334


### Support Vector Machine

In [46]:
SVMC = SVC(C=0.1, random_state=13)
SVMC.fit(x_train,y_train)
predicts=SVMC.predict(x_test)
print(classification_report(y_test,predict))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        50
           1       0.94      0.99      0.97       162
           2       1.00      0.89      0.94        36
           3       0.99      0.94      0.96        87
           4       1.00      1.00      1.00        25

    accuracy                           0.97       360
   macro avg       0.99      0.96      0.97       360
weighted avg       0.97      0.97      0.97       360



In [47]:
predict_val = SVMC.predict(m1v)
print(classification_report(df1v.cluster,predict_val))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        40
           1       0.92      1.00      0.96        84
           2       0.94      0.81      0.87        21
           3       0.93      0.95      0.94        40
           4       1.00      0.80      0.89        15

    accuracy                           0.94       200
   macro avg       0.96      0.90      0.93       200
weighted avg       0.95      0.94      0.94       200



In [52]:
#cross validation
validation = cross_val_score(SVMC,m1x,df1x.cluster,cv=6, scoring='accuracy')
print((validation))
print(np.mean(validation))

[0.94       0.94       0.95333333 0.94333333 0.95333333 0.95      ]
0.9466666666666667
