In [20]:
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


In [21]:
# Load the products table; the path is relative to the notebook's working
# directory. Later cells read its `Product` and `Category` columns.
df = pd.read_csv(filepath_or_buffer='../../data/devices-products-small.csv')
# Quick (rows, columns) sanity check of what was loaded.
df.shape

(87, 2)

In [9]:
# Number of rows per label, to see how balanced the classes are.
df.loc[:, 'Category'].value_counts()

Category
computers    36
tablets      29
phones       22
Name: count, dtype: int64

In [15]:
# Features are the raw product-name strings; targets are their category labels.
x = df['Product'].values
y = df['Category'].values

# Hold out a test split. `random_state` pins the shuffle so the split (and every
# number reported further down) is the same on each run; `stratify=y` keeps the
# class proportions equal in both splits, which matters with so few rows per class.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,  # scikit-learn's default, spelled out
    random_state=42,
    stratify=y,
)

# Fit the TF-IDF vocabulary and IDF weights on the training texts only, then
# reuse that fitted vectorizer on the test texts so nothing from the held-out
# set influences the features.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)

In [16]:
# Feature names (vocabulary terms) of the fitted vectorizer; the next cell uses
# them as column labels for the TF-IDF matrix.
tokens = tfidf_vectorizer.get_feature_names_out()
tokens

array(['11', '12', '13', '15', '1st', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2nd', '3rd', '4th', '5th', '6th',
       '7th', '8th', '9th', 'air', 'early', 'edge', 'four', 'galaxy',
       'generation', 'ii', 'inch', 'ipad', 'late', 'macbook', 'mid',
       'mini', 'ports', 'pro', 'retina', 's10', 's20', 's21', 's5', 's6',
       's7', 's8', 's9', 'samsung', 'thunderbolt', 'two', 'ultra'],
      dtype=object)

In [13]:
# Dense view of the training TF-IDF matrix: one row per training product, one
# column per vocabulary term. The labels are read straight from the fitted
# vectorizer (assigned in the same cell as the matrix), so they stay aligned
# with the matrix columns even if a separately stored name list is out of date.
pd.DataFrame(
    data=tfidf_train_vectors.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

Unnamed: 0,10,11,12,13,14,15,1st,2006,2007,2008,...,s4,s5,s6,s7,s8,s9,samsung,thunderbolt,two,ultra
0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,0.0,0.0,0.0,0.339070,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.386574,0.000000,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.439978,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.0,0.0,0.0,0.319223,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.363947,0.435435,0.0
61,0.0,0.0,0.0,0.000000,0.0,0.450849,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
62,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.532273,0.0,0.0,0.299153,0.000000,0.000000,0.0
63,0.0,0.0,0.0,0.317693,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.362203,0.433348,0.0


In [18]:
# Train a random forest on the TF-IDF features. The forest is randomized
# (bootstrap samples, feature sub-sampling), so `random_state` is fixed to make
# the fitted model and its predictions repeatable across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(tfidf_train_vectors, y_train)
y_pred = clf.predict(tfidf_test_vectors)

# Side-by-side table of each held-out product with its predicted and true label.
# Column order follows the dict's insertion order, so no separate `columns`
# list is needed.
df_compare = pd.DataFrame(
    {
        'product': x_test,
        'predicted_category': y_pred,
        'real_category': y_test,
    }
)
df_compare

Unnamed: 0,product,predicted_category,real_category
0,"MacBook Pro (16-inch, 2019)",computers,computers
1,iPad Mini,tablets,tablets
2,"MacBook Pro (Retina, 13-inch, Early 2013)",computers,computers
3,MacBook Pro (Late 2012),computers,computers
4,"MacBook Pro (14-inch, 2021)",computers,computers
5,"MacBook Pro (15-inch, Mid 2017)",computers,computers
6,"MacBook Pro (16-inch, 2021)",computers,computers
7,Samsung Galaxy S10 (2019),phones,phones
8,Samsung Galaxy S7 Edge (2016),phones,phones
9,Samsung Galaxy S10e (2019),phones,phones


In [34]:
# Fraction of held-out products whose predicted category equals the true one.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [31]:
# Ad-hoc product names (presumably not in the dataset) to probe how the fitted
# pipeline handles unseen text. They go through the already-fitted vectorizer,
# so only terms in its learned vocabulary contribute to the features.
future_products = ['Sansun Galaxy S39', 'iPad Future']
future_x_test = tfidf_vectorizer.transform(future_products)

# Predicted category for each name, in the same order as `future_products`.
future_y_pred = clf.predict(future_x_test)
future_y_pred

array(['phones', 'tablets'], dtype=object)