In [3]:
import pandas as pd
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


In [4]:
df = pd.read_csv('../../data/devices-products-small.csv')
df.shape

(87, 2)

In [5]:
df['Category'].value_counts()

Category
computers    36
tablets      29
phones       22
Name: count, dtype: int64

In [6]:
x = df['Product'].values
y = df['Category'].values

x_train, x_test, y_train, y_test = train_test_split(x, y)

tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_vectors = tfidf_vectorizer.transform(x_test)

In [7]:
tokens = tfidf_vectorizer.get_feature_names_out()
tokens

array(['11', '12', '13', '15', '16', '1st', '2006', '2007', '2008',
       '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017', '2018', '2019', '2020', '2021', '2nd', '3rd', '4th', '5th',
       '6th', '7th', '8th', 'air', 'early', 'edge', 'four', 'galaxy',
       'generation', 'ii', 'inch', 'ipad', 'late', 'macbook', 'mid',
       'mini', 'ports', 'pro', 'retina', 's10', 's20', 's21', 's4', 's5',
       's6', 's7', 's8', 's9', 'samsung', 'thunderbolt', 'two', 'ultra'],
      dtype=object)

In [8]:
pd.DataFrame(data=tfidf_train_vectors.toarray(), columns=tokens)

Unnamed: 0,11,12,13,15,16,1st,2006,2007,2008,2009,...,s4,s5,s6,s7,s8,s9,samsung,thunderbolt,two,ultra
0,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.774529,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.376387,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.608791,0.359685,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.452489,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.000000,0.62464,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.0,0.0,0.400206,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
61,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
62,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
63,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [9]:
clf = RandomForestClassifier()
clf.fit(tfidf_train_vectors, y_train)
y_pred = clf.predict(tfidf_test_vectors)

df_compare = pd.DataFrame(
    data={
        'product': x_test,
        'predicted_category': y_pred,
        'real_category': y_test
    },
    columns=['product', 'predicted_category', 'real_category']
)
df_compare

Unnamed: 0,product,predicted_category,real_category
0,Samsung Galaxy S7 (2016),phones,phones
1,Samsung Galaxy S21 (2021),phones,phones
2,iPad Mini 4,tablets,tablets
3,iPad (9th generation),tablets,tablets
4,Samsung Galaxy S III (2012),phones,phones
5,"iPad Pro (11-inch, 3rd generation)",tablets,tablets
6,iPad Pro (10.5-inch),tablets,tablets
7,Samsung Galaxy S21+ (2021),phones,phones
8,"MacBook Pro (Retina, 13-inch, Late 2012)",computers,computers
9,"iPad Pro (12.9-inch, 3rd generation)",tablets,tablets


In [10]:
accuracy_score(y_test, y_pred)

1.0

In [11]:
future_x_test = tfidf_vectorizer.transform(['Sansun Galaxy S39', 'iPed Future'])

future_y_pred = clf.predict(future_x_test)
future_y_pred

array(['phones', 'tablets'], dtype=object)