In [29]:
## This treatment applies what was found here:
## http://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
## The method is Kavita Ganesan

In [69]:
import numpy as np
import pandas as pd
import random
import spacy
import scipy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import scipy.sparse as sp
from sklearn.preprocessing import normalize

In [31]:
import re
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [32]:
e_commerce = pd.read_csv('e_commerce.csv',  encoding='latin-1')
e_commerce['product_description'].fillna('Description not available', inplace=True)
e_commerce['product_title'].fillna('title not available', inplace=True)

In [33]:
e_commerce.head()

Unnamed: 0,_unit_id,relevance,relevance:variance,product_image,product_link,product_price,product_title,query,rank,source,url,product_description
0,711158459,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$329.98,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,1,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
1,711158460,4.0,0.0,http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSy...,http://www.ebay.com/itm/Sony-PlayStation-4-Lat...,$324.84,Sony PlayStation 4 (Latest Model)- 500 GB Jet ...,playstation 4,2,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
2,711158461,4.0,0.0,http://thumbs4.ebaystatic.com/d/l225/m/m10NZXA...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$324.83,Sony PlayStation 4 PS4 500 GB Jet Black Console,playstation 4,3,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...
3,711158462,3.67,0.471,http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmA...,http://www.ebay.com/itm/Sony-PlayStation-4-500...,$350.00,Sony - PlayStation 4 500GB The Last of Us Rema...,playstation 4,4,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,Description not available
4,711158463,3.33,0.471,http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUI...,http://www.ebay.com/itm/Sony-PlayStation-4-PS4...,$308.00\nTrending at\n$319.99,Sony PlayStation 4 (PS4) (Latest Model)- 500 G...,playstation 4,5,eBay,http://www.ebay.com/sch/i.html?_from=R40&_trks...,The PlayStation 4 system opens the door to an ...


In [34]:
e_commerce.shape

(32671, 12)

In [35]:
percentage = 0.7
n_samples = int(percentage * e_commerce.shape[0])
indexes_train = random.sample(range(n_samples), n_samples)
X_train = e_commerce.iloc[e_commerce.index.isin(indexes_train)].copy()
X_test  = e_commerce.iloc[~e_commerce.index.isin(indexes_train)].copy()

In [36]:
X_train.shape

(22869, 12)

In [37]:
X_test.shape

(9802, 12)

In [38]:
e_commerce.groupby('product_title').count()['_unit_id'].value_counts()

1     27522
2      1946
3       194
4        83
5        14
6        11
7         6
11        3
15        2
10        2
9         2
8         2
12        1
20        1
16        1
Name: _unit_id, dtype: int64

In [39]:
## Start of the process

In [40]:
X_train['cleaned_description'] = X_train['product_description'].apply(lambda x:pre_process(x))
X_test['cleaned_description'] = X_test['product_description'].apply(lambda x:pre_process(x))

In [41]:
nlp = spacy.load('en')  # make sure to use larger model!

In [42]:
stopwords = nlp.Defaults.stop_words #Get the stopwords
docs=X_train['cleaned_description'].tolist()

In [43]:
cv=CountVectorizer(max_df=0.85,stop_words=stopwords)
word_count_vector=cv.fit_transform(docs)

In [44]:
word_count_vector.shape

(22869, 33354)

In [45]:
word_count_vector

<22869x33354 sparse matrix of type '<class 'numpy.int64'>'
	with 770171 stored elements in Compressed Sparse Row format>

In [46]:
list(cv.vocabulary_.keys())[:10]

['elegantmm',
 'chock',
 'urinary',
 'deducted',
 'lafem',
 'companypolicycontents',
 'mykonis',
 'multicam',
 'half',
 'jbl']

In [47]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [48]:
tfidf_transformer.idf_

array([ 4.65238689, 10.34443411, 10.34443411, ...,  4.66085434,
        4.66085434, 10.34443411])

In [49]:
## Extract keywords

In [50]:
len(docs)

22869

In [51]:
number_matrix_columns = tfidf_transformer.idf_.shape[0]
number_matrix_rows = len(docs)
type(number_matrix_columns)

int

In [52]:
docs_test = X_test['cleaned_description'].tolist()

In [55]:
#tf_idf_table_train = np.empty((number_matrix_rows, number_matrix_columns))
tf_idf_table_train = sp.csr_matrix((number_matrix_rows, number_matrix_columns)).tolil()
for i in range(len(docs)):
    tf_idf_table_train[i,:]=tfidf_transformer.transform(cv.transform([docs[i]]))

In [56]:
tf_idf_table_train = tf_idf_table_train.tocsr()

In [58]:
tf_idf_table_train

<22869x33354 sparse matrix of type '<class 'numpy.float64'>'
	with 770171 stored elements in Compressed Sparse Row format>

In [60]:
#u, s, vt = scipy.sparse.linalg.svds(tf_idf_table_train)

In [100]:
svd = TruncatedSVD(n_components=2000, n_iter=7, random_state=42)

In [101]:
svd.fit(tf_idf_table_train)

TruncatedSVD(algorithm='randomized', n_components=2000, n_iter=7,
       random_state=42, tol=0.0)

In [102]:
svd.components_.shape

(2000, 33354)

In [103]:
svd.explained_variance_ratio_.sum()

0.7933270915378761