# Mercari NLP

In [1]:
#Standard imports
import pandas as pd
import os
import sys
import string
import re
import numpy as np

#Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#SKlearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#NLTK
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, sent_tokenize
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging

## Preproccessing

In [2]:
train_df = pd.read_csv("/Users/javm/Desktop/Mercari-Price-Prediction-Project/train.tsv", sep = '\t')

In [67]:
train_df = train_df.groupby('category_name').filter(lambda x : len(x)>5000)

In [68]:
# Turn 'brand name' to 0s and 1s 

In [69]:
train_df['brand_name'].isna().value_counts()

False    966234
Name: brand_name, dtype: int64

In [6]:
# Consider removing category_name values that sum up to less than 50? 150?

In [7]:
train_df = train_df.groupby('category_name').filter(lambda x : len(x)>100)

In [8]:
train_df['category_name'] = train_df['category_name'].str.replace('/', ' ')

In [9]:
# Dropped less than 10 rows
train_df.dropna(subset = ['item_description'], inplace = True)

In [10]:
train_df

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics Computers & Tablets Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,,44.0,0,Complete with certificate of authenticity
...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Women Dresses Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482531,1482531,Little mermaid handmade dress,2,Kids Girls 2T-5T Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors Exercise Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home Home Décor Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [11]:
train_df.isna().value_counts()

train_id  name   item_condition_id  category_name  brand_name  price  shipping  item_description
False     False  False              False          False       False  False     False               842591
                                                   True        False  False     False               618478
dtype: int64

In [12]:
# Dropped .4%. Not 40%, but .4% as in .004
train_df.dropna(subset = ['category_name'],inplace = True)

In [13]:
train_df['brand_name'] = train_df['brand_name'].fillna(0)

In [14]:
train_df['brand_mention'] = np.where(train_df['brand_name']!= 0, True,False)
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_mention
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,0,10.0,1,No description yet,False
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics Computers & Tablets Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,True
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,True
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,0,35.0,1,New with tags. Leather horses. Retail for [rm]...,False
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,0,44.0,0,Complete with certificate of authenticity,False


In [15]:
one_hot_df = pd.get_dummies(train_df, prefix="brand_mention", 
                            columns=["brand_mention"], 
                            drop_first=True)

In [16]:
one_hot_df.drop(columns = ['brand_name'], axis = 1, inplace = True)

In [17]:
one_hot_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,10.0,1,No description yet,0
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics Computers & Tablets Components & P...,52.0,0,This keyboard is in great condition and works ...,1
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,10.0,1,Adorable top with a hint of lace and a key hol...,1
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,35.0,1,New with tags. Leather horses. Retail for [rm]...,0
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,44.0,0,Complete with certificate of authenticity,0
...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Women Dresses Mid-Calf,20.0,1,"Lace, says size small but fits medium perfectl...",1
1482531,1482531,Little mermaid handmade dress,2,Kids Girls 2T-5T Dresses,14.0,0,Little mermaid handmade dress never worn size 2t,1
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors Exercise Fitness accessories,12.0,0,"Used once or twice, still in great shape.",0
1482533,1482533,World markets lanterns,3,Home Home Décor Home Décor Accents,45.0,1,There is 2 of each one that you see! So 2 red ...,0


In [18]:
sample_df = one_hot_df.sample(n=100000)

In [19]:
sample_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True
783681,783681,6.5 pink keds,3,Women Shoes Fashion Sneakers,14.0,0,Womens pink keds sz 6.5 barely worn. Bundle wi...,0
1392308,1392308,"Calvin Klein White Bra Bralette, Sz M",1,Women Athletic Apparel Sports Bras,14.0,1,Brand: Calvin Klein Size: M Color: White Mate...,1
1173633,1173633,LipSense moisturizing lip balm,1,Beauty Makeup Lips,24.0,1,LipSense moisturizing lip balm Never been open...,1
898239,898239,Mardi Gras/Festival Mask,1,Women Jewelry Necklaces,8.0,1,Masquerade pendant on an approximate 18 inch c...,0
349202,349202,White Nike hoodie,3,Women Sweaters Hooded,16.0,0,Only worn a few times. Really cozy.,1
...,...,...,...,...,...,...,...,...
898214,898214,Sea Glass Fish Earrings Handmade,1,Women Jewelry Earrings,23.0,0,I love to make jewelry and want to share my cr...,0
1481510,1481510,Black low top women's size 9 converses,3,Women Shoes Athletic,20.0,1,Worn a couple times and a few scratches and fl...,1
841031,841031,Matte lipsense gloss,1,Beauty Makeup Lips,22.0,1,6 matte glosses [rm] each Will not ship to Ala...,1
958767,958767,Carter's Baby Girl Rompers Size 9 Months,1,Kids Girls 0-24 Mos One-Pieces,36.0,0,Lot of 7 sets. All new with tags. Size 9 month...,1


In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [23]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        
        pass
    
    def fit(self, data, y = 0):
        
        return self
    
    def transform(self, data, y = 0):
        fully_normalized_corpus = data.apply(self.process_doc)
        
        return fully_normalized_corpus
        
    def process_doc(self, doc):

        wnl = WordNetLemmatizer()
        stop_words = stopwords.words('english')
        
        def pos_tagger(nltk_tag):
            if nltk_tag.startswith('J'):
                return wordnet.ADJ
            elif nltk_tag.startswith('V'):
                return wordnet.VERB
            elif nltk_tag.startswith('N'):
                return wordnet.NOUN
            elif nltk_tag.startswith('R'):
                return wordnet.ADV
            else:         
                return None

        doc_norm = [tok.lower() for tok in word_tokenize(doc) if ((tok.isalpha()) & (tok not in stop_words)) ]

        wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(doc_norm))) 
        doc_norm = [wnl.lemmatize(token, pos) for token, pos in wordnet_tagged if pos is not None]

        return " ".join(doc_norm)

In [24]:
proc = TextPreprocessor()

In [25]:
df1 = proc.fit_transform(sample_df['item_description'])

In [26]:
df2 = proc.fit_transform(sample_df['category_name'])

In [27]:
df3 = proc.fit_transform(sample_df['name'])

In [28]:
df12 = df1.str.cat(df2, sep = ' ')

In [29]:
df123 = df12.str.cat(df3, sep = ' ')

In [30]:
df123

783681     woman pink keds sz barely worn bundle listing ...
1392308    brand calvin klein size m color white material...
1173633    lipsense moisturize lip balm never open still ...
898239     masquerade pendant approximate inch chain silv...
349202     only worn time really cozy woman sweater hood ...
                                 ...                        
898214     i love make jewelry want share creation woman ...
1481510    worn couple time scratch flaw still good condi...
841031     matte gloss rm ship alaska hawaii beauty makeu...
958767     lot set new tag size month sell lot seperate s...
957593     new tag oshkosh cat shoe size kid euro size fl...
Name: item_description, Length: 100000, dtype: object

In [31]:
#total_description_df= pd.Series(["df123"],name="total_description")
total_description_df = df123.to_frame()

In [32]:
total_description_df.rename(columns ={'item_description' : 'total_description'}, inplace = True)

In [33]:
total_description_df

Unnamed: 0,total_description
783681,woman pink keds sz barely worn bundle listing ...
1392308,brand calvin klein size m color white material...
1173633,lipsense moisturize lip balm never open still ...
898239,masquerade pendant approximate inch chain silv...
349202,only worn time really cozy woman sweater hood ...
...,...
898214,i love make jewelry want share creation woman ...
1481510,worn couple time scratch flaw still good condi...
841031,matte gloss rm ship alaska hawaii beauty makeu...
958767,lot set new tag size month sell lot seperate s...


In [34]:
total_description_df.to_csv('vec_df',index=False)

In [35]:
vec_df = pd.read_csv('/Users/javm/Desktop/Mercari-Price-Prediction-Project/vec_df')

In [36]:
vec_df

Unnamed: 0,total_description
0,woman pink keds sz barely worn bundle listing ...
1,brand calvin klein size m color white material...
2,lipsense moisturize lip balm never open still ...
3,masquerade pendant approximate inch chain silv...
4,only worn time really cozy woman sweater hood ...
...,...
99995,i love make jewelry want share creation woman ...
99996,worn couple time scratch flaw still good condi...
99997,matte gloss rm ship alaska hawaii beauty makeu...
99998,lot set new tag size month sell lot seperate s...


## Import Merged Text

In [37]:
df = pd.concat([sample_df, total_description_df], axis = 1, join = 'inner')
df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True,total_description
783681,783681,6.5 pink keds,3,Women Shoes Fashion Sneakers,14.0,0,Womens pink keds sz 6.5 barely worn. Bundle wi...,0,woman pink keds sz barely worn bundle listing ...
1392308,1392308,"Calvin Klein White Bra Bralette, Sz M",1,Women Athletic Apparel Sports Bras,14.0,1,Brand: Calvin Klein Size: M Color: White Mate...,1,brand calvin klein size m color white material...
1173633,1173633,LipSense moisturizing lip balm,1,Beauty Makeup Lips,24.0,1,LipSense moisturizing lip balm Never been open...,1,lipsense moisturize lip balm never open still ...
898239,898239,Mardi Gras/Festival Mask,1,Women Jewelry Necklaces,8.0,1,Masquerade pendant on an approximate 18 inch c...,0,masquerade pendant approximate inch chain silv...
349202,349202,White Nike hoodie,3,Women Sweaters Hooded,16.0,0,Only worn a few times. Really cozy.,1,only worn time really cozy woman sweater hood ...
...,...,...,...,...,...,...,...,...,...
898214,898214,Sea Glass Fish Earrings Handmade,1,Women Jewelry Earrings,23.0,0,I love to make jewelry and want to share my cr...,0,i love make jewelry want share creation woman ...
1481510,1481510,Black low top women's size 9 converses,3,Women Shoes Athletic,20.0,1,Worn a couple times and a few scratches and fl...,1,worn couple time scratch flaw still good condi...
841031,841031,Matte lipsense gloss,1,Beauty Makeup Lips,22.0,1,6 matte glosses [rm] each Will not ship to Ala...,1,matte gloss rm ship alaska hawaii beauty makeu...
958767,958767,Carter's Baby Girl Rompers Size 9 Months,1,Kids Girls 0-24 Mos One-Pieces,36.0,0,Lot of 7 sets. All new with tags. Size 9 month...,1,lot set new tag size month sell lot seperate s...


In [39]:
ready_df = df.drop(['name', 'category_name', 'item_description'], axis=1)

ready_df

Unnamed: 0,train_id,item_condition_id,price,shipping,brand_mention_True,total_description
783681,783681,3,14.0,0,0,woman pink keds sz barely worn bundle listing ...
1392308,1392308,1,14.0,1,1,brand calvin klein size m color white material...
1173633,1173633,1,24.0,1,1,lipsense moisturize lip balm never open still ...
898239,898239,1,8.0,1,0,masquerade pendant approximate inch chain silv...
349202,349202,3,16.0,0,1,only worn time really cozy woman sweater hood ...
...,...,...,...,...,...,...
898214,898214,1,23.0,0,0,i love make jewelry want share creation woman ...
1481510,1481510,3,20.0,1,1,worn couple time scratch flaw still good condi...
841031,841031,1,22.0,1,1,matte gloss rm ship alaska hawaii beauty makeu...
958767,958767,1,36.0,0,1,lot set new tag size month sell lot seperate s...


In [45]:
prc_steps = [('tfid', TfidfVectorizer(max_features = 2000))]
tfid_pipeline = Pipeline(prc_steps)

In [46]:
processed_text = tfid_pipeline.fit_transform(df['total_description'])

In [47]:
processed_text

<100000x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 1643546 stored elements in Compressed Sparse Row format>

In [48]:
# feat_names = tfid_pipeline['tfid'].get_feature_names()

word_vec = pd.DataFrame(processed_text.toarray())


In [49]:
word_vec

Unnamed: 0,abercrombie,abh,able,absolutely,accent,accept,access,accessory,acid,acne,...,yoga,york,young,younique,youth,zara,zebra,zip,zipper,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
ready_df

Unnamed: 0,train_id,item_condition_id,price,shipping,brand_mention_True,total_description
783681,783681,3,14.0,0,0,woman pink keds sz barely worn bundle listing ...
1392308,1392308,1,14.0,1,1,brand calvin klein size m color white material...
1173633,1173633,1,24.0,1,1,lipsense moisturize lip balm never open still ...
898239,898239,1,8.0,1,0,masquerade pendant approximate inch chain silv...
349202,349202,3,16.0,0,1,only worn time really cozy woman sweater hood ...
...,...,...,...,...,...,...
898214,898214,1,23.0,0,0,i love make jewelry want share creation woman ...
1481510,1481510,3,20.0,1,1,worn couple time scratch flaw still good condi...
841031,841031,1,22.0,1,1,matte gloss rm ship alaska hawaii beauty makeu...
958767,958767,1,36.0,0,1,lot set new tag size month sell lot seperate s...


In [50]:
forest_df = pd.concat([ready_df, word_vec], axis =1,join='inner')

In [51]:
forest_df.drop(['total_description'], axis = 1,inplace = True)

In [64]:
forest_df.to_csv('sample_before_train_test', index = False)

In [65]:
forest_df

Unnamed: 0,train_id,item_condition_id,price,shipping,brand_mention_True,abercrombie,abh,able,absolutely,accent,...,yoga,york,young,younique,youth,zara,zebra,zip,zipper,zoom
31034,31034,2,75.0,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
79100,79100,1,34.0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
59984,59984,1,7.0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
93087,93087,3,44.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
56939,56939,3,8.0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14470,14470,1,34.0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
35803,35803,1,24.0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
42804,42804,1,6.0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205862,0.0
42624,42624,2,19.0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [54]:
from sklearn.model_selection import train_test_split

y = forest_df['price']
X = forest_df.drop(columns = ['price'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [55]:
from sklearn.ensemble import RandomForestRegressor

In [56]:
rf = RandomForestRegressor(random_state = 42)

In [57]:
rf.fit(X_train,y_train)

RandomForestRegressor(random_state=42)

In [58]:
rf.score(X_train,y_train)

0.8509512874993008

In [59]:
rf.score(X_test,y_test)

-0.05300844117909326

In [62]:
y_pred = rf.predict(X_test)

In [60]:
from sklearn.metrics import classification_report

In [63]:
print(classification_report(y_test, y_pred))

ValueError: continuous-multioutput is not supported

 ## Define Sigmoid

In [None]:
def sigmoid(z):
    s = 1/1(1+np.exp(-z))
    
    return s

def initialize_with_zeros(dim):
    w = np.zeros(shape=(dim, 1))
    b = 0
    
    assert(s.shape == (dim, 1))
    assert(isinstance(b,float) or isinstance(b,int))
    
    return w,b

In [None]:
X_train.shape

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Original OG CNN
model = Sequential()
model.add(Input(shape= (number of dimensions in data)))
model.add(Dense(128, activation=‘relu’))
model_reg.add(Dropout(0.2))
model.add(Dense(128, activation=‘relu’))
model.add(Dense(64, activation=‘relu’))
model_reg.add(Dropout(0.2))
model.add(Dense(32, activation=‘relu’))
model.add(Dense(1, activation=‘sigmoid’))
model.compile(loss=‘binary_crossentropy’,
              optimizer=‘adam’, metrics=‘accuracy’)
model.summary()


In [None]:
#Feature of CNN if not enough information by 'patience' epocs, shut it down
trainCallback = EarlyStopping(monitor=‘loss’, min_delta = 1e-6, patience = 20)

In [None]:
# Fit into CNN   
base = model.fit(X_train,  y_train, epochs = 50, 
                 batch_size = 128, validation_size=0.2 , callbacks=[trainCallback], verbose =1)

In [None]:
#Graphing   
sns.set()
acc = base.history[‘accuracy’]
val = base.history[‘val_accuracy’]
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, ‘-’, label=‘Training accuracy’)
plt.plot(epochs, val, ‘:’, label=‘Validation accuracy’)
plt.title(‘Training and Validation Accuracy’)
plt.xlabel(‘Epoch’)
plt.ylabel(‘Accuracy’)
plt.legend(loc=‘lower right’)
plt.plot()

In [None]:
#this is optimized for a binary classifier and is a cnn, to do a baseline i would just have one hidden layer:
#more simpler model:
model = Sequential()
model.add(Input(shape= (number of dimensions in data)))
model.add(Dense(128, activation=‘relu’))
model.add(Dense(1, activation=‘sigmoid’))
model.compile(loss=‘binary_crossentropy’,
              optimizer=‘adam’, metrics=‘accuracy’)
model.summary()