# Mercari NLP

In [1]:
#Standard imports
import pandas as pd
import os
import sys
import string
import re
import numpy as np

#Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#SKlearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#NLTK
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, sent_tokenize
from nltk import WordNetLemmatizer # lemmatizer using WordNet
from nltk.corpus import wordnet # imports WordNet
from nltk import pos_tag # nltk's native part of speech tagging

## Preproccessing

In [2]:
train_df = pd.read_csv("/Users/javm/Desktop/Mercari-Price-Prediction-Project/train.tsv", sep = '\t')

In [3]:
# Consider removing category_name values that sum up to less than 5k
train_df = train_df.groupby('category_name').filter(lambda x : len(x)>5000)

In [4]:
# Turn 'brand name' to 0s and 1s 

In [5]:
train_df['brand_name'].isna().value_counts()

False    610094
True     356142
Name: brand_name, dtype: int64

In [6]:
train_df['category_name'] = train_df['category_name'].str.replace('/', ' ')

In [7]:
# Dropped less than 10 rows
train_df.dropna(subset = ['item_description'], inplace = True)

In [8]:
train_df

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,,10.0,1,No description yet
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,,44.0,0,Complete with certificate of authenticity
5,5,Bundled items requested for Ruie,3,Women Other Other,,59.0,0,"Banana republic bottoms, Candies skirt with ma..."
...,...,...,...,...,...,...,...,...
1482526,1482526,Harry Potter Shirt! Women M/ Girl XL,2,Women Tops & Blouses T-Shirts,,12.0,0,"Great Harry Potter Shirt! ""Hogwarts, School of..."
1482527,1482527,Blk/white ribbed mock neck bodysuit M,1,Women Tops & Blouses Blouse,,10.0,1,Brand new black and white ribbed mock neck bod...
1482528,1482528,Victoria's Secret Tankini Sz. Large,2,Women Athletic Apparel Sports Bras,Victoria's Secret,18.0,1,Purple and Paisley Victoria's Secret Tankini S...
1482533,1482533,World markets lanterns,3,Home Home Décor Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [9]:
# Dropped .4%. Not 40%, but .4% as in .004
train_df.dropna(subset = ['category_name'],inplace = True)

In [10]:
train_df['brand_name'].isna().value_counts()

False    610093
True     356141
Name: brand_name, dtype: int64

In [46]:
train_df['brand_name'] = train_df['brand_name'].fillna(0)

In [47]:
train_df['brand_mention'] = np.where(train_df['brand_name']!= 0, True,False)
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,brand_mention
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,0,10.0,1,No description yet,False
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,True
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,0,35.0,1,New with tags. Leather horses. Retail for [rm]...,False
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,0,44.0,0,Complete with certificate of authenticity,False
5,5,Bundled items requested for Ruie,3,Women Other Other,0,59.0,0,"Banana republic bottoms, Candies skirt with ma...",False


In [48]:
one_hot_df = pd.get_dummies(train_df, prefix="brand_mention", 
                            columns=["brand_mention"], 
                            drop_first=True)

In [49]:
one_hot_df.drop(columns = ['brand_name'], axis = 1, inplace = True)

In [34]:
#Drop rows with no item description
one_hot_df['item_description'] = one_hot_df[one_hot_df["item_description"].str.contains("No description yet") == True]

In [50]:
one_hot_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,10.0,1,No description yet,0
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,10.0,1,Adorable top with a hint of lace and a key hol...,1
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,35.0,1,New with tags. Leather horses. Retail for [rm]...,0
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,44.0,0,Complete with certificate of authenticity,0
5,5,Bundled items requested for Ruie,3,Women Other Other,59.0,0,"Banana republic bottoms, Candies skirt with ma...",0
...,...,...,...,...,...,...,...,...
1482526,1482526,Harry Potter Shirt! Women M/ Girl XL,2,Women Tops & Blouses T-Shirts,12.0,0,"Great Harry Potter Shirt! ""Hogwarts, School of...",0
1482527,1482527,Blk/white ribbed mock neck bodysuit M,1,Women Tops & Blouses Blouse,10.0,1,Brand new black and white ribbed mock neck bod...,0
1482528,1482528,Victoria's Secret Tankini Sz. Large,2,Women Athletic Apparel Sports Bras,18.0,1,Purple and Paisley Victoria's Secret Tankini S...,1
1482533,1482533,World markets lanterns,3,Home Home Décor Home Décor Accents,45.0,1,There is 2 of each one that you see! So 2 red ...,0


In [55]:
#Drop rows with no item description
data = one_hot_df[one_hot_df['item_description'].str.contains('No description yet') == False]

In [56]:
data

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True
2,2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,10.0,1,Adorable top with a hint of lace and a key hol...,1
3,3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,35.0,1,New with tags. Leather horses. Retail for [rm]...,0
4,4,24K GOLD plated rose,1,Women Jewelry Necklaces,44.0,0,Complete with certificate of authenticity,0
5,5,Bundled items requested for Ruie,3,Women Other Other,59.0,0,"Banana republic bottoms, Candies skirt with ma...",0
6,6,Acacia pacific tides santorini top,3,Women Swimwear Two-Piece,64.0,0,Size small but straps slightly shortened to fi...,1
...,...,...,...,...,...,...,...,...
1482526,1482526,Harry Potter Shirt! Women M/ Girl XL,2,Women Tops & Blouses T-Shirts,12.0,0,"Great Harry Potter Shirt! ""Hogwarts, School of...",0
1482527,1482527,Blk/white ribbed mock neck bodysuit M,1,Women Tops & Blouses Blouse,10.0,1,Brand new black and white ribbed mock neck bod...,0
1482528,1482528,Victoria's Secret Tankini Sz. Large,2,Women Athletic Apparel Sports Bras,18.0,1,Purple and Paisley Victoria's Secret Tankini S...,1
1482533,1482533,World markets lanterns,3,Home Home Décor Home Décor Accents,45.0,1,There is 2 of each one that you see! So 2 red ...,0


In [59]:
sample_df = data.sample(n=100000, random_state = 42)

In [60]:
sample_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True
1392338,1392338,Size small Gianni Bini Dress,2,"Women Dresses Above Knee, Mini",14.0,0,Beautiful dress from Dilliards. Size small. La...,0
1125521,1125521,36B Victoria's Secret & Aerie Bra Set,3,Women Underwear Bras,41.0,0,Wireless. Good Used Condition. Smoke free home...,1
662640,662640,Victoria's Secret Leggings,2,"Women Athletic Apparel Pants, Tights, Leggings",24.0,0,From a smoke free home Size small,1
195122,195122,Morphe 35P palette w/FREE GIFTS!!,3,Beauty Makeup Makeup Palettes,19.0,0,"Unused, but one shadow broken. A little bit of...",1
1319269,1319269,Monster high doll for ashley,3,Kids Toys Dolls & Accessories,11.0,1,Duece,0
...,...,...,...,...,...,...,...,...
51125,51125,iPhone 7 Plus Otter Box Defender,2,"Electronics Cell Phones & Accessories Cases, C...",26.0,0,Barely used still like new,1
11925,11925,Lularoe Azure XL,3,Women Dresses Knee-Length,16.0,1,Violet azure with white polka dots. Fits like ...,1
875729,875729,7 Bundle Wii + GameCube Games,3,Electronics Video Games & Consoles Games,22.0,1,Total of 7 games - rec room - Mario sports mix...,1
1250401,1250401,Huda beauty,1,Beauty Makeup Makeup Palettes,21.0,0,Brand new never used,1


In [79]:
# import SentimentIntensityAnalyzer class
# from vaderSentiment.vaderSentiment module.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
# function to print sentiments
# of the sentence.
def sentiment_scores(sentence):
 
    # Create a SentimentIntensityAnalyzer object.
    sid_obj = SentimentIntensityAnalyzer()
 
    # polarity_scores method of SentimentIntensityAnalyzer
    # object gives a sentiment dictionary.
    # which contains pos, neg, neu, and compound scores.
    sentiment_dict = sid_obj.polarity_scores(sentence)
     
    sentiment_dict['compound']  
    return(sentiment_dict['compound'])
#     print("Overall sentiment dictionary is : ", sentiment_dict['compound'])
#     print("sentence was rated as ", sentiment_dict['neg']*100, "% Negative")
#     print("sentence was rated as ", sentiment_dict['neu']*100, "% Neutral")
#     print("sentence was rated as ", sentiment_dict['pos']*100, "% Positive")
 
#     print("Sentence Overall Rated As", end = " ")
 
    # decide sentiment as positive, negative and neutral
#     if sentiment_dict['compound'] >= 0.05 :
#         print("Positive")
 
#     elif sentiment_dict['compound'] <= - 0.05 :
#         print("Negative")
 
#     else :
#         print("Neutral")
 
 
   
# Driver code
if __name__ == "__main__" :
 
    print("\n1st statement :")
    sentence = "Geeks For Geeks is the best portal for \
                the computer science engineering students."
 
    # function calling
    sentiment_scores(sentence)
 
    print("\n2nd Statement :")
    sentence = "study is going on as usual"
    sentiment_scores(sentence)
 
    print("\n3rd Statement :")
    sentence = "I am very sad today."
    sentiment_scores(sentence)


1st statement :

2nd Statement :

3rd Statement :


In [82]:
sentence

'I am very sad today.'

In [80]:
ss = sentiment_scores(sentence)

In [81]:
ss

-0.5256

In [83]:
sample_df['compund_sentiment'] = sample_df['item_description'].apply(lambda x: sentiment_scores(x) )

In [85]:
sample_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True,compund_sentiment
1392338,1392338,Size small Gianni Bini Dress,2,"Women Dresses Above Knee, Mini",14.0,0,Beautiful dress from Dilliards. Size small. La...,0,0.6988
1125521,1125521,36B Victoria's Secret & Aerie Bra Set,3,Women Underwear Bras,41.0,0,Wireless. Good Used Condition. Smoke free home...,1,0.7351
662640,662640,Victoria's Secret Leggings,2,"Women Athletic Apparel Pants, Tights, Leggings",24.0,0,From a smoke free home Size small,1,0.5106
195122,195122,Morphe 35P palette w/FREE GIFTS!!,3,Beauty Makeup Makeup Palettes,19.0,0,"Unused, but one shadow broken. A little bit of...",1,0.9750
1319269,1319269,Monster high doll for ashley,3,Kids Toys Dolls & Accessories,11.0,1,Duece,0,0.0000
...,...,...,...,...,...,...,...,...,...
51125,51125,iPhone 7 Plus Otter Box Defender,2,"Electronics Cell Phones & Accessories Cases, C...",26.0,0,Barely used still like new,1,0.3041
11925,11925,Lularoe Azure XL,3,Women Dresses Knee-Length,16.0,1,Violet azure with white polka dots. Fits like ...,1,0.4753
875729,875729,7 Bundle Wii + GameCube Games,3,Electronics Video Games & Consoles Games,22.0,1,Total of 7 games - rec room - Mario sports mix...,1,0.7351
1250401,1250401,Huda beauty,1,Beauty Makeup Makeup Palettes,21.0,0,Brand new never used,1,0.0000


In [87]:
sample_df['compund_sentiment_desc'] =  sample_df['compund_sentiment']

In [94]:
vader_df = sample_df

In [95]:
vader_df

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,brand_mention_True,compund_sentiment_desc
1392338,1392338,Size small Gianni Bini Dress,2,"Women Dresses Above Knee, Mini",14.0,0,Beautiful dress from Dilliards. Size small. La...,0,0.6988
1125521,1125521,36B Victoria's Secret & Aerie Bra Set,3,Women Underwear Bras,41.0,0,Wireless. Good Used Condition. Smoke free home...,1,0.7351
662640,662640,Victoria's Secret Leggings,2,"Women Athletic Apparel Pants, Tights, Leggings",24.0,0,From a smoke free home Size small,1,0.5106
195122,195122,Morphe 35P palette w/FREE GIFTS!!,3,Beauty Makeup Makeup Palettes,19.0,0,"Unused, but one shadow broken. A little bit of...",1,0.9750
1319269,1319269,Monster high doll for ashley,3,Kids Toys Dolls & Accessories,11.0,1,Duece,0,0.0000
...,...,...,...,...,...,...,...,...,...
51125,51125,iPhone 7 Plus Otter Box Defender,2,"Electronics Cell Phones & Accessories Cases, C...",26.0,0,Barely used still like new,1,0.3041
11925,11925,Lularoe Azure XL,3,Women Dresses Knee-Length,16.0,1,Violet azure with white polka dots. Fits like ...,1,0.4753
875729,875729,7 Bundle Wii + GameCube Games,3,Electronics Video Games & Consoles Games,22.0,1,Total of 7 games - rec room - Mario sports mix...,1,0.7351
1250401,1250401,Huda beauty,1,Beauty Makeup Makeup Palettes,21.0,0,Brand new never used,1,0.0000


In [96]:
vader_df.to_csv('vad',index=False)