In [46]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import emoji
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from xgboost import XGBRegressor

In [2]:
df_price_dataset = pd.read_table('train.tsv')

In [3]:
display(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity
...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl..."
1482531,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape."
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...


In [4]:
missing_values_count = df_price_dataset.isnull().sum()
#look at the missing value
print(missing_values_count)

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          6
dtype: int64


In [6]:
def process_category(data):
    for i in range(3):
        
        def get_part(x):
            
            if type(x) != str:
                return np.nan
        
            parts = x.split('/')
            
            if i >= len(parts):
                return np.nan
            else:
                return parts[i]

        field_name = 'category_' + str(i)
        
        data[field_name] = data['category_name'].apply(get_part)
    
    return data

In [7]:
process_category(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_0,category_1,category_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces
...,...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Women/Dresses/Mid-Calf,Free People,20.0,1,"Lace, says size small but fits medium perfectl...",Women,Dresses,Mid-Calf
1482531,1482531,Little mermaid handmade dress,2,Kids/Girls 2T-5T/Dresses,Disney,14.0,0,Little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses
1482532,1482532,21 day fix containers and eating plan,2,Sports & Outdoors/Exercise/Fitness accessories,,12.0,0,"Used once or twice, still in great shape.",Sports & Outdoors,Exercise,Fitness accessories
1482533,1482533,World markets lanterns,3,Home/Home Décor/Home Décor Accents,,45.0,1,There is 2 of each one that you see! So 2 red ...,Home,Home Décor,Home Décor Accents


In [8]:
df_price_dataset.fillna({'brand_name': ' ', 'category_0': 'other', 'category_1': 'other', 'category_2': 'other'}, inplace = True)

In [10]:
df_price_dataset = df_price_dataset.drop(columns = ['category_name'], axis = 1)

In [16]:
missing_values_count = df_price_dataset.isnull().sum()
#look at the missing value
print(missing_values_count)

train_id             0
name                 0
item_condition_id    0
brand_name           0
price                0
shipping             0
item_description     0
category_0           0
category_1           0
category_2           0
dtype: int64


In [15]:
df_price_dataset = df_price_dataset.dropna()

In [17]:
display(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,category_0,category_1,category_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces
...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Free People,20.0,1,"Lace, says size small but fits medium perfectl...",Women,Dresses,Mid-Calf
1482531,1482531,Little mermaid handmade dress,2,Disney,14.0,0,Little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses
1482532,1482532,21 day fix containers and eating plan,2,,12.0,0,"Used once or twice, still in great shape.",Sports & Outdoors,Exercise,Fitness accessories
1482533,1482533,World markets lanterns,3,,45.0,1,There is 2 of each one that you see! So 2 red ...,Home,Home Décor,Home Décor Accents


In [23]:
def decontracted(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [21]:
def clean_text(text):
    if not isinstance(text, str):
        return ''  # Return empty string for non-string inputs
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove single quotes, double quotes
    text = re.sub(r"[\'\"]", "", text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = decontracted(text)
    # Remove emojis
    text = emoji.demojize(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and word not in list(string.punctuation)]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    cleaned_text = ' '.join(lemmatized_tokens)
    return cleaned_text

In [24]:
df_price_dataset['item_description'] = df_price_dataset['item_description'].apply(lambda x: clean_text(x))

In [25]:
display(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,category_0,category_1,category_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,keyboard great condition work like came box po...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Target,10.0,1,adorable top hint lace key hole back pale pink...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,,35.0,1,new tag leather horse retail rm stand foot hig...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,,44.0,0,complete certificate authenticity,Women,Jewelry,Necklaces
...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Free People,20.0,1,lace say size small fit medium perfectly never...,Women,Dresses,Mid-Calf
1482531,1482531,Little mermaid handmade dress,2,Disney,14.0,0,little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses
1482532,1482532,21 day fix containers and eating plan,2,,12.0,0,used twice still great shape,Sports & Outdoors,Exercise,Fitness accessories
1482533,1482533,World markets lanterns,3,,45.0,1,2 one see 2 red 2 orange 2 big red orange one ...,Home,Home Décor,Home Décor Accents


In [26]:
def get_features(data):
    
    luxury_brands = ["MCM", "MCM Worldwide", "Louis Vuitton", "Burberry", "Burberry London", "Burberry Brit", "HERMES", "Tieks",
                     "Rolex", "Apple", "Gucci", "Valentino", "Valentino Garavani", "RED Valentino", "Cartier", "Christian Louboutin",
                     "Yves Saint Laurent", "Saint Laurent", "YSL Yves Saint Laurent", "Georgio Armani", "Armani Collezioni", "Emporio Armani"]
    df_price_dataset['is_luxurious'] = (df_price_dataset['brand_name'].isin(luxury_brands)).astype(np.int8)

    expensive_brands = ["Michael Kors", "Louis Vuitton", "Lululemon", "LuLaRoe", "Kendra Scott", "Tory Burch", "Apple", "Kate Spade",
                  "UGG Australia", "Coach", "Gucci", "Rae Dunn", "Tiffany & Co.", "Rock Revival", "Adidas", "Beats", "Burberry",
                  "Christian Louboutin", "David Yurman", "Ray-Ban", "Chanel"]
    df_price_dataset['is_expensive'] = (df_price_dataset['brand_name'].isin(expensive_brands)).astype(np.int8)
    return data

In [27]:
get_features(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,category_0,category_1,category_2,is_luxurious,is_expensive
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,description yet,Men,Tops,T-shirts,0,0
1,1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,keyboard great condition work like came box po...,Electronics,Computers & Tablets,Components & Parts,0,0
2,2,AVA-VIV Blouse,1,Target,10.0,1,adorable top hint lace key hole back pale pink...,Women,Tops & Blouses,Blouse,0,0
3,3,Leather Horse Statues,1,,35.0,1,new tag leather horse retail rm stand foot hig...,Home,Home Décor,Home Décor Accents,0,0
4,4,24K GOLD plated rose,1,,44.0,0,complete certificate authenticity,Women,Jewelry,Necklaces,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired Dress,2,Free People,20.0,1,lace say size small fit medium perfectly never...,Women,Dresses,Mid-Calf,0,0
1482531,1482531,Little mermaid handmade dress,2,Disney,14.0,0,little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses,0,0
1482532,1482532,21 day fix containers and eating plan,2,,12.0,0,used twice still great shape,Sports & Outdoors,Exercise,Fitness accessories,0,0
1482533,1482533,World markets lanterns,3,,45.0,1,2 one see 2 red 2 orange 2 big red orange one ...,Home,Home Décor,Home Décor Accents,0,0


In [30]:
df_price_dataset['name'] = df_price_dataset['name']+' '+df_price_dataset['brand_name']

In [31]:
df_price_dataset

Unnamed: 0,train_id,name,item_condition_id,brand_name,price,shipping,item_description,category_0,category_1,category_2,is_luxurious,is_expensive
0,0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,description yet,Men,Tops,T-shirts,0,0
1,1,Razer BlackWidow Chroma KeyboardRazer Razer,3,Razer,52.0,0,keyboard great condition work like came box po...,Electronics,Computers & Tablets,Components & Parts,0,0
2,2,AVA-VIV BlouseTarget Target,1,Target,10.0,1,adorable top hint lace key hole back pale pink...,Women,Tops & Blouses,Blouse,0,0
3,3,Leather Horse Statues,1,,35.0,1,new tag leather horse retail rm stand foot hig...,Home,Home Décor,Home Décor Accents,0,0
4,4,24K GOLD plated rose,1,,44.0,0,complete certificate authenticity,Women,Jewelry,Necklaces,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,Free People Inspired DressFree People Free People,2,Free People,20.0,1,lace say size small fit medium perfectly never...,Women,Dresses,Mid-Calf,0,0
1482531,1482531,Little mermaid handmade dressDisney Disney,2,Disney,14.0,0,little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses,0,0
1482532,1482532,21 day fix containers and eating plan,2,,12.0,0,used twice still great shape,Sports & Outdoors,Exercise,Fitness accessories,0,0
1482533,1482533,World markets lanterns,3,,45.0,1,2 one see 2 red 2 orange 2 big red orange one ...,Home,Home Décor,Home Décor Accents,0,0


In [32]:
df_price_dataset['name'] = df_price_dataset['name'].apply(lambda x: clean_text(x))

In [34]:
df_price_dataset.drop('brand_name', axis = 1,inplace=True)

In [35]:
display(df_price_dataset)

Unnamed: 0,train_id,name,item_condition_id,price,shipping,item_description,category_0,category_1,category_2,is_luxurious,is_expensive
0,0,mlb cincinnati red shirt size xl,3,10.0,1,description yet,Men,Tops,T-shirts,0,0
1,1,razer blackwidow chroma keyboardrazer razer,3,52.0,0,keyboard great condition work like came box po...,Electronics,Computers & Tablets,Components & Parts,0,0
2,2,avaviv blousetarget target,1,10.0,1,adorable top hint lace key hole back pale pink...,Women,Tops & Blouses,Blouse,0,0
3,3,leather horse statue,1,35.0,1,new tag leather horse retail rm stand foot hig...,Home,Home Décor,Home Décor Accents,0,0
4,4,24k gold plated rose,1,44.0,0,complete certificate authenticity,Women,Jewelry,Necklaces,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1482530,1482530,free people inspired dressfree people free people,2,20.0,1,lace say size small fit medium perfectly never...,Women,Dresses,Mid-Calf,0,0
1482531,1482531,little mermaid handmade dressdisney disney,2,14.0,0,little mermaid handmade dress never worn size 2t,Kids,Girls 2T-5T,Dresses,0,0
1482532,1482532,21 day fix container eating plan,2,12.0,0,used twice still great shape,Sports & Outdoors,Exercise,Fitness accessories,0,0
1482533,1482533,world market lantern,3,45.0,1,2 one see 2 red 2 orange 2 big red orange one ...,Home,Home Décor,Home Décor Accents,0,0


In [36]:
data = df_price_dataset[['price', 'name', 'category_0', 'category_1',
       'category_2', 'shipping', 'item_condition_id', 'is_expensive', 'is_luxurious', 'item_description']]

In [39]:
y = data['price']
X = data.drop('price', axis = 1)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size = 0.5, random_state = 42)

In [51]:
def get_ohe(X_train,X_val, X_test, col_name):
    vect = CountVectorizer()
    tr_ohe = vect.fit_transform(X_train[col_name].values)
    val_ohe = vect.transform(X_val[col_name].values)
    te_ohe = vect.transform(X_test[col_name].values)
    return tr_ohe,val_ohe,te_ohe

In [52]:
def get_text_encodings(X_train,X_val,X_test, col_name, min_val, max_val):
    vect = TfidfVectorizer(min_df = 10, ngram_range = (min_val, max_val), max_features = 1000000)
    tr_text = vect.fit_transform(X_train[col_name].values)
    val_text = vect.transform(X_val[col_name].values)
    te_text = vect.transform(X_test[col_name].values)
    return tr_text,val_text, te_text

In [56]:
def generate_encodings(X_train,X_val,X_test):
    tr_ohe_category_0,val_ohe_category_0,te_ohe_category_0 = get_ohe(X_train,X_val, X_test, 'category_0')
    tr_ohe_category_1,val_ohe_category_1,te_ohe_category_1 = get_ohe(X_train,X_val, X_test, 'category_1')
    tr_ohe_category_2,val_ohe_category_2,te_ohe_category_2 = get_ohe(X_train,X_val,X_test, 'category_2')

    tr_trans = csr_matrix(pd.get_dummies(X_train[['shipping', 'item_condition_id', 'is_expensive', 'is_luxurious']], sparse=True).values)
    val_trans = csr_matrix(pd.get_dummies(X_val[['shipping', 'item_condition_id', 'is_expensive', 'is_luxurious']], sparse=True).values)
    te_trans = csr_matrix(pd.get_dummies(X_test[['shipping', 'item_condition_id', 'is_expensive', 'is_luxurious']], sparse=True).values)

    tr_name,val_name,te_name = get_text_encodings(X_train,X_val,X_test, 'name', 1, 1)
    tr_text, val_text,te_text = get_text_encodings(X_train,X_val,X_test, 'item_description', 1, 2)

    train_data = hstack((tr_ohe_category_0, tr_ohe_category_1, tr_ohe_category_2, tr_trans, \
                       tr_name, tr_text)).tocsr().astype('float32')
    
    val_data = hstack((val_ohe_category_0, val_ohe_category_1, val_ohe_category_2, val_trans, \
                       val_name, val_text)).tocsr().astype('float32')


    test_data = hstack((te_ohe_category_0, te_ohe_category_1, te_ohe_category_2, te_trans, \
                       te_name, te_text)).tocsr().astype('float32')

    return train_data, val_data,test_data

In [57]:
train_data, val_data,test_data = generate_encodings(X_train,X_val,X_test)
print(train_data.shape, y_train.shape)
print(val_data.shape, y_val.shape)
print(test_data.shape, y_test.shape)

(1186023, 284412) (1186023,)
(741264, 284412) (741264,)
(741265, 284412) (741265,)


In [59]:
def get_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [60]:
xgb_model = XGBRegressor()
xgb_model.fit(train_data, y_train)

predictions = xgb_model.predict(train_data)
train_score = get_rmsle(np.expm1(y_train), np.expm1(predictions))
print('Train rmsle explained: ', train_score)

predictions = xgb_model.predict(test_data)
test_score = get_rmsle(np.expm1(y_test), np.expm1(predictions))
print('Test rmsle explained: ', test_score)

XGBoostError: [00:31:12] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\common\io.h:232: bad_malloc: Failed to allocate 5758631424 bytes.