Methods of Machine Learning and Intellectual Analysis of Data (Learning course) \
Bohdan Pavlyshenko (https://www.linkedin.com/in/bpavlyshenko/) \
Youtube video: https://www.youtube.com/watch?v=muHojPnCcGU&list=PLMQt7tnruMvF1jetHNUKjMtHKSp78H2bb&index=15

# Text Regression (Product Price Prediction Using Text Description)

Used Resource: \
https://www.kaggle.com/c/mercari-price-suggestion-challenge    

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split
import math
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix, hstack
import xgboost as xgb
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
train = pd.read_table("/content/drive/MyDrive/LNU/2 course/2 semester/Data processing systems/jupyter_notebooks_data/data/mercary_price_train.tsv")

In [3]:
train.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...


In [4]:
print(train.brand_name.nunique(), train.category_name.nunique(), train.shape)
for i in range(6):
  train = train.iloc[::2].copy()  # Slicing with step 2
  print(train.brand_name.nunique(), train.category_name.nunique(), train.shape)

4809 1287 (1482535, 8)
3948 1228 (741268, 8)
3222 1157 (370634, 8)
2554 1066 (185317, 8)
1975 968 (92659, 8)
1497 867 (46330, 8)
1118 751 (23165, 8)


In [5]:
def fill_na(df):
    df.category_name.fillna(value="na", inplace=True)
    df.brand_name.fillna(value="na", inplace=True)
    df.item_description.fillna(value="na", inplace=True)
    return (df)

train =fill_na(train)

le = LabelEncoder()
le.fit(train.category_name)
train.category_name = le.transform(train.category_name)

le.fit(train.brand_name)
train.brand_name = le.transform(train.brand_name)

train.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,429,1111,10.0,1,No description yet
64,64,Patagonia Fleece Jacket,3,366,1112,36.0,0,Men's XXL In Good Condition Minor mark on Sleeve
128,128,Zodiac tapestry,1,210,1111,15.0,1,"BRAND NEW TAPESTRY! Size 88"" by 55"" Twin tapes..."


In [6]:
tfidf = TfidfVectorizer(max_features=30000,ngram_range=(1, 3),stop_words='english')
tfidf_v = tfidf.fit_transform(train['item_description']+' '+train['name'])

In [7]:
lb = LabelBinarizer(sparse_output=True)
brand_b = lb.fit_transform(train['brand_name'])

In [8]:
lb = LabelBinarizer(sparse_output=True)
categ_b = lb.fit_transform(train['category_name'])

In [9]:
dumm_v = csr_matrix(pd.get_dummies(train[['item_condition_id', 'shipping']],sparse=True).values)

In [10]:
x_sparse=hstack([tfidf_v,categ_b,brand_b,dumm_v]).tocsr()

In [11]:
y_train = np.log1p(train["price"])

In [12]:
dtrain, dvalid, y_train, y_val = train_test_split(x_sparse, y_train, random_state=1, test_size=0.15)

In [13]:
d_train = xgb.DMatrix(dtrain, label=y_train)
d_val = xgb.DMatrix(dvalid, label=y_val)

In [14]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.1
params['max_depth'] = 15
params['silent'] = 1
params['subsample']=1
params['colsample_bytree']=0.75
params['gamma']=0
params['min_child_weight']=1
params['scale_pos_weight']=1
params['eval_metric'] = 'rmse'
watchlist = [(d_train, 'train'), (d_val, 'val')]
clf = xgb.train(params, d_train, 500, watchlist, early_stopping_rounds=5, verbose_eval=10)



Parameters: { "silent" } are not used.



[0]	train-rmse:0.72507	val-rmse:0.73390
[10]	train-rmse:0.59758	val-rmse:0.65215
[20]	train-rmse:0.53388	val-rmse:0.62447
[30]	train-rmse:0.50172	val-rmse:0.61071
[40]	train-rmse:0.48143	val-rmse:0.60313
[50]	train-rmse:0.46558	val-rmse:0.59737
[60]	train-rmse:0.45347	val-rmse:0.59343
[70]	train-rmse:0.44277	val-rmse:0.59023
[80]	train-rmse:0.43215	val-rmse:0.58704
[90]	train-rmse:0.42374	val-rmse:0.58488
[100]	train-rmse:0.41595	val-rmse:0.58298
[110]	train-rmse:0.40866	val-rmse:0.58111
[120]	train-rmse:0.40183	val-rmse:0.57939
[130]	train-rmse:0.39514	val-rmse:0.57804
[140]	train-rmse:0.38968	val-rmse:0.57692
[150]	train-rmse:0.38410	val-rmse:0.57593
[160]	train-rmse:0.37830	val-rmse:0.57482
[170]	train-rmse:0.37267	val-rmse:0.57419
[180]	train-rmse:0.36768	val-rmse:0.57349
[190]	train-rmse:0.36309	val-rmse:0.57308
[200]	train-rmse:0.35818	val-rmse:0.57206
[210]	train-rmse:0.35413	val-rmse:0.57155
[220]	train-rmse:0.34952	val-rmse:0.57090
[229]	train-rmse:0.34613	val-rmse:0.57085


In [15]:
pred = clf.predict(d_val)

In [16]:
rmsle_v = np.sqrt(np.mean((pd.Series(pred.tolist())-pd.Series(y_val.tolist()))**2))
print(f" RMSLE : {rmsle_v}")

 RMSLE : 0.5708514580095368
