# Simple XGBoost Regressor with TF-IDF 

#### TF-IDF is applied to all of the text present in the data: all string columns are concatenated and the result is passed through a TF-IDF vectorizer. I believe that working on each column separately and performing some hyperparameter tuning (for example, with grid search) could improve the model considerably.

In [24]:
import pandas as pd
import numpy as np
import string
import xgboost as xgb

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from collections import Counter
from collections import defaultdict
from collections import Counter

from sklearn.preprocessing import StandardScaler

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Cleaning the Data

In [2]:
def remove_points(text):
    """Return ``text`` with all ``string.punctuation`` characters dropped.

    Punctuation is deleted rather than replaced by a space, so words that
    were separated only by punctuation end up joined (``"a,b"`` -> ``"ab"``).
    """
    def is_not_punctuation(char):
        return char not in string.punctuation

    return "".join(filter(is_not_punctuation, text))

In [3]:
# Tokenizer
def tokenize(text):
    r"""Lower-case ``text`` and return the list of tokens matching ``\w+``."""
    lowered = text.lower()
    return RegexpTokenizer(r'\w+').tokenize(lowered)

In [4]:
# The stop words are stored in a Counter (a dict subclass) so that each
# membership test below is a hash lookup instead of a scan over the list.
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)
def remove_stopWords(text):
    """Return the items of ``text`` that are not keys of ``stopwords_dict``.

    ``text`` is iterated item by item, so callers are expected to pass a
    list of tokens.
    """
    kept = []
    for token in text:
        if token in stopwords_dict:
            continue
        kept.append(token)
    return kept

In [5]:
# One shared Porter stemmer instance, reused for every call.
stemmer = PorterStemmer()
def word_stemmer(text):
    """Return a list holding ``stemmer.stem(item)`` for each item of ``text``."""
    return list(map(stemmer.stem, text))

In [6]:
# Preprocessing function
def preprocessing(text,rm_points=False,token=False,rm_stop=False,stem=False):
    """Apply the selected cleaning steps to ``text`` and return one string.

    Steps run in this order, each only when its flag is true:

    * ``rm_points`` -- strip punctuation with ``remove_points``.
    * ``token``     -- lower-case and tokenise with ``tokenize``.
    * ``rm_stop``   -- drop stop words with ``remove_stopWords``.
    * ``stem``      -- stem every token with ``word_stemmer``.

    Returns the resulting tokens joined by single spaces. When no step
    produced a token list, the (possibly punctuation-stripped) string is
    returned as is.
    """
    if rm_points:
        text = remove_points(text)
    if token:
        text = tokenize(text)
    # Stop-word removal and stemming work on tokens. If the caller did not
    # ask for tokenisation, fall back to a plain whitespace split (no
    # lower-casing) so these steps never iterate over single characters.
    if (rm_stop or stem) and isinstance(text, str):
        text = text.split()
    if rm_stop:
        text = remove_stopWords(text)
    if stem:
        text = word_stemmer(text)
    # Joining a str would put a space between every character, so only a
    # token list is joined.
    if isinstance(text, str):
        return text
    return " ".join(text)

In [7]:
# Load the training and test sets from the working directory.
data, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Tf Idf Vectorizer

In [8]:
# Display the column labels of the training frame.
data.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'id'],
      dtype='object')

In [9]:
# Build one text field per training row by concatenating, in order, the
# string form of every text column.
# NOTE(review): no separator is inserted between the fields, so the end of one
# column runs straight into the start of the next. Joining with a space would
# change the features (and the recorded scores) and must be done identically
# for the test frame -- confirm before changing.
data["all_text"] = sum(
    (
        data[column].astype(str)
        for column in (
            'description', 'designation', 'province', 'region_1', 'region_2',
            'taster_name', 'taster_twitter_handle', 'title', 'variety',
            'winery',
        )
    ),
    data['country'].astype(str),
)

In [10]:
# Build one text field per test row by concatenating, in order, the string
# form of every text column.
# NOTE(review): no separator is inserted between the fields, so the end of one
# column runs straight into the start of the next. Joining with a space would
# change the features (and the recorded scores) and must be done identically
# for the training frame -- confirm before changing.
test["all_text"] = sum(
    (
        test[column].astype(str)
        for column in (
            'description', 'designation', 'province', 'region_1', 'region_2',
            'taster_name', 'taster_twitter_handle', 'title', 'variety',
            'winery',
        )
    ),
    test['country'].astype(str),
)

In [11]:
# Stack the training and test text (training rows first) into one Series.
all_text = pd.concat(objs=[data["all_text"], test["all_text"]])

In [12]:
# Empty containers for the preprocessed training and test documents.
train_all_text, test_all_text = [], []

In [13]:
# Preprocess every document: strip punctuation, tokenise, drop stop words,
# no stemming. The lists are rebuilt from scratch, so re-running this cell
# cannot append duplicates, and iterating the column directly avoids
# building a full row Series per document as iterrows() does.
# NOTE(review): the TF-IDF cell in this notebook fits and transforms the raw
# "all_text" columns, not these lists -- confirm whether that is intended.
train_all_text = [
    preprocessing(document, rm_points=True, token=True, rm_stop=True, stem=False)
    for document in data["all_text"]
]
test_all_text = [
    preprocessing(document, rm_points=True, token=True, rm_stop=True, stem=False)
    for document in test["all_text"]
]

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF over unigrams and bigrams, capped at 15000 features.
# NOTE(review): the vectorizer is fitted on train + test text together and is
# fed the raw "all_text" columns; the preprocessed train_all_text /
# test_all_text lists are not used here -- confirm this is intended.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    norm='l2',
    # An integer min_df must be >= 1: recent scikit-learn releases reject 0
    # during parameter validation. Every term in the vocabulary occurs in at
    # least one document, so 1 keeps exactly the terms that 0 was keeping.
    min_df=1,
    smooth_idf=False,
    max_features=15000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(data["all_text"])
test_word_features = word_vectorizer.transform(test["all_text"])

In [20]:
from scipy import sparse

# Append the "points" values as one extra column to the right of the
# training TF-IDF matrix.
stacked = sparse.hstack(
    (train_word_features, np.array(data["points"])[:, np.newaxis])
)

In [52]:
# Append the "points" values as one extra column to the right of the test
# TF-IDF matrix, mirroring the training features.
test_stacked = sparse.hstack(
    (test_word_features, np.array(test["points"])[:, np.newaxis])
)

In [53]:
# Wrap the test features in an XGBoost DMatrix (no labels for the test set).
test_dmatrix = xgb.DMatrix(test_stacked)

In [26]:
# Wrap the training features in an XGBoost DMatrix, with "price" as the label.
data_dmatrix = xgb.DMatrix(stacked, label=data["price"])

# Model Training

In [50]:
# Booster configuration shared by cross-validation and the final fit.
params = dict(
    objective="reg:squarederror",
    colsample_bytree=1,
    learning_rate=0.2,
    max_depth=10,
    alpha=3,
    booster='dart',
)

# 3-fold cross-validation reporting RMSE: at most 50 boosting rounds, with
# early stopping after 10 rounds, progress logged every 2 rounds.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    verbose_eval=2,
    seed=123,
)

[0]	train-rmse:44.0022+0.362275	test-rmse:45.0368+0.700396
[2]	train-rmse:34.2939+0.321524	test-rmse:37.2147+0.614473
[4]	train-rmse:28.6768+0.333574	test-rmse:33.1803+0.625382
[6]	train-rmse:25.4342+0.2263	test-rmse:31.0721+0.749956
[8]	train-rmse:23.5492+0.173856	test-rmse:30.0246+0.773033
[10]	train-rmse:22.2519+0.153842	test-rmse:29.3894+0.861841
[12]	train-rmse:21.3689+0.10173	test-rmse:28.9026+0.890044
[14]	train-rmse:20.7298+0.112303	test-rmse:28.6061+0.922361
[16]	train-rmse:20.3117+0.118437	test-rmse:28.3922+0.933194
[18]	train-rmse:19.8793+0.104567	test-rmse:28.2003+0.911104
[20]	train-rmse:19.5409+0.0622652	test-rmse:28.052+0.902495
[22]	train-rmse:19.2648+0.0459628	test-rmse:27.9257+0.900756
[24]	train-rmse:19.0118+0.0676671	test-rmse:27.8465+0.890411
[26]	train-rmse:18.7304+0.0741378	test-rmse:27.7643+0.887743
[28]	train-rmse:18.4898+0.0752081	test-rmse:27.6664+0.873278
[30]	train-rmse:18.2086+0.0308173	test-rmse:27.5649+0.884311
[32]	train-rmse:18.0354+0.0158905	test-rmse

In [51]:
# Print the last row of the mean test RMSE column from cross-validation.
final_cv_rmse = cv_results["test-rmse-mean"].tail(1)
print(final_cv_rmse)

49    27.051424
Name: test-rmse-mean, dtype: float64


In [56]:
# Fit the final booster on the full training set for 50 boosting rounds.
xg_reg = xgb.train(params, data_dmatrix, num_boost_round=50)

In [57]:
# Predict a value for every row of the test matrix.
pred = xg_reg.predict(data=test_dmatrix)

In [70]:
# Pair each test id with its predicted price.
submission = pd.DataFrame(
    data={
        'id': test['id'],
        'price': pred,
    }
)

In [76]:
# Write the submission file without the row index. `index` is documented as
# a bool; the previous `index=None` only suppressed the index because None
# is falsy.
submission.to_csv("submission.csv", index=False)

In [73]:
# Display the first five rows of the submission frame.
submission.head(n=5)

Unnamed: 0,id,price
0,0,101.782936
1,1,41.472393
2,2,29.744003
3,3,31.308027
4,4,20.481169


In [69]:
# Display the first predicted value.
pred[0]

101.782936

In [77]:
# Final result on the leaderboard (LB): 23.99