In [296]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
import numpy as np
import time
import re
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [297]:
# Read the labelled training rows and the rows that still need a prediction.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [298]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
4,6,Davin de Kergommeaux,"After 40 years in barrels, the trademark Canad...",199.0,96,45.0,


In [299]:
# Discard every training row that has at least one missing value.
train = train.dropna()

In [300]:
# Re-inspect the training data after the rows with missing values were dropped.
train.head()

Unnamed: 0,id,author,description,price,ratingValue,pert_alcohol,category
0,1,John Hansell,A marriage of 13 and 18 year old bourbons. A m...,85.0,97,51.5,2.0
1,2,Dave Broom,There have been some legendary Bowmores from t...,13500.0,97,42.9,1.0
2,3,John Hansell,This bottling celebrates master distiller Park...,150.0,97,50.0,2.0
3,4,John Hansell,What impresses me most is how this whisky evol...,4500.0,97,40.5,1.0
5,9,Fred Minnick,"A caramel-laden fruit bouquet, followed by une...",150.0,96,54.49,2.0


In [301]:
# Test features: everything except the row identifier.
testX = test.drop(columns=['id'])

In [302]:
from sklearn import preprocessing

In [303]:
# One independent encoder per column that gets integer-coded (author, ratingValue).
leauthor, lerating = preprocessing.LabelEncoder(), preprocessing.LabelEncoder()

In [304]:
# Features: every training column except the target and the row identifier.
trainX = train.drop(columns=['category', 'id'])
# Target: the category label, cast to int.
trainY = train['category'].astype(int)

In [305]:
# Fit both encoders on the training columns only; the test frame is not seen here.
leauthor.fit(trainX['author'])
# Last expression of the cell: its return value is echoed as the cell output.
lerating.fit(trainX['ratingValue'])

LabelEncoder()

In [306]:
# Replace the raw author / ratingValue values with the codes learned on the training data,
# first in the training frame and then in the test frame.
for frame in (trainX, testX):
    frame['author'] = leauthor.transform(frame['author'])
    frame['ratingValue'] = lerating.transform(frame['ratingValue'])

In [307]:
# Check the result of the encoding step: author and ratingValue now hold integer codes.
trainX.head()

Unnamed: 0,author,description,price,ratingValue,pert_alcohol
0,8,A marriage of 13 and 18 year old bourbons. A m...,85.0,30,51.5
1,1,There have been some legendary Bowmores from t...,13500.0,30,42.9
2,8,This bottling celebrates master distiller Park...,150.0,30,50.0
3,8,What impresses me most is how this whisky evol...,4500.0,30,40.5
5,4,"A caramel-laden fruit bouquet, followed by une...",150.0,29,54.49


In [308]:
# Number of single-space-separated pieces in each description (value is stringified first).
for frame in (trainX, testX):
    frame['word_count'] = frame['description'].apply(lambda text: len(str(text).split(" ")))

In [309]:
# Inspect the training features now that word_count has been added.
trainX.head()

Unnamed: 0,author,description,price,ratingValue,pert_alcohol,word_count
0,8,A marriage of 13 and 18 year old bourbons. A m...,85.0,30,51.5,60
1,1,There have been some legendary Bowmores from t...,13500.0,30,42.9,84
2,8,This bottling celebrates master distiller Park...,150.0,30,50.0,124
3,8,What impresses me most is how this whisky evol...,4500.0,30,40.5,82
5,4,"A caramel-laden fruit bouquet, followed by une...",150.0,29,54.49,58


In [310]:
# Length of each description in characters; spaces are counted as well.
for frame in (trainX, testX):
    frame['char_count'] = frame['description'].str.len()
trainX[['description', 'char_count']].head()

Unnamed: 0,description,char_count
0,A marriage of 13 and 18 year old bourbons. A m...,361
1,There have been some legendary Bowmores from t...,503
2,This bottling celebrates master distiller Park...,824
3,What impresses me most is how this whisky evol...,495
5,"A caramel-laden fruit bouquet, followed by une...",415


In [311]:
def avg_word(sentence):
    """Return the mean length, in characters, of the words in `sentence`.

    Words are the pieces produced by ``sentence.split()`` (split on any run of
    whitespace).

    Parameters
    ----------
    sentence : str
        Text to measure.

    Returns
    -------
    float
        Total characters across all words divided by the number of words, or
        0.0 when `sentence` holds no words (empty or whitespace-only) so that
        such rows do not raise ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)



In [312]:
# Mean word length per description, computed by avg_word for both frames.
for frame in (trainX, testX):
    frame['avg_word'] = frame['description'].apply(avg_word)
trainX[['description', 'avg_word']].head()

Unnamed: 0,description,avg_word
0,A marriage of 13 and 18 year old bourbons. A m...,5.033333
1,There have been some legendary Bowmores from t...,5.0
2,This bottling celebrates master distiller Park...,5.6
3,What impresses me most is how this whisky evol...,5.04878
5,"A caramel-laden fruit bouquet, followed by une...",6.172414


In [313]:
from nltk.corpus import stopwords
# English stop words from NLTK; used below to count stop words per description.
stop = stopwords.words('english')



In [314]:
# Count, per description, the whitespace-separated tokens that appear in `stop`.
# The comparison is case-sensitive, exactly as the tokens occur in the text.
# A set is built once so each membership test is a hash lookup rather than a scan
# of the whole stop-word collection for every token.
stop_set = set(stop)
for frame in (trainX, testX):
    frame['stopwords'] = frame['description'].apply(
        lambda text: sum(1 for token in text.split() if token in stop_set)
    )
trainX[['description', 'stopwords']].head()

Unnamed: 0,description,stopwords
0,A marriage of 13 and 18 year old bourbons. A m...,19
1,There have been some legendary Bowmores from t...,35
2,This bottling celebrates master distiller Park...,34
3,What impresses me most is how this whisky evol...,26
5,"A caramel-laden fruit bouquet, followed by une...",10


In [315]:
# Count, per description, the whitespace-separated tokens made up solely of digits.
for frame in (trainX, testX):
    frame['numerics'] = frame['description'].apply(
        lambda text: len([token for token in text.split() if token.isdigit()])
    )
trainX[['description', 'numerics']].head()

Unnamed: 0,description,numerics
0,A marriage of 13 and 18 year old bourbons. A m...,2
1,There have been some legendary Bowmores from t...,0
2,This bottling celebrates master distiller Park...,2
3,What impresses me most is how this whisky evol...,0
5,"A caramel-laden fruit bouquet, followed by une...",0


In [316]:
# Count, per description, the whitespace-separated tokens for which str.isupper() is true.
for frame in (trainX, testX):
    frame['upper'] = frame['description'].apply(
        lambda text: len([token for token in text.split() if token.isupper()])
    )
trainX[['description', 'upper']].head()

Unnamed: 0,description,upper
0,A marriage of 13 and 18 year old bourbons. A m...,3
1,There have been some legendary Bowmores from t...,2
2,This bottling celebrates master distiller Park...,2
3,What impresses me most is how this whisky evol...,0
5,"A caramel-laden fruit bouquet, followed by une...",1


In [317]:
# Raw description text of each split, kept aside as input for the TF-IDF vectoriser.
trainX1, testX1 = trainX['description'], testX['description']

In [318]:
def tokenizer(text):
    """Split `text` into lower-case alphabetic tokens.

    The text is lower-cased and every run of two or more letters a-z is
    returned, in order of appearance. Falsy input (empty string, None) yields
    an empty list.
    """
    if not text:
        return []
    return re.findall('[a-z]{2,}', text.lower())

In [319]:
# TF-IDF over the descriptions, using the custom tokenizer defined above plus
# scikit-learn's built-in English stop-word list.
vect = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
start = time.time()
# fit_transform learns the vocabulary and vectorises the training text in a
# single pass, instead of tokenising the training corpus twice (fit + transform).
X_train_vect = vect.fit_transform(trainX1)
# The test text is only transformed, using the vocabulary learned on the training text.
X_test_vect = vect.transform(testX1)
end = time.time()
print('Time to train vectorizer and transform training text: %0.2fs' % (end - start))

Time to train vectorizer and transform training text: 0.97s


In [320]:
# Materialise the sparse TF-IDF matrices as dense arrays for the DataFrame step below.
# toarray() yields a plain ndarray, whereas todense() yields the legacy np.matrix type;
# shape and values are the same.
vector = X_train_vect.toarray()
test_vector = X_test_vect.toarray()

In [321]:
# Show (rows, vocabulary size) for the train matrix, then for the test matrix.
for matrix in (vector, test_vector):
    print(matrix.shape)

(2476, 8466)
(288, 8466)


In [322]:
# Wrap both dense TF-IDF matrices in DataFrames so they can be concatenated with the features.
vector, test_vector = pd.DataFrame(vector), pd.DataFrame(test_vector)

In [323]:
# Give all four frames a fresh 0..n-1 index so the column-wise concat lines rows up
# by position. The previous index is kept as an 'index' column in each frame.
trainX, testX = trainX.reset_index(), testX.reset_index()
vector, test_vector = vector.reset_index(), test_vector.reset_index()

In [324]:
# Place the TF-IDF columns to the right of the hand-built features, for each split.
newTrain = pd.concat([trainX, vector], axis='columns')
newTest = pd.concat([testX, test_vector], axis='columns')

In [325]:
# Inspect the combined frame: engineered features followed by the TF-IDF columns.
newTrain.head()

Unnamed: 0,index,author,description,price,ratingValue,pert_alcohol,word_count,char_count,avg_word,stopwords,...,8456,8457,8458,8459,8460,8461,8462,8463,8464,8465
0,0,8,A marriage of 13 and 18 year old bourbons. A m...,85.0,30,51.5,60,361,5.033333,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,There have been some legendary Bowmores from t...,13500.0,30,42.9,84,503,5.0,35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,8,This bottling celebrates master distiller Park...,150.0,30,50.0,124,824,5.6,34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,8,What impresses me most is how this whisky evol...,4500.0,30,40.5,82,495,5.04878,26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,4,"A caramel-laden fruit bouquet, followed by une...",150.0,29,54.49,58,415,6.172414,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [326]:
# Remove the raw text and every column labelled 'index' (left over from reset_index),
# keeping only the model inputs.
X_train = newTrain.drop(columns=['description', 'index'])
X_test = newTest.drop(columns=['description', 'index'])

In [327]:
# Convert both feature frames to plain NumPy arrays before fitting.
X_train, X_test = np.array(X_train), np.array(X_test)

In [328]:
# Fit a gradient-boosted tree classifier on the training matrix and time the fit.
start = time.time()
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=50, learning_rate=0.03)
gbm.fit(X_train, trainY)
end = time.time()
print('Time to train model: %0.2fs' % (end - start))

Time to train model: 171.20s


In [329]:
# Predict a category for every test row and report how long inference took.
start = time.time()
y_pred = gbm.predict(X_test)
end = time.time()
# This cell times prediction, not training, so the message says so.
print('Time to predict: %0.2fs' % (end - start))


Time to train model: 1.68s


  if diff:


In [330]:
# Attach the predictions positionally. Assigning the array directly avoids the
# index alignment that a DataFrame wrapper triggers, which would silently yield
# NaN wherever test's index is not exactly 0..n-1.
test['category'] = y_pred

In [331]:
# Submission frame: test without the input feature columns.
sub = test.drop(columns=['author', 'description', 'price', 'ratingValue', 'pert_alcohol'])

In [332]:
# Write the submission to disk, leaving the DataFrame index out of the file.
sub.to_csv('sub2.csv',index=False)