Use a pretrained word-embedding (word2vec, glove or fasttext) for featurization instead of the
bag-of-words model. Does this improve classification? How about combining the embedded
words with the BoW model?

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wjdos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from category_encoders import TargetEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer

In [19]:
df = pd.read_csv('winemag-data-130k-v2.csv')
df = df[df['country'] == 'US']
df_sample = df.sample(1000)


In [7]:
import spacy
import en_core_web_lg

nlp = spacy.load('en_core_web_lg', disable = ['tagger', 'parser', 'ner'])

In [18]:
X_col = ['price','designation', 'province', 'region_1','region_2','variety', 'taster_name',
        'text']
cont = ['price']
cat = ['designation', 'province', 'region_1','region_2','variety', 'taster_name']
text = ['text']

df_sample["text"] =  df_sample['description']+ df_sample['title'] + df_sample['winery']

y = df_sample['points']
X = df_sample.loc[:, X_col]


X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

        price            designation    province              region_1  \
26502    18.0   Estate Grown Rosé of  California  Russian River Valley   
29465    18.0                    NaN  Washington  Columbia Valley (WA)   
61303    12.0                    NaN  California                  Lodi   
128905   30.0    Zena Crown Vineyard      Oregon      Eola-Amity Hills   
25468    32.0                    NaN  California           Edna Valley   
...       ...                    ...         ...                   ...   
35993    38.0        Smith Vineyards  California      Dry Creek Valley   
115025   14.0               Demi-Sec     America                   NaN   
98923    50.0           Cuvée Moriah  California         Sonoma County   
44347    40.0  Tephra Ridge Vineyard  California           Lake County   
68732    48.0   Essence Estate Grown  California           Napa Valley   

                 region_2                variety     taster_name  \
26502              Sonoma             Pinot

### Cleaning (punctuations, stopwords)

In [9]:
import re
from nltk.corpus import stopwords

text_train = X_train.text.str.lower()
text_test = X_test.text.str.lower()

#remove punctuations
text_train = text_train.apply((lambda x : re.sub("[^a-z0-9\s]","",x)) )
text_test = text_test.apply((lambda x : re.sub("[^a-z0-9\s]","",x)) )

#remove stopwords
stopwords = set(stopwords.words('english'))
text_train = text_train.apply(lambda x : " ".join(w for w in x.split() if w not in stopwords))
text_test = text_test.apply(lambda x : " ".join(w for w in x.split() if w not in stopwords))

## Vectorize 

In [20]:
docs_train_text =  [nlp(d).vector for d in text_train]
docs_test_text = [nlp(d).vector for d in text_test]


X_train_word2vec = np.vstack(docs_train_text)
X_test_word2vec = np.vstack(docs_test_text)


X_train_word2vec = pd.DataFrame(X_train_word2vec)
X_train_word2vec.index = X_train.index

# print(X_train_word2vec)

X_test_word2vec = pd.DataFrame(X_test_word2vec)
X_test_word2vec.index = X_test.index

X_train_word2vec.columns = ['text'+str(i) for i in range(300)]
X_test_word2vec.columns = ['text'+str(i) for i in range(300)]

print(X_train_word2vec)

           text0     text1     text2     text3     text4     text5     text6  \
20419  -0.155254  0.146008 -0.070758 -0.215490  0.125964  0.042638  0.065526   
46710  -0.104202  0.204439 -0.097630 -0.225781  0.174469  0.039809  0.073507   
80685  -0.070993  0.014188  0.072261 -0.233255  0.121731  0.148858  0.021181   
2900   -0.032034 -0.015748 -0.034118 -0.039829  0.148092  0.081990  0.060652   
88860  -0.159887  0.166690  0.012707 -0.202276  0.169835  0.157067  0.077961   
...          ...       ...       ...       ...       ...       ...       ...   
5738   -0.077434  0.133818  0.021649 -0.313426  0.190623  0.198321  0.061229   
109737  0.006986  0.119540  0.045174 -0.036015  0.219111  0.015925 -0.060145   
51247  -0.149211  0.115970  0.009654 -0.072114  0.085058  0.011927  0.016982   
100687  0.033715  0.103318 -0.028462 -0.155140  0.177353  0.243839 -0.044566   
48960  -0.085270  0.130056  0.068531 -0.162478  0.152720  0.120353 -0.020313   

           text7     text8     text9  .

## Word Embedding only

In [9]:
print("Linear Regression mean cv score with text data (Word Embedding only) :", np.mean(cross_val_score(LinearRegression(), X_train_word2vec, y_train, cv = 5)))

Linear Regression mean cv score with text data (Word Embedding only) : 0.49388929483317956


## Word Embedding + cont&cat features

In [21]:
X_train = X_train.join(X_train_word2vec)
X_test = X_test.join(X_test_word2vec)

       price                           designation    province  \
20419   40.0  Estate Wine Middleton Petty Vineyard  Washington   
46710   21.0                           Crazy Creek  California   
80685   24.0                 The Hill and the Vale  California   
2900    40.0                         Amber's Cuvee  California   
88860   60.0                          La Rinconada  California   

                      region_1         region_2             variety  \
20419  Walla Walla Valley (WA)  Columbia Valley              Merlot   
46710         Alexander Valley           Sonoma  Cabernet Sauvignon   
80685            Sonoma County           Sonoma           Zinfandel   
2900      Santa Cruz Mountains    Central Coast     Sparkling Blend   
88860          Sta. Rita Hills    Central Coast          Pinot Noir   

            taster_name                                               text  \
20419  Sean P. Sullivan  The aromas show notes of chocolate and raspber...   
46710               

In [11]:
#drop description and title in original dataframe
#cat+cont and Wordembedding method

X_train_we = X_train.drop(text,axis = 1)
X_test_we =X_test.drop(text, axis=1)

cat_preprocessing = make_pipeline(
    SimpleImputer(strategy ='constant', fill_value = 'NA'),
    OneHotEncoder(handle_unknown = 'ignore')
    )

cont_preprocessing = make_pipeline(
    SimpleImputer(),
    StandardScaler()
    )

preprocess = ColumnTransformer(
    transformers = [
        ('cat_preprocessing', cat_preprocessing, cat),
        ('cont_preprocessing', cont_preprocessing, cont)
    ],
    remainder = 'passthrough'
)

OLS_pipe = make_pipeline(preprocess, LinearRegression())
print("Linear Regression mean cv score with Word Embedding only :", np.mean(cross_val_score(OLS_pipe, X_train_we, y_train, cv = 5)))

Linear Regression mean cv score with Word Embedding only : 0.40997952075468724


## Word Embedding with BoW(CountVectorizer) + cont/cat features

In [22]:
#Word embedding with BoW on text data
#Linear Regression
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy ='constant', fill_value = 'NA'),
    OneHotEncoder(handle_unknown = 'ignore')
    )

cont_preprocessing = make_pipeline(
    SimpleImputer(),
    StandardScaler()
    )

text_preprocessing = make_pipeline(
    CountVectorizer(ngram_range=(1,3))
    )

preprocess = make_column_transformer(
    (cont_preprocessing, cont),
    (cat_preprocessing, cat),
    (text_preprocessing, 'text'))



OLS_pipe = make_pipeline(preprocess, LinearRegression())
print("Linear Regression mean cv score:", np.mean(cross_val_score(OLS_pipe, X_train, y_train, cv = 5)))

Linear Regression mean cv score: 0.5615841091261851


## Word Embedding with BoW(TfidfVectorizer) + cont/cat features

In [13]:
#Word embedding with BoW on text data
#Linear Regression
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy ='constant', fill_value = 'NA'),
    OneHotEncoder(handle_unknown = 'ignore')
    )

cont_preprocessing = make_pipeline(
    SimpleImputer(),
    StandardScaler()
    )

text_preprocessing = make_pipeline(
    TfidfVectorizer(ngram_range=(1,3))
    )

preprocess = make_column_transformer(
    (cont_preprocessing, cont),
    (cat_preprocessing, cat),
    (text_preprocessing, 'text')
)


OLS_pipe = make_pipeline(preprocess, LinearRegression())
print("Linear Regression mean cv score:", np.mean(cross_val_score(OLS_pipe, X_train, y_train, cv = 5)))

Linear Regression mean cv score: 0.5660923944673193


In [14]:
#Ridge Regression
from sklearn.linear_model import Ridge

Ridge_pipe = make_pipeline(preprocess, Ridge())

print("Ridge mean cv score :", np.mean(cross_val_score(Ridge_pipe, X_train, y_train, cv = 5)))

Ridge mean cv score : 0.5434777633630656


## Running on the whole data

In [15]:
from nltk.corpus import stopwords

X_col = ['price','designation', 'province', 'region_1','region_2','variety', 'taster_name',
        'text']
cont = ['price']
cat = ['designation', 'province', 'region_1','region_2','variety', 'taster_name']
text = ['text']

df["text"] =  df['description']+ df['title'] + df['winery']

y = df['points']
X = df.loc[:, X_col]



X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

text_train = X_train.text.str.lower()
text_test = X_test.text.str.lower()

#remove punctuations
text_train = text_train.apply((lambda x : re.sub("[^a-z0-9\s]","",x)) )
text_test = text_test.apply((lambda x : re.sub("[^a-z0-9\s]","",x)) )

#remove stopwords
stopwords = set(stopwords.words('english'))
text_train = text_train.apply(lambda x : " ".join(w for w in x.split() if w not in stopwords))
text_test = text_test.apply(lambda x : " ".join(w for w in x.split() if w not in stopwords))

docs_train_text =  [nlp(d).vector for d in text_train]
docs_test_text = [nlp(d).vector for d in text_test]

X_train_word2vec = np.vstack(docs_train_text)
X_test_word2vec = np.vstack(docs_test_text)

X_train_word2vec = pd.DataFrame(X_train_word2vec)
X_train_word2vec.index = X_train.index

X_test_word2vec = pd.DataFrame(X_test_word2vec)
X_test_word2vec.index = X_test.index

X_train_word2vec.columns = ['text'+str(i) for i in range(300)]
X_test_word2vec.columns = ['text'+str(i) for i in range(300)]

cat_preprocessing = make_pipeline(
    SimpleImputer(strategy ='constant', fill_value = 'NA'),
    OneHotEncoder(handle_unknown = 'ignore')
    )

cont_preprocessing = make_pipeline(
    SimpleImputer(),
    StandardScaler()
    )

text_preprocessing = make_pipeline(
    CountVectorizer(ngram_range=(1,3))
    )

preprocess = make_column_transformer(
    (cont_preprocessing, cont),
    (cat_preprocessing, cat),
    (text_preprocessing, 'text')
)

Ridge_pipe = make_pipeline(preprocess, Ridge())

print("Ridge mean on whole data cv score :", np.mean(cross_val_score(Ridge_pipe, X_train, y_train, cv = 5)))

Ridge mean on whole data cv score : 0.7848556793672598
