In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.impute import SimpleImputer, KNNImputer
from category_encoders import TargetEncoder
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_validate
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import warnings
import spacy
from pandas.core.common import SettingWithCopyWarning
from transformers import pipeline
import csv
from transformers import BertTokenizer
from torch.utils.data import DataLoader, RandomSampler
import torch
from torch.utils.data import TensorDataset
from transformers import BertModel, AdamW
import torch.nn
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

# Task 3- BERT

In [3]:
# Read data and drop all non-US wines
wine_data_130 = pd.read_csv('wine-reviews/winemag-data-130k-v2.csv', index_col=0)
wine_data_130 = wine_data_130[wine_data_130['country'] == 'US'] 

In [4]:
#drop everything below a 95% condidence interval 
def remove_minor_outlier(df, columns):
    low = 0
    high = .95
    quant_df = df.quantile([low, high])
    for name in list(columns):
        df = df[(df[name] > quant_df.loc[low, name]) & (df[name] < quant_df.loc[high, name])]
    return df

In [5]:
def remove_cat_outlier(df, column):
    num_low_values = len([x for x in df[str(column)].dropna().value_counts() if x < 5])
    low_value_list = list(pd.Series(df[str(column)].dropna().value_counts()[-num_low_values:]).index)
    output = df[~df[column].isin(low_value_list)]
    return output

In [6]:
wine_data_130 = wine_data_130[wine_data_130['points'] <=97]
wine_data_130 = remove_minor_outlier(wine_data_130,['price'])
wine_data_130 = remove_cat_outlier(wine_data_130,'region_1')
wine_data_130 = remove_cat_outlier(wine_data_130,'province')
wine_data_130 = remove_cat_outlier(wine_data_130,'variety')
wine_data_130 = remove_cat_outlier(wine_data_130,'winery')
wine_data_130 = remove_cat_outlier(wine_data_130,'designation')
wine_data_130 = wine_data_130.drop(columns=['taster_twitter_handle'])
# Remove outliers and sample data, taking half of the dataset
wine_data_130 = wine_data_130.sample(frac=0.5)
points = wine_data_130['points']
X = wine_data_130.drop(columns=['country', 'points'])
X.head()

Unnamed: 0,description,designation,price,province,region_1,region_2,taster_name,title,variety,winery
9167,"A blend of five vineyards, some of which go in...",,26.0,California,Dry Creek Valley,Sonoma,Virginie Boone,Dashe Cellars 2012 Zinfandel (Dry Creek Valley),Zinfandel,Dashe Cellars
103984,Reticent dark fruit and licorice aromas are fo...,,39.0,Washington,Walla Walla Valley (WA),Columbia Valley,Sean P. Sullivan,Three Rivers 2014 Syrah (Walla Walla Valley (WA)),Syrah,Three Rivers
93142,"This is an orange citrus-tinged wine, fully la...",,25.0,California,Sonoma Coast,Sonoma,Virginie Boone,Rodney Strong 2014 Chardonnay (Sonoma Coast),Chardonnay,Rodney Strong
76177,"Mint, dill and parsley notes overlay the dark ...",Reserve,40.0,Virginia,Virginia,,,Gray Ghost 2009 Reserve Cabernet Sauvignon (Vi...,Cabernet Sauvignon,Gray Ghost
22944,"Almost jet black, this is a tough chewy wine w...",,28.0,Oregon,Rogue Valley,Southern Oregon,Paul Gregutt,South Stage 2012 Petite Sirah (Rogue Valley),Petite Sirah,South Stage


In [7]:
# Impute missing values
categorical = ['designation', 'province', 'region_1', 'region_2', 'variety', 'winery', 'taster_name']
continuous = ['price']
impute_categorical = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
impute_float = KNNImputer(missing_values=np.nan)

In [8]:
X[categorical] = impute_categorical.fit_transform(X[categorical])
X[continuous] = impute_float.fit_transform(X[continuous])

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, points, shuffle=True, random_state=0)
cv_strategy = KFold(n_splits=10, shuffle=True, random_state=0)

# Attempt 1: Use a transformers pipeline object to get embeddings for featurization

#### About the pipeline object
- Requires that you have Tensorflow or Pytorch already installed
- Does a couple of things for you in one step: Tokenize the input so that raw strings are mapped to tokens, gives you a model, and gives you some additional post-processing to control output

In [10]:
nlp = pipeline("feature-extraction", model='bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




## Save the BERT encodings
- Due to RAM issues, the embeddings were first saved to disk and then loaded from the CSV file

In [11]:
features_array = 768*['Feature']

In [12]:
writer = csv.writer(open("description_encoded_X_train.csv", 'w'))
writer.writerow(features_array)
for x in X_train['description']:
    row = nlp(x)[0][0]
    writer.writerow(row)

In [13]:
writer = csv.writer(open("description_encoded_X_test.csv", 'w'))
writer.writerow(features_array)
for x in X_test['description']:
    row = nlp(x)[0][0]
    writer.writerow(row)

In [14]:
writer = csv.writer(open("title_encoded_X_train.csv", 'w'))
writer.writerow(features_array)
for x in X_train['title']:
    row = nlp(x)[0][0]
    writer.writerow(row)

In [15]:
writer = csv.writer(open("title_encoded_X_test.csv", 'w'))
writer.writerow(features_array)
for x in X_test['title']:
    row = nlp(x)[0][0]
    writer.writerow(row)

## Load the encodings

In [10]:
X_train_desc = pd.read_csv('description_encoded_X_train.csv')
X_train_title = pd.read_csv('title_encoded_X_train.csv')
X_test_desc = pd.read_csv('description_encoded_X_test.csv')
X_test_title = pd.read_csv('title_encoded_X_test.csv')

In [11]:
total_encoded_train = np.concatenate((X_train_desc, X_train_title), axis=1)
total_encoded_test = np.concatenate((X_test_desc, X_test_title), axis=1)

## Regression results when embedding the 'description' and 'title' features as separate features

In [18]:
param_grid = {'alpha': np.logspace(-3,3,15)}
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
ridge_grid.fit(total_encoded_train, y_train)
ridge_grid.best_score_

0.6653506149036461

In [19]:
param_grid = {'alpha': np.logspace(-3,3,15)}
lasso_grid = GridSearchCV(Lasso(max_iter=5000), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
lasso_grid.fit(total_encoded_train, y_train)
lasso_grid.best_score_

0.6633335756068919

 ## Use the embedded text features + the non-text feature

In [12]:
X_train_no_text = X_train.drop(columns=['description', 'title'])
to_target_encode = ['designation', 'winery', 'variety', 'region_1', 'region_2']
to_ohe = ['province', 'taster_name']
to_scale = ['price']
# Create column transformer
ohe = make_pipeline(preprocessing.OneHotEncoder(handle_unknown='ignore'))
scale = make_pipeline(preprocessing.StandardScaler())
te = make_pipeline(TargetEncoder(), preprocessing.StandardScaler())
preprocessing_pipe = make_column_transformer((ohe, to_ohe),
                                             (te, to_target_encode),
                                             (scale, to_scale),
                                             remainder='passthrough')

In [13]:
X_train_no_text_processed = pd.DataFrame(preprocessing_pipe.fit_transform(X_train_no_text, y_train).toarray())
X_train_encoded = np.concatenate((total_encoded_train, X_train_no_text_processed), axis=1)

In [22]:
param_grid = {'alpha': np.logspace(-3,3,15)}
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
ridge_grid.fit(X_train_encoded, y_train)
ridge_grid.best_score_

0.7504268025611699

In [23]:
param_grid = {'alpha': np.logspace(-3,3,15)}
lasso_grid = GridSearchCV(Lasso(max_iter=5000), param_grid, cv=cv_strategy, n_jobs=-1, return_train_score=True)
lasso_grid.fit(X_train_encoded, y_train)
lasso_grid.best_score_

0.7516354025631206

### How does this model compare to a BoW model, and how does it compare to a model using all features?

Compared to the BoW model, using the embeddings from the pipeline did not seem to add any significant improvement. Combining the embedded features with the non-text features however, allowed for significant improvement.

# Attempt 2: Fine-tune a BERT model on the text data alone using the transformers library.
- We tried to take the embeddings from a pretrained BERT model and train an MLP for the regression task using a PyTorch framework
- This was done by tokenizing the input using a Bert Tokenizer, first training the model with the BERT layers frozen, and then unfreezing all layers and training
- Note: For simplicity, only the 'description' column was used

In [0]:
def tokenize_data(input_data, max_length):
    id_list = []
    mask_list = []
    for x in input_data:
        encoded_dict = tokenizer.encode_plus(
                          x,                     
                          add_special_tokens = True, 
                          max_length = max_length,          
                          pad_to_max_length = True,
                          return_attention_mask = True, 
                          return_tensors = 'pt',    
                    )
        id_list.append(encoded_dict['input_ids'])
        mask_list.append(encoded_dict['attention_mask'])
    return id_list, mask_list

### Load a BERT Tokenizer

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




### Determine the maximum tokenized length

In [15]:
max_len = 0
# For each training sample, tokenize appropriately (adding special values)
# And determine max length. Will be used for padding inputs
for x in X_train['description']:
    input_ids = tokenizer.encode(x, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print('Max length: ', max_len)

Max length:  188


### Tokenize data and convert to PyTorch tensors for training

In [0]:
# Tokenze input data and get the corresponding masks
train_ids, train_masks = tokenize_data(X_train['description'], max_len)
test_ids, test_masks = tokenize_data(X_test['description'], max_len)

desc_ids_tensor_test = torch.cat(test_ids, dim=0)
desc_masks_test = torch.cat(test_masks, dim=0)

desc_ids_tensor = torch.cat(train_ids, dim=0)
desc_masks = torch.cat(train_masks, dim=0)
y_train_tensor = torch.tensor(np.array(y_train))

### Create PyTorch DataSets and DataLoaders for training and testing

In [0]:
batch_size = 16
testing_dataset = TensorDataset(desc_ids_tensor_test, desc_masks_test)
training_dataset = TensorDataset(desc_ids_tensor, desc_masks, y_train_tensor)
test_dataloader = DataLoader(testing_dataset, batch_size = 1)
train_dataloader = DataLoader(training_dataset, batch_size = batch_size )

### Load a pretrained BERT Model for training

In [18]:
bert_model = BertModel.from_pretrained("bert-base-uncased", 
                                  output_hidden_states=False,
                                  output_attentions=False)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

### Define an MSE Loss Function

In [0]:
loss_func = torch.nn.MSELoss()
device = torch.device('cuda:0')

### Take the BERT Model embeddings and input them into a simple MLP, trained for regression¶
- This is done through defining a bert_plus_mlp pytorch model

In [0]:
class bert_plus_mlp(torch.nn.Module):
    def __init__(self):
        super(bert_plus_mlp, self).__init__()
        self.bert = bert_model
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(max_len, 100),
            torch.nn.ReLU(),
            torch.nn.Linear(100, 50),
            torch.nn.ReLU(),
            torch.nn.Linear(50, 1),
        )

    # The last layer hidden states from BERT are used as embeddings/semantic 
    # representation for the input and therefore, passed into the MLP
    def forward(self, x, mask):
        bert_output = self.bert(x, attention_mask = mask)
        output_average = torch.mean(bert_output[0], axis=2).cuda()
        out = self.layers(output_average)
        return out

### Initialize model and set parameters for MLP optimization

In [0]:
bert_mlp = bert_plus_mlp()

### Initially train by freezing all BERT layers, only training the MLP

In [54]:
for child in bert_mlp.children():
    for name, param in child.named_parameters():
        if 'bert' in name:
            param.requires_grad = False

comb_optimizer = AdamW(bert_mlp.parameters(), lr = 2e-5, eps = 1e-8)
bert_mlp.cuda() 

bert_plus_mlp(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

### Train for 20 epochs with BERT layers frozen

In [55]:
epochs = 20
for e in range(0, epochs):
    bert_mlp.train()
    current_loss = 0
    for i, (X, mask, y) in enumerate(train_dataloader):
        comb_optimizer.zero_grad()
        if y.shape[0] > 1:
            y = y.view(16, 1)
        # To GPU
        X = X.to(device)
        y = y.to(device)
        mask = mask.to(device)

        y = y.type(torch.cuda.FloatTensor)   
        output = bert_mlp(X, mask)
        loss = loss_func(output, y)

        loss = loss.type(torch.cuda.FloatTensor)
        loss.backward()
        comb_optimizer.step()
        current_loss += loss.item()
        torch.cuda.empty_cache()
    print(f'Epoch: {e+1}, Loss: {current_loss/len(train_dataloader)}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1, Loss: 7770.831870897282
Epoch: 2, Loss: 7721.926086786936
Epoch: 3, Loss: 7617.852613460383
Epoch: 4, Loss: 7436.977273252589
Epoch: 5, Loss: 7175.941102157683
Epoch: 6, Loss: 6831.741350343242
Epoch: 7, Loss: 6400.370312211076
Epoch: 8, Loss: 5897.066583215837
Epoch: 9, Loss: 5330.642668413692
Epoch: 10, Loss: 4713.501822386973
Epoch: 11, Loss: 4061.599170210799
Epoch: 12, Loss: 3393.2130506165636
Epoch: 13, Loss: 2730.148662860577
Epoch: 14, Loss: 2096.071259989541
Epoch: 15, Loss: 1516.1360047470182
Epoch: 16, Loss: 1013.4971598788832
Epoch: 17, Loss: 607.444299957456
Epoch: 18, Loss: 314.6009070718077
Epoch: 19, Loss: 134.07436465088432
Epoch: 20, Loss: 46.02292917748175


### Unfreeze BERT layers and train for a few more epochs

In [0]:
for child in bert_mlp.children():
    for name, param in child.named_parameters():
        if 'bert' in name:
            param.requires_grad = True

In [57]:
epochs = 5
for e in range(0, epochs):
  # Put model in training mode
    bert_mlp.train()
    current_loss = 0
    for i, (X, mask, y) in enumerate(train_dataloader):
        comb_optimizer.zero_grad()
        if y.shape[0] > 1:
            y = y.view(16, 1)
        # To GPU
        X = X.to(device)
        y = y.to(device)
        mask = mask.to(device)

        y = y.type(torch.cuda.FloatTensor)   
        output = bert_mlp(X, mask)
        loss = loss_func(output, y)

        loss = loss.type(torch.cuda.FloatTensor)
        loss.backward()
        comb_optimizer.step()
        current_loss += loss.item()
        torch.cuda.empty_cache()
    
    print(f'Epoch: {e+1}, Loss: {current_loss/len(train_dataloader)}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1, Loss: 16.4684062377941
Epoch: 2, Loss: 10.662013383659385
Epoch: 3, Loss: 10.08172127481043
Epoch: 4, Loss: 10.044855493412921
Epoch: 5, Loss: 10.045774489464845


### Evaluate trained model on test data

In [0]:
test_outputs = []
for i, (X, mask, y) in enumerate(test_dataloader):
    comb_optimizer.zero_grad()
    X = X.to(device)
    mask = mask.to(device)
    output = bert_mlp(X, mask)   
    test_outputs.append(output.item())

In [77]:
print(f'MSE Error: {mean_squared_error(y_test, test_outputs)}')

MSE Error: 9.958164561903569
