<p style="font-size:30px; text-align:center; line-height:120%">
    <br> 
        <b>
        COMS 4995 Applied ML
            Homework 4 
        <br></br>
            Predicting Wine Quality: Task 1
        <br></br>
        </b> 
    <br> 
</p>
<p style="font-size:18px; text-align:left; line-height:120%">
    <br> 
        <b>
        Kirit Dhillon, Sagar Lal
        </b>
    <br> 
        <b>
        Uni: ksd2142, sl3946
        </b>
</p>

In [20]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse
warnings.filterwarnings('ignore')
%matplotlib inline

### Data Loading and Exploration

In [3]:
# Read the wine-review CSV and discard the two taster-identity columns
# ("Taster Name" and "Taster Twitter Handle"), which this analysis treats as uninformative.
data = (
    pd.read_csv("winemag-data-130k-v2.csv")
    .drop(columns=['taster_name', 'taster_twitter_handle'])
)

In [4]:
# Schema overview: per-column dtypes first, then the overall (rows, columns) shape.
print(data.dtypes)
n_rows, n_cols = data.shape
print("Shape of entire dataset: ", (n_rows, n_cols))

Unnamed: 0       int64
country         object
description     object
designation     object
points           int64
price          float64
province        object
region_1        object
region_2        object
title           object
variety         object
winery          object
dtype: object
Shape of entire dataset:  (129971, 12)


### Setup Data

In [5]:
from sklearn.model_selection import train_test_split

# Split before separating text and non-text data so both views share the same rows
# and can be joined again later. A fixed random_state keeps the split, and therefore
# every score reported further down, reproducible across kernel restarts.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    data.drop(['points'], axis=1), data['points'], random_state=42)

In [6]:
# Text view: only the free-form review description.
text_X_trainval, text_X_test = X_trainval['description'], X_test['description']

# Non-text view: everything except the description and the leftover 'Unnamed: 0'
# column from the CSV, re-indexed from 0 so rows can later be matched by position.
non_text_X_trainval = X_trainval.drop(columns=['description', 'Unnamed: 0']).reset_index(drop=True)
non_text_X_test = X_test.drop(columns=['description', 'Unnamed: 0']).reset_index(drop=True)

In [12]:
# Debugging: print the shapes of every split so matching row counts can be checked by eye.
for label, *shapes in (
    ("[Text] X_trainval: \t", text_X_trainval.shape, y_trainval.shape),
    ("[Text] X_test: \t\t", text_X_test.shape, y_test.shape),
    ("[Non-Text] X_trainval: \t", non_text_X_trainval.shape),
    ("[Non-Text] X_test: \t", non_text_X_test.shape),
):
    print(label, *shapes)

[Text] X_trainval: 	 (97478,) (97478,)
[Text] X_test: 		 (32493,) (32493,)
[Non-Text] X_trainval: 	 (97478, 9)
[Non-Text] X_test: 	 (32493, 9)


## 1.1: Baseline Model using only Non-Text Features

#### Preprocessing:
- Imputation of missing values for both categorical and continuous non-text features
- OHE for categorical non-text features, StandardScaler for continuous non-text features

#### Model(s) tried: 
- Ridge with 5-fold cross-validation
- Ridge with 5-fold cross-validation and grid search on alpha

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [14]:
# Column groups used by the non-text preprocessing: the numeric group is imputed
# and scaled, the string group is imputed and one-hot encoded.
nt_continuous_features = ['price']
nt_categorical_features = [
    'country',
    'title',
    'designation',
    'province',
    'region_1',
    'region_2',
    'variety',
    'winery',
]

In [15]:
# Categorical columns: fill gaps with the literal "missing", then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
base_categorical_transformer = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Continuous columns: fill gaps with the column median, then standardise.
base_continuous_transformer = Pipeline(steps=[
    ('simpleimputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
])

# Route each column group to its transformer. Columns not listed in either
# group are not named here and are handled by ColumnTransformer's default.
base_continuous_and_categorical_preprocessor = ColumnTransformer(transformers=[
    ('num', base_continuous_transformer, nt_continuous_features),
    ('cat', base_categorical_transformer, nt_categorical_features),
])

In [19]:
%%time
# Baseline Ridge CV only
base_ridge_clf = Pipeline(steps=[('preprocessor', base_continuous_and_categorical_preprocessor),
                                 ('regressor', Ridge())])

base_ridge_scores = cross_val_score(base_ridge_clf, non_text_X_trainval, y_trainval, cv=5)

print("Baseline Ridge with only CV: %.2f" % np.mean(base_ridge_scores))

Baseline Ridge with only CV: 0.47
CPU times: user 4min 14s, sys: 1.65 s, total: 4min 15s
Wall time: 4min 21s


In [11]:
%%time
# Baseline Ridge GridSearchCV 
ridge_pipeline = Pipeline(steps=[('preprocessor', base_continuous_and_categorical_preprocessor),
                                ('regressor', Ridge())])

param_grid =  {
               'regressor__alpha': [0.01, 0.1, 1, 10, 100]
              }
grid = GridSearchCV(ridge_pipeline, param_grid, cv=5, return_train_score=True)
grid.fit(non_text_X_trainval, y_trainval)

print(("Baseline Ridge score with GridSearchCV: %.2f"
       %grid.score(non_text_X_test, y_test)))
print("Best param:", grid.best_params_)

Baseline Ridge score with GridSearchCV 0.53
Best param: {'regressor__alpha': 1}
CPU times: user 5min 9s, sys: 1min 19s, total: 6min 28s
Wall time: 4min 6s


#### Analysis: 
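- The grid-searched Ridge scores 0.53 on the held-out test set, while the plain Ridge averages 0.47 over 5-fold cross-validation on the train/validation split. These two numbers are not directly comparable: the grid search selected `alpha = 1`, which is also Ridge's default, so both runs use the same model and the gap reflects test-set versus cross-validation scoring rather than a benefit from tuning.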

## 1.2: Text-based Model based on BOW and a Linear Model

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Bag-of-words counts, limited to 1,000 features with English stop words removed.
bow_vect = CountVectorizer(
    stop_words="english",
    max_features=1000,
)

# The vocabulary is learned from the train/validation descriptions only and is
# then applied unchanged to the test descriptions.
X_trainval_bow = bow_vect.fit_transform(raw_documents=text_X_trainval)
X_test_bow = bow_vect.transform(raw_documents=text_X_test)

In [30]:
# Debugging: both matrices should have one column per vocabulary entry.
for label, matrix in (
    ("[BoW Text] X_trainval: \t", X_trainval_bow),
    ("[BoW Text] X_test:\t", X_test_bow),
):
    print(label, matrix.shape)

[BoW Text] X_trainval: 	 (97478, 1000)
[BoW Text] X_test:	 (32493, 1000)


In [26]:
%%time
ridge_pipeline = Pipeline(steps=[('regressor', Ridge())])

param_grid =  {
               'regressor__alpha': [0.01, 0.1, 1, 10, 100]
              }
grid = GridSearchCV(ridge_pipeline, param_grid, cv=3, return_train_score=True)
grid.fit(X_trainval_bow, y_trainval)

print(("Text-based Ridge score with GridSearchCV %.2f"
       %grid.score(X_test_bow, y_test)))
print("Best param:", grid.best_params_)

Text-based Rigde score with GridSearchCV 0.61
Best param: {'regressor__alpha': 10}
CPU times: user 19.4 s, sys: 346 ms, total: 19.7 s
Wall time: 20.2 s


#### Analysis:
- A text-based Ridge model with GridSearchCV outperforms (0.61) the baseline models from 1.1 (0.47, 0.53).
- Even though a BOW approach discards any information about the order or structure of words in the document, we find that the vocabulary of wine reviews itself is helpful in predicting the score.

## 1.3: More text-based models: 
We decided to explore the following two approaches to tune the BoW model: 
- Bigram/unigram models with stemming and removing stopwords
- Expanding on the n-gram model above by applying tf-idf weighting to the stemmed n-gram counts

#### 1. N-gram model with stemming

In [33]:
from textblob import TextBlob


def textblob_tokenizer(str_input):
    """Tokenize text into lower-cased, stemmed words using TextBlob.

    Citation for this stemming function:
    http://jonathansoma.com/lede/algorithms-2017/classes/more-text-analysis/counting-and-stemming/?fbclid=IwAR12cl7yIbpr-TXlS5IGukULm9kR30v_jocnRHAH5efFAECwTLlI7mt_yDw

    Args:
        str_input: Raw text to tokenize.

    Returns:
        A list with the stem of every word TextBlob finds in the lower-cased text.
    """
    lowered_text = str_input.lower()
    return [word.stem() for word in TextBlob(lowered_text).words]

In [34]:
%%time
bigram_vect = CountVectorizer(stop_words = 'english', tokenizer = textblob_tokenizer, 
                              ngram_range=(1, 2), max_features = 15000) 

CPU times: user 29 µs, sys: 1 µs, total: 30 µs
Wall time: 34.1 µs


In [35]:
%%time
X_trainval_bigram = bigram_vect.fit_transform(text_X_trainval)
X_test_bigram = bigram_vect.transform(text_X_test)

CPU times: user 3min 42s, sys: 1.39 s, total: 3min 43s
Wall time: 3min 51s


In [37]:
%%time
ridge_pipeline = Pipeline(steps=[('regressor', Ridge())])

param_grid =  {
               'regressor__alpha': [0.01, 0.1, 1, 10, 100]
              }
grid = GridSearchCV(ridge_pipeline, param_grid, cv=3, return_train_score=True)
grid.fit(X_trainval_bigram, y_trainval)

print(("N-gram text-based Rigde model score: %.2f"
       %grid.score(X_test_bigram, y_test)))
print("Best param:", grid.best_params_)

N-gram text-based Rigde model score: 0.68
Best param: {'regressor__alpha': 100}
CPU times: user 4min 27s, sys: 1.82 s, total: 4min 29s
Wall time: 4min 34s


#### 2. Supplement the N-gram model with TF-IDF weighting

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [38]:
%%time
bigram_tfidf = make_pipeline(bigram_vect,
                             TfidfTransformer(),
                            )

X_trainval_bigram_tfidf = bigram_tfidf.fit_transform(text_X_trainval)

X_test_bigram_tfidf = bigram_tfidf.transform(text_X_test)

ridge_pipeline = Pipeline(steps=[('regressor', Ridge())])

param_grid =  {
               'regressor__alpha': [0.01, 0.1, 1, 10, 100]
              }
grid = GridSearchCV(ridge_pipeline, param_grid, cv=3, return_train_score=True)
grid.fit(X_trainval_bigram_tfidf, y_trainval)

print(("N-gram text-based Ridge with TF-IDF stemming score: %.2f"
       %grid.score(X_test_bigram_tfidf, y_test)))
print("Best param:", grid.best_params_)


N-gram text-based Ridge with TF-IDF stemming score: 0.69
Best param: {'regressor__alpha': 1}
CPU times: user 5min, sys: 2.15 s, total: 5min 2s
Wall time: 5min 9s


#### Analysis
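- Using unigrams and bigrams with stemming and stop-word removal improves the model's performance considerably (from 0.61 to 0.68). We believe this performs better than the plain BoW model because it captures combinations of words associated with higher/lower scores. Note that this model also uses a larger vocabulary (15,000 features instead of 1,000), which may account for part of the gain.
- Supplementing the n-gram model with TF-IDF weighting improves the performance slightly more (from 0.68 to 0.69).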

## 1.4: Combining Text and Non-Text Features

- We use similar preprocessing as the one used earlier with standard scaling of continuous features, OHE for categorical features and imputation of all missing values.
- In addition to Ridge, we also tried XGBRegressor to see if there was a considerable difference in performance.

In [39]:
# Convert both sparse TF-IDF matrices to DataFrames so they can be concatenated
# with the non-text frames. toarray() materialises every cell, so each frame
# holds rows x vocabulary-size values in memory.
X_trainval_bigram_tfidf, X_test_bigram_tfidf = (
    pd.DataFrame(matrix.toarray())
    for matrix in (X_trainval_bigram_tfidf, X_test_bigram_tfidf)
)

In [42]:
%%time
#Merge non-text and BOW and test
full_X_trainval = pd.concat([non_text_X_trainval, X_trainval_bigram_tfidf], axis=1)
full_X_test = pd.concat([non_text_X_test, X_test_bigram_tfidf],axis=1)

CPU times: user 1min 34s, sys: 27 s, total: 2min 1s
Wall time: 2min 22s


In [43]:
# Rebuild the same non-text preprocessor that section 1.1 defines, as a fresh,
# unfitted object.
categorical_steps = [
    ('simpleimputer', SimpleImputer(strategy='constant', fill_value="missing")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
base_categorical_transformer = Pipeline(steps=categorical_steps)

continuous_steps = [
    ('simpleimputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
]
base_continuous_transformer = Pipeline(steps=continuous_steps)

# Only the columns named in the two feature lists are routed to a transformer.
base_continuous_and_categorical_preprocessor = ColumnTransformer(transformers=[
    ('num', base_continuous_transformer, nt_continuous_features),
    ('cat', base_categorical_transformer, nt_categorical_features),
])

In [44]:
%%time
transformed_full_X_trainval = base_continuous_and_categorical_preprocessor.fit_transform(full_X_trainval)
transformed_full_X_test = base_continuous_and_categorical_preprocessor.transform(full_X_test)

CPU times: user 24.1 s, sys: 38.5 s, total: 1min 2s
Wall time: 1min 20s


In [45]:
# Debugging: train/validation and test matrices must have the same number of columns.
for label, matrix in (
    ("[Text & Non-Text] X_trainval: ", transformed_full_X_trainval),
    ("[Text & Non-Text] X_test: ", transformed_full_X_test),
):
    print(label, matrix.shape)

[Text & Non-Text] X_trainval:  (97478, 140635)
[Text & Non-Text] X_test:  (32493, 140635)


In [46]:
%%time
ridge_pipeline = Pipeline(steps=[
                                ('regressor', Ridge())])

param_grid =  {
               'regressor__alpha': [0.01, 0.1, 1, 10, 100]
              }
grid = GridSearchCV(ridge_pipeline, param_grid, cv=3, return_train_score=True)
grid.fit(transformed_full_X_trainval, y_trainval)

print(("Text and non-text hybrid Ridge model score %.2f"
       %grid.score(transformed_full_X_test, y_test)))
print("Best param:", grid.best_params_)

Text and non-text hybrid Ridge model score 0.49
Best param: {'regressor__alpha': 0.01}
CPU times: user 12min 8s, sys: 5.15 s, total: 12min 13s
Wall time: 13min 20s


In [24]:
from xgboost import XGBRegressor

In [26]:
%%time
xgb_pipeline = Pipeline(steps=[
                                ('regressor', XGBRegressor())])

param_grid =  {
               "regressor__max_depth": [4,8], 
                "regressor__alpha": [0, 0.5, 2], 
                "regressor__lambda": [0.5, 1.5],
              }
grid = GridSearchCV(xgb_pipeline, param_grid, cv=3, return_train_score=True)
grid.fit(transformed_full_X_trainval, y_trainval)

print(("XGBoost with GridSearchCV %.2f"
       %grid.score(transformed_full_X_test, y_test)))
print("Best param:", grid.best_params_)

XGBoost with GridSearchCV 0.47
Best param: {'regressor__alpha': 2, 'regressor__lambda': 0.5, 'regressor__max_depth': 8}
CPU times: user 27min 44s, sys: 15.4 s, total: 28min
Wall time: 34min 18s


#### Analysis
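- In the recorded run, the combined model scores considerably lower for both Ridge (0.49) and XGBRegressor (0.47) than the pure text-based model (0.69). Caveat: the feature matrix reported above has 140,635 columns and came from a `ColumnTransformer` that lists only the non-text columns. With scikit-learn's default `remainder='drop'`, the TF-IDF columns would have been discarded, so these scores most likely reflect the non-text features alone (they are close to the 1.1 baseline) rather than a true text + non-text model.
- XGBoost is also slower to train than Ridge (approx. 2.5x longer: about 34 min vs. 13 min wall time) in addition to performing worse.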