## Applied Machine Learning HW 4

#### Hojin Lee (hl3328) & Hyuk Joon Kwon (hk3084)

In [1]:
import time
import pandas as pd
import numpy as np
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import category_encoders as ce


In [2]:
# Load the full wine-review dataset from the local CSV export.
wine_df_raw = pd.read_csv(
    r'wine-reviews/winemag-data-130k-v2.csv'
)

In [3]:
# Column overview (dtype and non-null count per column) to spot features with missing values.
wine_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


In [4]:
# Keep only the reviews of US wines; every other country is discarded.
is_us = wine_df_raw['country'] == 'US'
df_us = wine_df_raw.loc[is_us]
df_us.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54504 entries, 2 to 129967
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             54504 non-null  int64  
 1   country                54504 non-null  object 
 2   description            54504 non-null  object 
 3   designation            36908 non-null  object 
 4   points                 54504 non-null  int64  
 5   price                  54265 non-null  float64
 6   province               54504 non-null  object 
 7   region_1               54226 non-null  object 
 8   region_2               50511 non-null  object 
 9   taster_name            37730 non-null  object 
 10  taster_twitter_handle  34741 non-null  object 
 11  title                  54504 non-null  object 
 12  variety                54504 non-null  object 
 13  winery                 54504 non-null  object 
dtypes: float64(1), int64(2), object(11)
memory usage: 6.2

## Task 1

In [5]:
# Task 1 settings:
#   dropna      - when True, rows whose target ('points') is missing are dropped
#                 before sampling; missing feature values are NOT dropped.
#   sample_size - number of rows drawn for the non-text baseline experiments.
dropna = True
sample_size = 50000

In [6]:
# Split the column names by dtype: object columns are treated as categorical,
# every other dtype as continuous.
categorical_cols = df_us.select_dtypes(include=['object']).columns
continuous_cols = df_us.select_dtypes(exclude=['object']).columns
print('Categorical feature:', list(categorical_cols))
print('Continuous feature:', list(continuous_cols))

Categorical feature: ['country', 'description', 'designation', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery']
Continuous feature: ['Unnamed: 0', 'points', 'price']


In [7]:
# Cardinality of every object-typed column; a missing value counts as one
# distinct value, so the numbers match len(series.unique()).
object_cardinality = df_us.select_dtypes(include=['object']).nunique(dropna=False)
for column_name, n_unique in object_cardinality.items():
    print(column_name, n_unique)

country 1
description 50449
designation 14184
province 27
region_1 265
region_2 18
taster_name 16
taster_twitter_handle 13
title 50229
variety 257
winery 5375


### Task 1.1 Create a baseline model for predicting wine quality using only non-text features.

In [8]:
# Columns excluded from the non-text baseline:
#   - 'Unnamed: 0' is a meaningless feature.
#   - 'country' is not helpful because only US wines were kept.
#   - 'taster_twitter_handle' will have high collinearity with 'taster_name'.
#   - 'province' and 'region_2' will have high collinearity with 'region_1'.
#   - 'title' and 'description' are almost unique identifiers.
drop_cols = ['Unnamed: 0','country','taster_twitter_handle', 'province', 'region_2', 'title', 'description']

df_drop = df_us.drop(drop_cols, axis=1)

# Only rows without a target are removed; missing feature values stay in the
# frame and are handled by the imputers in the preprocessing pipelines.
if dropna:
    df = df_drop.dropna(subset=['points'])
else:
    df = df_drop

# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when n is larger than the frame (and replace=False).
df_sample = df.sample(n=min(sample_size, len(df)), random_state=1)

In [9]:
# Separate the target ('points') from the remaining feature columns.
X = df_sample.drop(columns='points')
y = df_sample['points']

# Fixed seed (same value as the sampling step) so the train/test partition,
# and therefore every score reported below, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# Boolean mask of object-typed columns (kept for inspection; not used below).
category = X_train.dtypes == object

# Categorical columns: missing values become their own 'NaN' level, then one-hot
# encode; levels unseen during fit are encoded as all zeros.
cat_preprocessing = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='NaN'),
    OneHotEncoder(handle_unknown='ignore'))

# Continuous columns: imputation with SimpleImputer's default strategy only.
cont_preprocessing = make_pipeline(
    SimpleImputer())

# Continuous columns: imputation followed by standardisation.
cont_preprocessing_scale = make_pipeline(
    SimpleImputer(),
    StandardScaler())

# High-cardinality categoricals: target encoding followed by standardisation.
target_encoder = make_pipeline(
    ce.TargetEncoder(),
    StandardScaler())

te_feature = ['designation','winery']
cont_feature = list(X_train.select_dtypes(exclude=['object']).columns)
# Every object column, in DataFrame order; the first two transformers one-hot
# encode all of them (including the target-encoding candidates).
all_cat_feature = list(X_train.select_dtypes(include=['object']).columns)

# Variant 1: impute + one-hot, no scaling.
preprocess = make_column_transformer(
    (cont_preprocessing, cont_feature),
    (cat_preprocessing, all_cat_feature),
    remainder='passthrough')

# Variant 2: as variant 1, with the continuous columns standardised.
preprocess_scale = make_column_transformer(
    (cont_preprocessing_scale, cont_feature),
    (cat_preprocessing, all_cat_feature),
    remainder='passthrough')

# Object columns that are NOT target encoded. A list comprehension keeps the
# DataFrame column order; the previous set difference produced an order that
# can change from one interpreter run to the next.
cat_feature = [col for col in all_cat_feature if col not in te_feature]

# Variant 3: target-encode 'designation'/'winery', one-hot the rest, scale continuous.
preprocess_scale_te = make_column_transformer(
    (target_encoder, te_feature),
    (cont_preprocessing_scale, cont_feature),
    (cat_preprocessing, cat_feature),
    remainder='passthrough')

def pipeline_prediction(X, y, preprocess, regression):
    """Cross-validate ``preprocess`` + ``regression`` and return the mean score.

    Parameters
    ----------
    X : features passed to ``cross_val_score``.
    y : target values passed to ``cross_val_score``.
    preprocess : transformer placed as the first pipeline step.
    regression : estimator placed as the final pipeline step.

    Returns
    -------
    The mean of the 5 per-fold scores returned by ``cross_val_score``
    (no ``scoring`` is passed, so the estimator's own default scorer is used).
    """
    fold_scores = cross_val_score(
        make_pipeline(preprocess, regression), X, y, cv=5
    )
    return np.mean(fold_scores)

In [11]:
# Regressors and preprocessing variants to compare, with their display names
# (the name lists are paired positionally with the object lists).
methods = [LinearRegression(), Ridge()]
processors = [preprocess, preprocess_scale, preprocess_scale_te]
method_name = ['Linear_regression', 'Ridge']
processors_name = ['preprocess','preprocess_with_scaler','preprocess_with_scaler & te']

# One column per preprocessing variant, one row per regressor; each cell is the
# mean cross-validation score on the training split.
results_dict = {
    proc_label: {
        model_label: pipeline_prediction(X_train, y_train, proc, model)
        for model_label, model in zip(method_name, methods)
    }
    for proc_label, proc in zip(processors_name, processors)
}

results_df = pd.DataFrame.from_dict(results_dict)
print(tabulate(results_df, headers='keys', tablefmt='psql'))


+-------------------+--------------+--------------------------+-------------------------------+
|                   |   preprocess |   preprocess_with_scaler |   preprocess_with_scaler & te |
|-------------------+--------------+--------------------------+-------------------------------|
| Linear_regression |     0.33972  |                 0.33963  |                      0.446684 |
| Ridge             |     0.444749 |                 0.462038 |                      0.448081 |
+-------------------+--------------+--------------------------+-------------------------------+


### Task 1.2 Create a simple text-based model using a bag-of-words approach and a linear model.

In [14]:
# Text-only model: raw review text as input, points as target (all US reviews).
text_trainval, y_trainval = df_us['description'], df_us['points']
# Fixed seed so the train/validation partition (and the scores below) is reproducible.
text_train, text_val, y_train, y_val = train_test_split(
    text_trainval, y_trainval, random_state=1)

In [17]:
# Bag-of-words counts limited to 5000 vocabulary entries. The vocabulary is
# learned from the training text only; the validation text is just transformed.
# Note: X_train / X_val now hold the document-term matrices, replacing the
# DataFrame split of the same name from Task 1.1.
vect = CountVectorizer(token_pattern=r"\b\w+\b", max_features = 5000)
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)

In [18]:
# Regressors to compare on the pre-computed bag-of-words matrices. `processors`
# only supplies the column label here: the vectoriser was already applied above.
methods = [LinearRegression(), Ridge()]
processors = [CountVectorizer]
method_name = ['Linear_regression', 'Ridge']
processors_name = ['Count Vectorizer']

# Fit each regressor on the training matrix and score it on the validation matrix.
results_dict = {
    proc_label: {
        model_label: model.fit(X_train, y_train).score(X_val, y_val)
        for model_label, model in zip(method_name, methods)
    }
    for proc_label, _ in zip(processors_name, processors)
}

results_df = pd.DataFrame.from_dict(results_dict)
print(tabulate(results_df, headers='keys', tablefmt='psql'))


+-------------------+--------------------+
|                   |   Count Vectorizer |
|-------------------+--------------------|
| Linear_regression |           0.692312 |
| Ridge             |           0.69573  |
+-------------------+--------------------+


### Task 1.3 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for different feature sets.

In [19]:
# Tune on a 10,000-review subsample to keep the grid searches tractable.
# Both the subsample and the split are seeded so the searches are reproducible.
df_sample = df_us.sample(10000, random_state=1)
text_trainval, y_trainval = df_sample['description'], df_sample['points']
text_train, text_val, y_train, y_val = train_test_split(
    text_trainval, y_trainval, random_state=1)

In [20]:
# Grid search over a CountVectorizer -> (optional) Normalizer -> Ridge pipeline:
# word n-gram range, min_df, vocabulary size, row normalisation on/off, and the
# Ridge regularisation strength. Character n-grams are not part of this grid.
param_grid = {
    "countvectorizer__ngram_range": [(1,1),(1,3),(1,4)],
    "countvectorizer__min_df": [1,3,5],
    "ridge__alpha": [100, 10, 1, 0.1, 0.01],
    "countvectorizer__max_features": [1000, 3000, 5000],
    # None disables the normalizer step for that candidate.
    "normalizer": [None, Normalizer()],
}

start_time = time.time()

# n_jobs=-1 evaluates the candidate/fold fits in parallel on all cores; the
# selected parameters and scores are unaffected, only the wall-clock time.
grid = GridSearchCV(
    make_pipeline(CountVectorizer(stop_words='english'), Normalizer(), Ridge()),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

grid.fit(text_train, y_train)
print('best score: ',grid.best_score_)
print('best param: ',grid.best_params_)
print("--- %s seconds ---" % (time.time() - start_time))

best score:  0.6256811727204594
best param:  {'countvectorizer__max_features': 5000, 'countvectorizer__min_df': 1, 'countvectorizer__ngram_range': (1, 1), 'normalizer': None, 'ridge__alpha': 10}
--- 1465.7394850254059 seconds ---


In [21]:
# Same search as for the raw counts, but with tf-idf rescaled features
# (TfidfVectorizer -> (optional) Normalizer -> Ridge).
start_time = time.time()
param_grid = {
    "tfidfvectorizer__ngram_range": [(1,1), (1,3), (1,4)],
    "tfidfvectorizer__min_df": [1, 3, 5],
    "ridge__alpha": [100, 10, 1, 0.1, 0.01],
    "tfidfvectorizer__max_features": [3000,5000],
    # None disables the normalizer step for that candidate.
    "normalizer": [None, Normalizer()],
}

# n_jobs=-1 evaluates the candidate/fold fits in parallel on all cores; the
# selected parameters and scores are unaffected, only the wall-clock time.
grid = GridSearchCV(
    make_pipeline(TfidfVectorizer(stop_words='english'), Normalizer(), Ridge()),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(text_train, y_train)
print('best score: ',grid.best_score_)
print('best param: ',grid.best_params_)
print("--- %s seconds ---" % (time.time() - start_time))

best score:  0.6029701521223958
best param:  {'normalizer': Normalizer(copy=True, norm='l2'), 'ridge__alpha': 1, 'tfidfvectorizer__max_features': 5000, 'tfidfvectorizer__min_df': 3, 'tfidfvectorizer__ngram_range': (1, 1)}
--- 963.1393620967865 seconds ---


### Task 1.4 Combine the non-text features and the text features. How does adding those features improve upon just using bag-of-words?

In [22]:
# Same exclusions as the non-text baseline, except that 'description' is kept
# so it can be vectorised alongside the other features.
drop_cols = ['Unnamed: 0','country','taster_twitter_handle', 'province', 'region_2', 'title']

df_drop = df_us.drop(columns=drop_cols)

# Separate the target ('points') from the feature columns.
X = df_drop.drop(columns='points')
y = df_drop['points']

# Fixed seed so the split, and the scores computed from it, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [23]:
# Bag-of-words settings taken from the best CountVectorizer grid-search result.
text_preprocessing = make_pipeline(
    CountVectorizer(max_features = 5000, min_df=1, ngram_range=(1,1), stop_words='english')
    )

te_feature = ['designation','winery']
text_feature = ['description']
cont_feature = list(X_train.select_dtypes(exclude=['object']).columns)
# Object columns that are neither target encoded nor free text. The list
# comprehension keeps the DataFrame column order; the previous set difference
# produced an order that can change from one interpreter run to the next.
object_cols = list(X_train.select_dtypes(include=['object']).columns)
cat_feature = [col for col in object_cols
               if col not in te_feature and col not in text_feature]

preprocess_text = make_column_transformer(
    (target_encoder, te_feature),
    (cont_preprocessing_scale, cont_feature),
    (cat_preprocessing, cat_feature),
    # The column is given as a plain string (not a list) so the vectoriser
    # receives a 1-D series of documents rather than a 2-D frame.
    (text_preprocessing, 'description'),
    remainder='passthrough')

In [24]:
# Regressors evaluated on the combined (non-text + bag-of-words) features.
methods = [LinearRegression(), Ridge(alpha=10)]
processors = [preprocess_text]

method_name = ['Linear_regression', 'Ridge']
processors_name = ['preprocess_with_Text']

# Mean cross-validation score on the training split for each regressor.
results_dict = {
    proc_label: {
        model_label: pipeline_prediction(X_train, y_train, proc, model)
        for model_label, model in zip(method_name, methods)
    }
    for proc_label, proc in zip(processors_name, processors)
}

results_df = pd.DataFrame.from_dict(results_dict)
print(tabulate(results_df, headers='keys', tablefmt='psql'))



+-------------------+------------------------+
|                   |   preprocess_with_Text |
|-------------------+------------------------|
| Linear_regression |               0.753456 |
| Ridge             |               0.757582 |
+-------------------+------------------------+


## Task 2

Use a pretrained word-embedding (word2vec, glove or fasttext) for featurization instead of the bag-of-words model. Does this improve classification? How about combining the embedded words with the BoW model?

In [54]:
import spacy
import nltk
import re
from nltk.corpus import stopwords


In [55]:
# Load the spaCy 'en_core_web_lg' pipeline; its document vectors are used as
# the embedding features below.
nlp = spacy.load('en_core_web_lg')

In [56]:
# Task 2 works on all US reviews again: description text as input, points as target.
text_trainval, y_trainval = df_us['description'], df_us['points']

In [57]:
# Lower-case the reviews so the character filter and stop-word removal that
# follow are case-insensitive.
text_trainval = text_trainval.str.lower()

In [58]:
# Remove every character that is not a lower-case letter or whitespace.
# The pattern is a raw string: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
text_trainval = text_trainval.apply(lambda x: re.sub(r"[^a-z\s]", "", x))

In [59]:
# Use a separate name for the stop-word set: rebinding `stopwords` would shadow
# the imported nltk corpus object, so re-running this cell would fail because
# a set has no `.words` method.
english_stopwords = set(stopwords.words("english"))
text_trainval = text_trainval.apply(
    lambda x: " ".join(word for word in x.split() if word not in english_stopwords))

In [60]:
# Stream the cleaned reviews through the spaCy pipeline; the result is
# consumed when the vectors are collected in the next step.
document = nlp.pipe(text_trainval)

In [61]:
# Collect each processed document's `.vector` into one array, one row per review.
text_vector = np.array([text.vector for text in document])

In [62]:
# Give the feature frame a positional 0..n-1 index so it can be joined row by
# row with the embedding matrix (which has no index of its own).
X_ = X.reset_index(drop=True)
# Name the embedding columns 'emb_0', 'emb_1', ... so every column label is a
# string; recent scikit-learn versions reject frames that mix str and int labels.
embedding_df = pd.DataFrame(text_vector).add_prefix('emb_')
X_ = pd.concat([X_, embedding_df], axis=1)

# X_ now has a fresh index while y_trainval still carries the original row
# labels. Pass a target with the same positional index so the encoder pairs
# every row with its own target instead of aligning on mismatched labels.
y_aligned = y_trainval.reset_index(drop=True)

# NOTE(review): the encoder is fit on ALL rows before the train/validation
# split that follows, so the encoded 'designation'/'winery' values have seen
# the validation targets; the reported scores are optimistic.
te = ce.TargetEncoder(cols=['designation','winery',]).fit(X_, y_aligned)
X_ = te.transform(X_)

In [63]:
# Split the combined feature frame (non-text + description + embeddings).
# Fixed seed so the partition, and the scores computed from it, are reproducible.
text_train, text_val, y_train, y_val = train_test_split(X_, y_trainval, random_state=1)

In [64]:
# 'designation' and 'winery' were already replaced by their target encodings,
# so they are handled as continuous columns here.
text_feature = ['description']
cont_feature = ['price', 'designation', 'winery']
cat_feature = ['region_1', 'taster_name', 'variety']

# Columns not listed below (the embedding columns appended to the frame) are
# forwarded unchanged by remainder='passthrough'.
preprocess_text = make_column_transformer(
    (cont_preprocessing_scale, cont_feature),
    (cat_preprocessing, cat_feature),
    (text_preprocessing, 'description'),
    remainder='passthrough',
)

In [65]:
# Regressors evaluated on the non-text + bag-of-words + embedding features.
methods = [LinearRegression(), Ridge(alpha=10)]
processors = [preprocess_text]

method_name = ['Linear_regression', 'Ridge']
processors_name = ['preprocess_text']

# Mean cross-validation score on the training split for each regressor.
results_dict = {
    proc_label: {
        model_label: pipeline_prediction(text_train, y_train, proc, model)
        for model_label, model in zip(method_name, methods)
    }
    for proc_label, proc in zip(processors_name, processors)
}

results_df = pd.DataFrame.from_dict(results_dict)
print(tabulate(results_df, headers='keys', tablefmt='psql'))



+-------------------+-------------------+
|                   |   preprocess_text |
|-------------------+-------------------|
| Linear_regression |          0.747052 |
| Ridge             |          0.75558  |
+-------------------+-------------------+
