In [6]:
##########################################################################
##### Template code and instructions on the basics of machine learning#
##########################################################################
### Step 1: install packages - 
##### packages are necessary to install and load, given that they have the built in functions necessary to run complex tasks. 
## They effectively act as one of the most crucial time saving activities that would otherwise lead to overly long and 
## duplicative scripts. 

### note: you are not expected to remember all of these; just for the best to copy and paste these sections 

## install pkgs 
import sys
import os
# !{sys.executable} -m pip install xgboost==1.7.5 # note: needed since it looks like anaconda installs an earlier version 
# of the package, which is not helpful. 1.7.5 allows for the categorical data of interest to be used. 

# !{sys.executable} -m pip install requests #; this code here can be used to install packages on anaconda/jupyter notebook 
### I believe the below should be installed by default 
import requests # web scraping 
from bs4 import BeautifulSoup # for web scraping 
import itertools # for efficient operation of loops 
import pandas as pd # necessary for reading in, creating, and manipulating data frames 
import csv ## for importing/exporting csvs 
import glob ## for finding files in path
import re
import numpy as np
### THe ML packages 
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# nltk packages; these are for the purpose of cleaning text, which will be crucial 
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode

In [13]:
### read in data 
#os.listdir("coding")
## link to tutorial: https://www.kaggle.com/code/diveki/classification-with-nlp-xgboost-and-pipelines/notebook
gabe_df = pd.read_csv("coding/faculty characteristics web scraping - osu_comments_gabe06192023.csv")
gabe_df = gabe_df.head(900)
gabe_df

Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_sex,pb_lgbtq,pb_race,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective
0,1,2.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Only graded on 4 assignments (30% Midterm, 30%...",0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
1,2,3.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Final grade is only based on two exams, readin...",0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
2,3,4.0,1,POLITSC1100,OHIO STATE UNIVERSITY,ALEX,ACS,Class was super easy. One reading quiz a week ...,0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
3,4,4.0,2,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"Lecture could be dry at times, but I still lik...",0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
4,5,5.0,3,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"Alex was an excellent lecturer. Insightful, ev...",0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,14,4.5,2,4232,OHIO STATE UNIVERSITY,CURTIS,HAUGTVEDT,"No friday class for all his classes. Instead, ...",0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
896,15,5.0,2,4201,OHIO STATE UNIVERSITY,CURTIS,HAUGTVEDT,Pro. Curt is my mentor at OSU! Had 3 of his cl...,0,0.0,...,0.0,0,0,0.0,0,0,0.0,1,0,0
897,16,4.5,3,BUSML4232,OHIO STATE UNIVERSITY,CURTIS,HAUGTVEDT,World authority on Consumer Behavior. Very per...,0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0
898,17,5.0,2,BUSML4230,OHIO STATE UNIVERSITY,CURTIS,HAUGTVEDT,Very rarely have I came across a class that wa...,0,0.0,...,0.0,0,0,0.0,0,0,0.0,0,0,0


In [14]:
## the text column of interest is comment.
# note that we actually do want to keep upper case letters and such, along with punctuation. Therefore, we won't be 
# making to lower for the first time. We will want to remove accented letters and such 
def unidecode_text(text):
    try:
        #pdb.set_trace()
        text = unidecode.unidecode(text)
    except:
        pass
    return text

gabe_df["comment"] = gabe_df.apply(lambda row: unidecode_text(row["comment"]), axis=1)

In [17]:
### create stop words, which will be extracted 
import nltk
# nltk.download() # make  sure to run the first time 
stop_words = stopwords.words('english')
stop_append = ['.', ',', '`', '"', "'", ';', '%']
stop_words1 = stop_words + stop_append 

In [18]:
#### This consists entirely of helper text functions 
# list of word types (nouns and adjectives) to leave in the text
defTags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR']#, 'RB', 'RBS', 'RBR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# functions to determine the type of a word
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

# transform tag forms
def penn_to_wn(tag):
    if is_adjective(tag):
        return nltk.stem.wordnet.wordnet.ADJ
    elif is_noun(tag):
        return nltk.stem.wordnet.wordnet.NOUN
    elif is_adverb(tag):
        return nltk.stem.wordnet.wordnet.ADV
    elif is_verb(tag):
        return nltk.stem.wordnet.wordnet.VERB
    return nltk.stem.wordnet.wordnet.NOUN

#  lemmatizer + tokenizer (+ stemming) class
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # we define (but not use) a stemming method, uncomment the last line in __call__ to get stemming tooo
        self.stemmer = nltk.stem.SnowballStemmer('english') 
    def __call__(self, doc):
        # pattern for numbers | words of length=2 | punctuations | words of length=1
        pattern = re.compile(r'[0-9]+|\b[\w]{2,2}\b|[%.,_`!"&?\')({~@;:#}+-]+|\b[\w]{1,1}\b')
        # tokenize document
        doc_tok = word_tokenize(doc)
        #filter out patterns from words
        doc_tok = [x for x in doc_tok if x not in stop_words1]
        doc_tok = [pattern.sub('', x) for x in doc_tok]
        # get rid of anything with length=1
        doc_tok = [x for x in doc_tok if len(x) > 1]
        # position tagging
        doc_tagged = nltk.pos_tag(doc_tok)
        # selecting nouns and adjectives
        doc_tagged = [(t[0], t[1]) for t in doc_tagged if t[1] in defTags]
        # preparing lemmatization
        doc = [(t[0], penn_to_wn(t[1])) for t in doc_tagged]
        # lemmatization
        doc = [self.wnl.lemmatize(t[0], t[1]) for t in doc]
        # uncomment if you want stemming as well
        #doc = [self.stemmer.stem(x) for x in doc]
        return doc

In [19]:
## now create the ngram vectorizer. We are going with a two gram tldf one 
vec_tdidf = TfidfVectorizer(ngram_range=(1,2), analyzer='word', #stop_words=stop_words1, 
                                               norm='l2', tokenizer=LemmaTokenizer())

In [20]:
## create the xgboost classifier
clf = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)

In [21]:
### helper fxn to identify text v numeric 
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        # returns the input as a string
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # returns the input as a dataframe
        return X[[self.key]]

In [22]:
## function to print out the stats 
def print_stats(preds, target, labels, sep='-', sep_len=40, fig_size=(10,8)):
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(sep*sep_len)
    print('Classification report:')
    print(metrics.classification_report(target, preds))
    print(sep*sep_len)
    print('Confusion matrix')
    cm=metrics.confusion_matrix(target, preds)
    cm = cm / np.sum(cm, axis=1)[:,None]
    sns.set(rc={'figure.figsize':fig_size})
    sns.heatmap(cm, 
        xticklabels=labels,
        yticklabels=labels,
           annot=True, cmap = 'YlGnBu')
    plt.pause(0.05)

In [23]:
### now lets create a function that will pull in from the comments 

text = Pipeline([
                ('selector', TextSelector(key='comment')),
                ('vectorizer', vec_tdidf)
                ])

In [24]:
feats = FeatureUnion([('comment', text)
                      ])

pipe = Pipeline([('feats', feats),
                 ('clf',clf)
                 ])

In [25]:
### check the pipe object 
pipe.named_steps['clf'].get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.6,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': 0.7,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'seed': 2}

In [28]:
### now split up the training and test sets 
X_train, X_test, y_train, y_test = train_test_split(gabe_df["comment"], gabe_df["out_misrep"], 
                                                    test_size=0.33, random_state=42, stratify=gabe_df["out_misrep"])
## stratify ensures microcosom prop of outrage is assigned. 

In [29]:
# definition of parameter grid to scan through
param_grid = {
     'clf__n_estimators': [50,100,300]
#    'clf__colsample_bytree': [0.6,0.8,1]
#    'clf__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

In [37]:
# grid search cross validation instantiation
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 2, n_jobs = 1, verbose = 1, return_train_score=True)

In [42]:
### point of now return: try a model 
#hyperparameter fitting
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits


6 fits failed out of a total of 6.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\j-curiel\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3629, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 144, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index_class_helper.pxi", line 41, in pandas._libs.index.Int64Engine._check_type
KeyError: 'comment'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\j-curiel\Anaconda3\lib\si

KeyError: 'comment'

In [41]:
gabe_df['out_misrep'].unique()

array([0, 1], dtype=int64)

In [44]:
#### Let's just copy and paste the code from the tutorial here to see if we can get it to run 
data = pd.read_csv('coding/winemag-data-130k-v2.csv')
data.head(5)

  data = pd.read_csv('coding/winemag-data-130k-v2.csv')


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,grape_variety,winery,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,,,,
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,,,,
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,,,,
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,,,,
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,,,,


In [45]:
data_sel = data.drop(['Unnamed: 0','designation','points','region_2',], axis = 1)
data_sel.shape

(129971, 14)

In [46]:
data_single = data_sel.drop_duplicates('description')
data_single.shape

(119955, 14)

In [48]:
data_single = data_single.dropna(subset=['description', 'grape_variety', 'price'])

In [49]:
data_single.describe(include='all')

Unnamed: 0,country,description,price,province,region_1,taster_name,taster_twitter_handle,title,grape_variety,winery,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
count,111516,111571,111571,111511,93581,88271,83818,111555,111571,111571,124,55,1,1
unique,42,111571,605,456,1233,31,25,110515,820,15887,78,16,1,1
top,US,"This is ripe and fruity, a wine that is smooth...",20,California,Napa Valley,Roger Voss,@vossroger,R&A,Pinot Noir,Williams Selyem,R&A,Pfaffl,Bordeaux-style Red Blend,Hart 2 Hart
freq,50229,1,4907,33508,4170,18532,18532,32,11826,204,32,32,1,1


In [51]:
for col in ['grape_variety', 'description', 'province', 'region_1', 'winery', 'country']:
    data_single[col] = data_single[col].str.lower()
    
def unidecode_text(text):
    try:
        #pdb.set_trace()
        text = unidecode.unidecode(text)
    except:
        pass
    return text

for col in ['description', 'grape_variety', 'province', 'winery']:
    data_single[col] = data_single.apply(lambda row: unidecode_text(row[col]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_single[col] = data_single[col].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_single[col] = data_single.apply(lambda row: unidecode_text(row[col]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_single[col] = data_single.apply(lambda row: unidecode_text(row[col]),

In [52]:
data_single.grape_variety.value_counts()

pinot noir                                                     11826
chardonnay                                                     10227
cabernet sauvignon                                              8754
red blend                                                       7796
bordeaux-style red blend                                        4954
                                                               ...  
portuguiser                                                        1
zilavka                                                            1
riesling-chardonnay                                                1
kamiak 2006 merlot (yakima valley)                                 1
 pfaffl 2014 hommage reserve gruner veltliner (weinviertel)        1
Name: grape_variety, Length: 820, dtype: int64

In [57]:
filtered_name = ['red blend', 'portuguese red', 'white blend', 'sparkling blend', 'champagne blend', 
                 'portuguese white', 'rose', 'bordeaux-style red blend', 'rhone-style red blend',
                 'bordeaux-style white blend', 'alsace white blend', 'austrian red blend',
                 'austrian white blend', 'cabernet blend', 'malbec blend', 'portuguese rose',
                 'portuguese sparkling', 'provence red blend', 'provence white blend',
                 'rhone-style white blend', 'tempranillo blend', 'grenache blend',
                 'meritage' # beaurdaux blend
                ]
#data_single.rename(columns={"grape_variety":"variety"})

data_filtered = data_single.copy()
data_filtered = data_filtered[~data_filtered['grape_variety'].isin(filtered_name)]

In [60]:
def correct_grape_names(row):
    regexp = [r'shiraz', r'ugni blanc', r'cinsaut', r'carinyena', r'^ribolla$', r'palomino', r'turbiana', r'verdelho', r'viura', r'pinot bianco|weissburgunder', r'garganega|grecanico', r'moscatel', r'moscato', r'melon de bourgogne', r'trajadura|trincadeira', r'cannonau|garnacha', r'grauburgunder|pinot grigio', r'pinot noir|pinot nero', r'colorino', r'mataro|monastrell', r'mourv(\w+)']
    grapename = ['syrah', 'trebbiano', 'cinsault', 'carignan', 'ribolla gialla', 'palomino','verdicchio', 'verdejo','macabeo', 'pinot blanc', 'garganega', 'muscatel', 'muscat', 'muscadet', 'treixadura', 'grenache', 'pinot gris', 'pinot noir', 'lambrusco', 'mourvedre', 'mourvedre']
    f = row
    for exsearch, gname in zip(regexp, grapename):
        f = re.sub(exsearch, gname, f)
    return f

name_pairs = [('spatburgunder', 'pinot noir'), ('garnacha', 'grenache'), ('pinot nero', 'pinot noir'),
              ('alvarinho', 'albarino'), ('assyrtico', 'assyrtiko'), ('black muscat', 'muscat hamburg'),
              ('kekfrankos', 'blaufrankisch'), ('garnacha blanca', 'grenache blanc'),
              ('garnacha tintorera', 'alicante bouschet'), ('sangiovese grosso', 'sangiovese')
             ]

# df.rename(columns={"A": "a", "B": "c"})

data_corrected = data_filtered.copy()
data_corrected['grape_variety'] = data_corrected['grape_variety'].apply(lambda row: correct_grape_names(row))
for start, end in name_pairs:
    data_corrected['grape_variety'] = data_corrected['grape_variety'].replace(start, end) 
len(data_corrected.grape_variety.value_counts())

755

In [61]:
data_reduced = data_corrected.groupby('grape_variety').filter(lambda x: len(x) > 200)
data_reduced.shape

(74557, 14)

In [62]:
grapes = list(np.unique(data_reduced.grape_variety.value_counts().index.tolist()))

In [72]:
colour_map = {'aglianico': 'red', 'albarino': 'white', 'barbera': 'red', 'cabernet franc': 'red',
              'cabernet sauvignon': 'red', 'carmenere': 'red', 'chardonnay': 'white', 'chenin blanc': 'white',
              'corvina, rondinella, molinara': 'red', 'gamay': 'red', 'garganega': 'white', 
              'gewurztraminer': 'white', 'glera': 'white', 'grenache': 'red', 'gruner veltliner': 'white',
              'malbec': 'red', 'merlot': 'red', 'mourvedre': 'red', 'muscat': 'white', 'nebbiolo': 'red',
              "nero d'avola": 'red', 'petite sirah': 'red', 'pinot blanc': 'white', 'pinot gris': 'white',
              'pinot noir': 'red', 'port': 'red', 'prosecco': 'white', 'riesling': 'white', 'sangiovese': 'red',
              'sauvignon blanc': 'white', 'syrah': 'red', 'tempranillo': 'red', 'torrontes': 'white', 
              'verdejo': 'white', 'viognier': 'white', 'zinfandel': 'white', "petit verdot":'red',
              'sauvignon':'white', 'melon':'white', 'verdicchio':'white'
             }
## note: the code only works if it maps on perfectly; there can be no missing in the map
kaggle_input = data_reduced.copy()
kaggle_input
kaggle_input['colour'] = kaggle_input.apply(lambda row: colour_map[row['grape_variety']], axis=1)
colour_dummies = pd.get_dummies(kaggle_input['colour'])
kaggle_input = kaggle_input.merge(colour_dummies, left_index=True, right_index=True)

In [73]:
kaggle_input.reset_index(inplace=True)
kaggle_input.head()

Unnamed: 0,index,country,description,price,province,region_1,taster_name,taster_twitter_handle,title,grape_variety,winery,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,colour,red,white
0,2,us,"tart and snappy, the flavors of lime flesh and...",14,oregon,willamette valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),pinot gris,rainstorm,,,,,white,0,1
1,3,us,"pineapple rind, lemon pith and orange blossom ...",13,michigan,lake michigan shore,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,riesling,st. julian,,,,,white,0,1
2,4,us,"much like the regular bottling from 2012, this...",65,oregon,willamette valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,pinot noir,sweet cheeks,,,,,red,1,0
3,7,france,this dry and restrained wine offers spice in p...,24,alsace,alsace,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),gewurztraminer,trimbach,,,,,white,0,1
4,8,germany,savory dried thyme notes accent sunnier flavor...,12,rheinhessen,,Anna Lee C. Iijima,,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,gewurztraminer,heinz eifel,,,,,white,0,1


In [74]:
# stop words for countries
stop_country = list(np.unique(kaggle_input.country.dropna().str.lower().tolist()))

#stop words for province
stop_province = list(np.unique(kaggle_input.province.dropna().str.lower().tolist()))

#stop words for winery
stop_winery = list(np.unique(kaggle_input.winery.dropna().str.lower().tolist()))

In [75]:
# defining stopwords: using the one that comes with nltk + appending it with words seen from the above evaluation
stop_words = stopwords.words('english')
stop_append = ['.', ',', '`', '"', "'", '!', ';', 'wine', 'fruit', '%', 'flavour', 'aromas', 'palate']
stop_words1 = stop_words + stop_append + grapes + stop_country + stop_province + stop_winery

In [91]:
vec_tdidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words=stop_words1, 
                                               norm='l2', tokenizer=LemmaTokenizer())

In [92]:
clf = XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7)

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        return self

    def transform(self, X):
        # returns the input as a string
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # returns the input as a dataframe
        return X[[self.key]]

In [93]:
def print_stats(preds, target, labels, sep='-', sep_len=40, fig_size=(10,8)):
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(sep*sep_len)
    print('Classification report:')
    print(metrics.classification_report(target, preds))
    print(sep*sep_len)
    print('Confusion matrix')
    cm=metrics.confusion_matrix(target, preds)
    cm = cm / np.sum(cm, axis=1)[:,None]
    sns.set(rc={'figure.figsize':fig_size})
    sns.heatmap(cm, 
        xticklabels=labels,
        yticklabels=labels,
           annot=True, cmap = 'YlGnBu')
    plt.pause(0.05)

In [94]:
text = Pipeline([
                ('selector', TextSelector(key='description')),
                ('vectorizer', vec_tdidf)
                ])

In [95]:
#pipelines of colour features
red = Pipeline([
                ('selector', NumberSelector(key='red')),
                ])
white = Pipeline([
                ('selector', NumberSelector(key='white')),
                ])

In [96]:
feats = FeatureUnion([('description', text),
                      ('red', red),
                      ('white', white)
                      ])

In [97]:
pipe = Pipeline([('feats', feats),
                 ('clf',clf)
                 ])

In [98]:
# split the data into train and test
combined_features = ['description', 'white', 'red']
target = 'grape_variety'

X_train, X_test, y_train, y_test = train_test_split(kaggle_input[combined_features], kaggle_input[target], 
                                                    test_size=0.33, random_state=42, stratify=kaggle_input[target])

In [99]:
# definition of parameter grid to scan through
param_grid = {
     'clf__n_estimators': [50,100,300]
#    'clf__colsample_bytree': [0.6,0.8,1]
#    'clf__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1]
}

In [100]:
### just going to see if pkgs are the problem and reload 
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

import warnings  
warnings.filterwarnings('ignore')

# importing packages
import pandas as pd
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# nltk packages
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from string import punctuation
import unidecode

In [101]:
from nltk.stem import WordNetLemmatizer
grid_search = GridSearchCV(estimator = pipe, param_grid = param_grid, 
                          cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

#hyperparameter fitting ; where it did not work for me ; also broke down here 
grid_search.fit(X_train, y_train)

AttributeError: module 'nltk.stem.wordnet' has no attribute 'wordnet'

In [89]:
nltk.stem.wordnet.wordnet

AttributeError: module 'nltk.stem.wordnet' has no attribute 'wordnet'