In [1]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import QuantileTransformer  
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
import sys
sys.path.append('..')
from WinePredictModel.data import GetData
from WinePredictModel.encoder import (
    YearVintageEncoder,WeatherEncoder,FeatureSelectionEncoder,PriceImputer,
    DescriptionSentimentEncoder,
    VocabRichnessEncoder,
    TitleLengthEncoder,
    PriceBinEncoder,
    FeatureSelectionEncoder,
    CreateDummies,
YearReturnEnconder)
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/edwardburroughes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/edwardburroughes/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [3]:
d = GetData('gcp')

In [4]:
df = d.clean_data()
df.shape

(75865, 10)

In [18]:
df['price'].median()

27.0

In [13]:
CAT_FEATURES = ["province", "variety", "country", "winery", "region_1"]
for col in CAT_FEATURES:
    test = df[col].value_counts()
    print(col)
    print(test[test.index.str.contains("Other|other")])

province
France Other       613
Other              506
Australia Other    183
Italy Other         97
Spain Other         58
Canada Other         1
Name: province, dtype: int64
variety
Series([], Name: variety, dtype: int64)
country
Series([], Name: country, dtype: int64)
winery
Bret Brothers               17
Krupp Brothers              17
Gordon Brothers             15
Frei Brothers               12
Brotherhood                 10
Three Brothers              10
Kay Brothers                10
Brengman Brothers            4
Pagter Brothers              3
Maley Brothers Vineyards     2
Agly Brothers                2
Two Brothers                 2
Name: winery, dtype: int64
region_1
California Other    5
Oregon Other        2
Name: region_1, dtype: int64


In [16]:
df["region_1"].value_counts()

Columbia Valley (WA)                         3794
Mendoza                                      2150
Willamette Valley                            2086
Alsace                                       1888
Napa Valley                                  1677
Russian River Valley                         1486
Champagne                                    1449
Finger Lakes                                 1384
Rioja                                        1332
Paso Robles                                  1181
California                                   1112
Barolo                                       1031
Walla Walla Valley (WA)                       996
Yakima Valley                                 911
Sonoma Coast                                  776
Côtes de Provence                             756
Brunello di Montalcino                        732
Chianti Classico                              715
Red Mountain                                  665
Ribera del Duero                              665


In [8]:
test = df["province"].value_counts()
test[test.index.str.contains("Other|other")]

France Other       613
Other              506
Australia Other    183
Italy Other         97
Spain Other         58
Canada Other         1
Name: province, dtype: int64

In [5]:
df.isnull().sum()
df.dropna(subset=['taster_name','region_1','taster_name'],inplace=True)

In [6]:
X = df.drop(columns='points')
y = df['points']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [7]:
CAT_FEATURES = ["province", "variety", "country", "winery", "region_1"]
pipe_sentiment = make_pipeline(
            DescriptionSentimentEncoder(description="description"),
            QuantileTransformer(),
)
pipe_title_length = make_pipeline(
    TitleLengthEncoder(taster_name="taster_name", title="title"),
    QuantileTransformer(),
)
pipe_vocab_richness = make_pipeline(
    VocabRichnessEncoder(description="description"), QuantileTransformer()
)
price_bin = make_pipeline(PriceBinEncoder(price="price"), OneHotEncoder(handle_unknown='ignore'))

pipe_weather = make_pipeline(
            WeatherEncoder('country','year'),
            SimpleImputer(strategy = 'median'),
            QuantileTransformer()
        )
# Define default feature engineering blocs
feateng_blocks = [
    ("weather", pipe_weather, ["country","year"]),
    ("year",YearReturnEnconder("year"),["year"]),
    ("price_quan",QuantileTransformer(),["price"]),
    ("description_sentiment", pipe_sentiment, ["description"]),
    ("title_length", pipe_title_length, ["taster_name","title"]),
    ("vocab_richness", pipe_vocab_richness, ["description"]),
    ("price_bin", price_bin, ["price"]),
    ("categorical",OneHotEncoder(handle_unknown='ignore'),CAT_FEATURES)]

features_encoder = ColumnTransformer(
            feateng_blocks, n_jobs=None,
            remainder='drop'
        )

pipeline_feature = Pipeline(
            steps=[
                ('year', YearVintageEncoder(title="title")),
                ('price_impute',PriceImputer(price='price')),
                ('feat_eng',features_encoder),
            ]
        )

In [8]:
from sklearn import set_config; set_config(display='diagram')
pipeline_feature

In [9]:
X_train_preproc = pipeline_feature.fit_transform(X_train)
#bm = BorderlineSMOTE(sampling_strategy='minority',k_neighbors=1,m_neighbors=20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['year'] = X[self.title].str.extract('(\d+)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X["year"] = pd.to_numeric(X["year"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  (X["year"] >= 2021) | (X["year"] <= (2021 - 70)), np.nan, X["year"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [10]:
from imblearn.over_sampling import BorderlineSMOTE
bm = BorderlineSMOTE(sampling_strategy='minority',k_neighbors=1,m_neighbors=20)
X_train_smote,y_train_smote = bm.fit_resample(X_train_preproc,y_train)

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_smote,y_train_smote)

In [12]:
X_test_preproc = pipeline_feature.transform(X_test)
y_pred = rf.predict(X_test_preproc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['year'] = X[self.title].str.extract('(\d+)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X["year"] = pd.to_numeric(X["year"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  (X["year"] >= 2021) | (X["year"] <= (2021 - 70)), np.nan, X["year"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexe

In [13]:
from sklearn.metrics import f1_score
f1_score(y_test,y_pred,average='weighted')

0.5991834935474535

In [14]:
params ={'bootstrap': [True, False],
         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
          'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf_random_search = RandomizedSearchCV(rf,
                                       param_distributions=params,
                                       cv=5,
                                       scoring='f1_weighted',
                                       verbose=5,
                                       n_iter = 10,
                                      random_state=1)
rf_random_search.fit(X_train_smote,y_train_smote)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  57.7s
[CV 2/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  56.9s
[CV 3/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  57.0s
[CV 4/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  57.0s
[CV 5/5] END bootstrap=True, max_depth=60, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time=  57.9s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time= 1.6min
[CV 2/5] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=2, min_samples_spl

In [17]:
print(f"model best score {rf_random_search.best_score_}")
print(f"model best params {rf_random_search.best_params_}")
params = rf_random_search.best_params_

model best score 0.7063456222873483
model best params {'n_estimators': 1600, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 100, 'bootstrap': False}


In [25]:
features = pd.DataFrame(zip(list(X_train.columns),list(rf_random_search.best_estimator_.feature_importances_)))
features.columns = ['features','scores']

In [26]:
features.to_csv('feature_selection.csv')