In [12]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer
from sklearn.svm import SVC
from xgboost import XGBClassifier

# The parent directory must be on sys.path before the project imports below
# (presumably the WinePredictModel package lives one level up).
sys.path.append('..')
from WinePredictModel.data import GetData
from WinePredictModel.encoder import (
    CreateDummies,
    DescriptionSentimentEncoder,
    FeatureSelectionEncoder,
    PriceBinEncoder,
    PriceImputer,
    TitleLengthEncoder,
    VocabRichnessEncoder,
    WeatherEncoder,
    YearReturnEnconder,
    YearVintageEncoder,
)
%matplotlib inline

In [13]:
# Data source key and row cap passed to the project loader; named here so the
# sample size is easy to spot and adjust.
DATA_SOURCE = 'gcp'
N_ROWS = 25000
d = GetData(DATA_SOURCE, nrows=N_ROWS)

In [14]:
# Run the loader's cleaning step and display the resulting (rows, columns).
df = d.clean_data()
df.shape

(24704, 10)

In [15]:
# Split the cleaned frame into the target (`points`) and the predictors
# (every other column).
y = df['points']
X = df.drop(columns=['points'])

In [16]:
# List the predictor columns available to the encoders.
X.columns

Index(['country', 'description', 'price', 'province', 'region_1',
       'taster_name', 'title', 'variety', 'winery'],
      dtype='object')

In [17]:
# Try the project's dummy encoder on the full predictor frame.
# NOTE(review): despite its name, `x_test` is the encoded version of all of X,
# not a held-out test split — consider a clearer name.
cd = CreateDummies()
x_test = cd.fit_transform(X)

In [7]:
# Feature-engineering building blocks.
#
# Each sub-pipeline wraps one project encoder and then rescales or encodes its
# output. OneHotEncoder, ColumnTransformer, Pipeline and make_pipeline are
# imported in the notebook's top import cell.

# Seed shared by every QuantileTransformer in this cell. The transformer can
# subsample rows when estimating quantiles, so leaving it unseeded may produce
# slightly different features from one run to the next.
SEED = 42

# Columns handed to CreateDummies in the "categorical" block.
CAT_FEATURES = ["province", "variety", "country", "winery", "region_1"]

pipe_sentiment = make_pipeline(
    DescriptionSentimentEncoder(description="description"),
    QuantileTransformer(random_state=SEED),
)

pipe_title_length = make_pipeline(
    TitleLengthEncoder(taster_name="taster_name", title="title"),
    QuantileTransformer(random_state=SEED),
)

pipe_vocab_richness = make_pipeline(
    VocabRichnessEncoder(description="description"),
    QuantileTransformer(random_state=SEED),
)

# Price is binned by the project encoder and the bins are then one-hot encoded.
price_bin = make_pipeline(PriceBinEncoder(price="price"), OneHotEncoder())

# Weather features go through a median imputer before rescaling.
pipe_weather = make_pipeline(
    WeatherEncoder('country', 'year'),
    SimpleImputer(strategy='median'),
    QuantileTransformer(random_state=SEED),
)

# Default feature-engineering blocks as (name, transformer, input columns),
# the triple format consumed by ColumnTransformer.
feateng_blocks = [
    ("weather", pipe_weather, ["country", "year"]),
    ("year", YearReturnEnconder("year"), ["year"]),
    ("description_sentiment", pipe_sentiment, ["description"]),
    ("title_length", pipe_title_length, ["taster_name", "title"]),
    ("vocab_richness", pipe_vocab_richness, ["description"]),
    ("price_bin", price_bin, ["price"]),
    ("categorical", CreateDummies(), CAT_FEATURES),
]

In [8]:

# Route each feature-engineering block to its input columns; columns not
# claimed by any block are dropped.
feature_year_encoder = ColumnTransformer(
    transformers=feateng_blocks,
    n_jobs=None,
    remainder='drop',
)

# End-to-end preprocessing. The 'year' and 'price_impute' steps run before
# 'feat_eng' because the blocks inside it read the `year` and `price` columns,
# and `year` is not among the raw predictor columns.
pipeline = Pipeline(steps=[
    ('feature', FeatureSelectionEncoder(threshold=1E-6)),
    ('year', YearVintageEncoder(title="title")),
    ('price_impute', PriceImputer(price='price')),
    ('feat_eng', feature_year_encoder),
    # Seeded so that any row subsampling done while estimating quantiles is
    # repeatable across runs.
    # NOTE(review): this rescales columns that were already quantile-transformed
    # or one-hot encoded upstream — confirm the second pass is intended.
    ('scaler', QuantileTransformer(random_state=42)),
])

# Fit every step on the predictors and keep the transformed feature matrix.
X_new = pipeline.fit_transform(X)

In [10]:
# Shape of the first transformed row; its length is the number of output features.
X_new[0].shape

(10441,)

In [46]:
# Run the vintage-year encoder on its own against the cleaned frame so its
# `year` output can be inspected.
# NOTE(review): this rebinds X_new, which previously held the result of
# pipeline.fit_transform(X); running cells out of order will pick up the
# wrong object. A separate name would be safer.
fy = YearVintageEncoder(title="title")
X_new = fy.fit_transform(df)

In [47]:
# Distinct values of the extracted year column.
# NOTE(review): the recorded output contains only 1970 and NaN, which looks
# like a parsing problem in YearVintageEncoder rather than real vintages — confirm.
X_new['year'].unique()

array([1970.,   nan])

In [13]:
# NOTE(review): scratch expression unrelated to the wine pipeline; candidate for removal.
list(range(5))

[0, 1, 2, 3, 4]