# RANDOM FOREST REGRESSOR (BLIND TASTING - PREDICT PRICE)

### 1. Import Libraries

In [1]:
import re

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None 

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

from scipy.sparse import hstack
from scipy.stats import pearsonr

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.svm import LinearSVC
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import utils
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

### 2. Load Data and Preprocess It

In [2]:

# df = pd.read_csv("../data/winemag-data-130k-v2.csv", nrows=50000,index_col=0)
df = pd.read_csv("../data/winemag-data_first150k.csv", nrows=50000, index_col=0)

In [3]:
# Drop columns
df = df.drop(['designation', 'province', 'region_1', 'region_2', 'winery'], axis = 1)

# Test and remove duplicates
df[df.duplicated('description',keep=False)].head()
df = df.drop_duplicates('description')

In [4]:
# Test for null values
df.apply(lambda x: sum(x.isnull()),axis=0)

country           2
description       0
points            0
price          4102
variety           0
dtype: int64

In [5]:
# Fill null prices with mean value
df['price'].fillna(df['price'].mean(), inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0) 

country        2
description    0
points         0
price          0
variety        0
dtype: int64

In [6]:
# Drop those 2 rows without country value filled in
df.dropna(axis='rows',inplace=True)
df.apply(lambda x: sum(x.isnull()),axis=0) 

country        0
description    0
points         0
price          0
variety        0
dtype: int64

In [7]:
# Preprocess 
#   - Lower each word
#   - Strip and split
#   - Remove stopwords
#   - Stem step (bring each word to it's base form)
#   - Join again

stopword_list = stopwords.words('english')
ps = PorterStemmer()
for i in range(0,len(df['description'])):
    try:
        description = re.sub('[^a-zA-Z]',' ',df['description'][i])
        description = description.lower().strip()
        description_words = description.split()
        description_words = [word for word in description_words if not word in stopword_list]
        description_words = [ps.stem(word) for word in description_words]
        df['description'][i] = ' '.join(description_words)
    except:
        pass

In [8]:
# Drop wine types with less than 300 observations

df = df.groupby('variety').filter(lambda x: len(x) > 200)
filtered = df.groupby('variety').filter(lambda x: len(x) >= 500)

In [9]:
# Encode varieties as integers

filtered['variety_id'] = filtered['variety'].factorize()[0]
category_id_df = filtered[['variety', 'variety_id']].drop_duplicates().sort_values('variety_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['variety_id', 'variety']].values)

filtered.head()

Unnamed: 0,country,description,points,price,variety,variety_id
0,US,tremend variet wine hail oakvil age three year...,96,235.0,Cabernet Sauvignon,0
2,US,mac watson honor memori wine made mother treme...,96,90.0,Sauvignon Blanc,1
3,US,spent month new french oak incorpor fruit ponz...,96,65.0,Pinot Noir,2
8,US,name vineyard formerli bottl delancellotti fin...,95,65.0,Pinot Noir,2
9,US,produc sourc two block vineyard wine one high ...,95,60.0,Pinot Noir,2


### 3. Random Forest Regressor for Price Prediction

In [10]:
df = df.reset_index()

X = df.drop(['country','description','variety'], axis = 1)
y = df.price

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X = X.as_matrix().astype(np.float)
y = y.as_matrix().astype(np.float)

(28404, 3) (9469, 3) (28404,) (9469,)


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [11]:

clf = RandomForestRegressor(n_estimators=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy Score:', clf.score(X_test, y_test) *100, "%")

Accuracy Score: 99.75016579906396 %
