In [None]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
# from sklearn.linear_model import LinearRegression
# from sklearn.ensemble import RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.preprocessing import RobustScaler  
%matplotlib inline

In [None]:
def import_raw_data(operating_system='mac', base_file_path=None):
    '''Load every CSV file in the data directory into a dictionary of dataframes.

    Parameters
    ----------
    operating_system : str, default 'mac'
        Either 'mac' or 'windows'. Selects the layout of the default data
        directory (a ``data`` folder on the current user's Desktop). Only
        consulted when ``base_file_path`` is not given.
    base_file_path : str or path-like, optional
        Directory to read from. When given it takes precedence over the
        per-OS default, and the login name is never looked up.

    Returns
    -------
    dict
        Maps each CSV file name, truncated at its first '.', to the dataframe
        that ``pd.read_csv`` produced for that file. Files whose name does not
        end in '.csv' are skipped.

    Raises
    ------
    ValueError
        If ``base_file_path`` is not given and ``operating_system`` is neither
        'mac' nor 'windows'.
    '''
    if base_file_path is None:
        default_dirs = {
            'mac': r"/Users/{}/Desktop/data",
            'windows': r"C:\Users\{}\Desktop\data",
        }
        if operating_system not in default_dirs:
            # Fail with a clear message instead of an unbound-variable error below.
            raise ValueError(
                "operating_system must be 'mac' or 'windows', got {!r}".format(operating_system)
            )
        base_file_path = default_dirs[operating_system].format(os.getlogin())

    df_dict = dict()
    # Sorted so the dictionary's key order does not depend on the file system's listing order.
    for file in sorted(os.listdir(base_file_path)):
        if file.endswith('.csv'):
            # NOTE: the key is everything before the FIRST dot, so 'a.b.csv' is stored as 'a'.
            df_dict[file.split('.')[0]] = pd.read_csv(os.path.join(base_file_path, file))
    return df_dict

In [None]:
# Read every raw CSV once, then take a private copy of each table used below so
# later cells never modify the cached originals held in df_dict.
# Indexing with [] (rather than .get) makes a missing file fail right here with a
# KeyError that names the absent table.
df_dict = import_raw_data()
df = df_dict['winemag-data-130k-v2'].copy()
temp_df = df_dict['temperature'].copy()
country_iso = df_dict['country_iso_data'].copy()
weather_month_v2 = df_dict['weather_country_month_v2'].copy()

In [None]:
# Bar chart of how many rows carry each distinct value of the 'test' column.
sns.countplot(data=df, x='test', color='darkgreen')

In [None]:
# Columns removed before any feature engineering.
COLUMN_DROP = ['Unnamed: 0','designation','region_2','taster_twitter_handle','points']
# A row is discarded when any one of these fields is missing.
REQUIRED_COLUMNS = ['variety','province','country','taster_name']

df = df.drop(columns=COLUMN_DROP).dropna(subset=REQUIRED_COLUMNS)

# Remaining gaps in price are filled with the column median.
imp_median = SimpleImputer(strategy='median')
df['price'] = imp_median.fit_transform(df[['price']])

# Per-column count of values that are still missing.
df.isna().sum()

In [None]:
# Missing region_1 values receive the literal placeholder "NONE"; afterwards only the
# first row of every (description, title) pair is kept.
df = (
    df.assign(region_1=df['region_1'].fillna("NONE"))
      .drop_duplicates(subset=['description','title'])
)

In [None]:
# A number taken from the title only counts as a year when it lies strictly
# between CURRENT_YEAR - MAX_WINE_AGE and CURRENT_YEAR.
CURRENT_YEAR = 2021
MAX_WINE_AGE = 70

# Number of titles that contain at least one digit (kept for inspection).
count_year = df[df['title'].str.contains(r'\d', regex=True)].shape[0]

# First run of digits in each title; expand=False returns a Series instead of a
# one-column DataFrame.
first_number = df['title'].str.extract(r'(\d+)', expand=False)

# Keep four-digit numbers only. The test is applied per row through .str.len();
# len(series) would return the row count and mask nothing.
first_number = first_number.where(first_number.str.len() == 4)

year = pd.to_numeric(first_number)
# Anything outside the plausible range becomes missing.
year = year.where((year > CURRENT_YEAR - MAX_WINE_AGE) & (year < CURRENT_YEAR))

# Fill the gaps with the median year.
imp_median = SimpleImputer(strategy='median')
year_filled = imp_median.fit_transform(year.to_frame()).ravel()

# The median of an even number of values can end in .5, so round before casting
# to whole years.
df['year'] = np.round(year_filled).astype(int)

# Per-column count of values that are still missing.
df.isnull().sum()

In [None]:
# Character counts of the two text columns, computed before those columns are dropped.
length_features = {f"{col}_length": df[col].map(len) for col in ['title','taster_name']}

df = (
    df.assign(**length_features)
      # price is cut into 15 equal-width intervals; labels=False stores the interval number.
      .assign(price_bin=lambda frame: pd.cut(frame['price'],bins=15,labels=False))
      .drop(columns=['taster_name', 'title'])
)

In [None]:
# Attach the columns of country_iso to every wine row, matching on country name.
weather_feature = df.set_index('country').join(country_iso.set_index('country'))

# Same lookup for the monthly weather table, then derive the calendar year of each record.
weather_iso_df = weather_month_v2.set_index('country').join(country_iso.set_index('country'))
weather_iso_df['year'] = pd.to_datetime(weather_iso_df['month']).dt.year

# One row per (country_iso, year) holding the mean of every numeric column.
# numeric_only=True keeps non-numeric columns such as 'month' out of the mean;
# without it pandas 2.x raises a TypeError.
weather_iso_summary_df = (
    weather_iso_df
    .groupby(['country_iso', 'year'], as_index=False)
    .mean(numeric_only=True)
)

# Left join so wines without a weather record are kept. Merging on columns ignores
# the frames' indexes, so the 'country' index set above does not survive into df.
df = pd.merge(
    weather_feature,
    weather_iso_summary_df,
    how='left',
    on=['country_iso', 'year'],
)

# Wines with no matching weather row receive the median avg_temp.
imp_median = SimpleImputer(strategy='median')
df['avg_temp'] = imp_median.fit_transform(df[['avg_temp']])

# Per-column count of values that are still missing.
df.isnull().sum()

In [None]:
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
import string 
punc = set(string.punctuation)

# word_tokenize relies on the Punkt tokenizer models in addition to the stop-word
# list, so both are fetched here to keep a fresh environment runnable.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed version.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# creating a set of stop words
stop_words = set(stopwords.words('english'))

# combining the 2 sets with an "or" operator (i.e. "|")
all_stops = stop_words | punc


def _clean_description(text):
    """Return the lower-cased, purely alphabetic, non-stop-word tokens of ``text``."""
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token.isalpha() and token not in all_stops]


# One token list per description, in the same order as the rows of df.
clean_desc = [_clean_description(text) for text in df['description'].to_list()]

In [None]:
# Re-join each token list into a single space-separated string for the sentiment analyzer.
clean_desc_untok = [' '.join(tokens) for tokens in clean_desc]

# Original and cleaned text side by side.
desc_df = pd.DataFrame({
    'original_desc': df['description'].to_list(),
    'untok_description': clean_desc_untok,
})

# Corpus fetched before the NaiveBayesAnalyzer is used.
nltk.download('movie_reviews')
tb = Blobber(analyzer=NaiveBayesAnalyzer())
blob = [tb(text) for text in desc_df['untok_description']]
sentiment_values = [text.sentiment for text in blob]

# One row per description, built directly from the sentiment triples. This avoids
# creating a 3 x N frame (one column per review) and transposing it, and keeps
# 'pos' / 'neg' numeric rather than object dtype.
# index=df.index lets a later df.join(stats) line up row for row.
stats = pd.DataFrame(sentiment_values, columns=['clf','pos','neg'], index=df.index)

In [None]:
# Number of descriptions per sentiment label. Keyword form, matching the earlier
# countplot call.
# NOTE(review): recent seaborn releases reserve the first positional argument for
# `data` — confirm against the installed version.
sns.countplot(x='clf', data=stats)

# Add the sentiment columns to the main frame (index-aligned join).
df = df.join(stats)

In [None]:
# Features: everything except the target, the raw description text and the sentiment label.
X = df.drop(columns=['test','description','clf'])
# Target as a 1-D Series; a one-column DataFrame makes scikit-learn emit a
# column-vector conversion warning on fit.
y = df['test']

In [None]:
def create_dummies_ohe(X, cat_columns, prefix='cat'):
    """One-hot encode ``cat_columns`` and append them to the remaining columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified.
    cat_columns : list of str
        Columns handed to ``pd.get_dummies``.
    prefix : str, list, dict or None, default 'cat'
        Forwarded to ``pd.get_dummies``. With the default every indicator column
        is named ``cat_<value>``, so equal values in two different source columns
        yield two identically named columns. Pass ``None`` to prefix each
        indicator with its source column name instead.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` not listed in ``cat_columns``, in their original
        order, followed by the output of ``pd.get_dummies``.

    NOTE(review): ``pd.get_dummies`` is called without ``columns=``, so (per the
    pandas documentation) only object/string/category columns are expanded and
    integer columns pass through unchanged — confirm that is intended.
    """
    categorical_x = pd.get_dummies(X[cat_columns], prefix=prefix)
    # Iterate X.columns rather than a set difference so the column order is the
    # same on every run.
    encoded = set(cat_columns)
    numeric_cols = [col for col in X.columns if col not in encoded]
    return pd.concat([X[numeric_cols], categorical_x], axis=1)

# Columns handed to the one-hot encoding helper.
categorical_columns = ['province','variety','country_iso','price_bin','winery','region_1']
X = create_dummies_ohe(X, categorical_columns)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.3,
)

In [None]:
# Continuous columns that are rescaled; all other columns are left untouched.
numeric_features = ['price','title_length','taster_name_length','avg_temp','year','pos','neg']
RB = RobustScaler()

# Work on explicit copies so the assignments below write to independent frames
# (avoids pandas' SettingWithCopyWarning on the split results).
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling statistics from the training rows only, then apply those SAME
# statistics to the test rows. Re-fitting on the test set would scale the two
# sets differently and let test-set information shape the preprocessing.
X_train[numeric_features] = RB.fit_transform(X_train[numeric_features])
X_test[numeric_features] = RB.transform(X_test[numeric_features])

# NOTE(review): this silences every warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')

In [None]:
from sklearn.metrics import f1_score

def f1(model,X_test,y_test):
    """Weighted-average F1 score of ``model`` on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict``
        A fitted estimator.
    X_test : features passed to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    The value returned by ``f1_score(..., average='weighted')``.
    """
    return f1_score(y_test, model.predict(X_test), average='weighted')

In [None]:
from sklearn.ensemble import RandomForestClassifier
# random_state pins the forest's bootstrap samples and feature draws, so the
# reported score is the same on every run (same seed as the train/test split).
rf_classifier = RandomForestClassifier(n_estimators=500,n_jobs=-1,random_state=1)
# np.ravel hands the estimator a 1-D target whether y_train is a Series or a
# one-column DataFrame.
rf_classifier.fit(X_train, np.ravel(y_train))
print(f1(rf_classifier,X_test,y_test))