In [27]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval

from scipy.stats import shapiro
from scipy.stats import spearmanr

from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to matplotlib figures created from here on.
sns.set()
# Select matplotlib's interactive 'notebook' backend for the figures in this notebook.
%matplotlib notebook

In [2]:
# Dataset location without its extension; '.csv' is appended when reading.
file = '../data/spain-red'

# 'foods' and 'highlights' are stored as text in the CSV; literal_eval turns
# each cell back into the Python literal it spells out.
literal_parsers = dict.fromkeys(('foods', 'highlights'), literal_eval)
df = pd.read_csv(f'{file}.csv', converters=literal_parsers)

In [3]:
# HTML numeric character references mapped to the characters they encode.
# Each key is applied as a regular expression to the whole frame.
diacritics = {
    '&#237;': 'í', '&#243;': 'ó', '&#250;': 'ú', '&#241;': 'ñ', '&#232;': 'è',
    '&#8364;': '€', '&#193;': 'Á', '&#192;': 'À', '&#233;': 'é', '&#224;': 'à',
    '&#239;': 'ï', '&#231;': 'ç', '&#252;': 'ü', '&#225;': 'á', '&#186;': 'º',
    '&#244;': 'ô', '&#8217;': '’', '&#960;': 'π', '&#210;': 'Ò',
}
df = df.replace(diacritics, regex=True)

# Matches an A-Z capital that either follows a lowercase/underscore/à-ÿ
# character, or is not at the very start of the string and is followed by one.
_WORD_BREAK = re.compile(r'((?<=[a-z_à-ÿ])[A-Z]|(?<!\A)[A-Z](?=[a-z_à-ÿ]))')


def _split_joined_words(text):
    """Insert a space in front of every capital letter matched by _WORD_BREAK."""
    return _WORD_BREAK.sub(r' \1', text)


def _tidy_vintage(text):
    """Normalise spacing around parentheses and numbers in a vintage label.

    Steps, in order: put a space before '(' when it directly follows a
    non-space character; trim whitespace just inside '(...)'; surround every
    integer/decimal number with spaces and strip the ends; collapse runs of
    whitespace into a single space.
    """
    text = re.sub(r'(\S)\(', r'\1 (', text)
    # Raw string: the pattern is unchanged, but '\(' and '\s' are no longer
    # invalid escape sequences in a normal string literal.
    text = re.sub(r'\(\s*(.*?)\s*\)', r'(\1)', text)
    # NOTE(review): number padding runs after the parenthesis trim, so a
    # parenthesised number such as '(2015)' ends up as '( 2015 )'. Order is
    # kept as in the original cell; confirm whether that is intended.
    text = re.sub(r'([0-9]+(\.[0-9]+)?)', r' \1 ', text).strip()
    return re.sub(r'\s+', ' ', text)


for column in ('winery', 'vintage', 'region'):
    df[column] = df[column].apply(_split_joined_words)
df['vintage'] = df['vintage'].apply(_tidy_vintage)

# Columns converted to numbers; values that cannot be parsed become NaN.
numeric_columns = ['year', 'rating', 'ratings_count', 'country_rank', 'region_rank',
                   'winery_rank', 'global_rank', 'alcohol', 'body', 'acidity', 'price']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

df = df.drop(columns=['drink_from', 'drink_until'])

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
count,2014.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,1656.0,1900.0,1900.0,2013.0
mean,2014.1286,4.147992,744.478433,3.93059,5.692117,15.629152,4.140803,14.081582,4.271579,2.963158,87.068867
std,6.252022,0.24401,1714.88318,3.456955,5.452801,12.844511,3.42667,1.634316,0.507897,0.188424,234.235746
min,1964.0,3.7,25.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,4.9
25%,2013.0,3.9,86.0,1.0,2.0,6.0,1.0,14.0,4.0,3.0,18.95
50%,2016.0,4.1,227.0,3.0,4.0,12.0,3.0,14.5,4.0,3.0,31.13
75%,2017.0,4.3,659.0,5.0,8.0,22.0,6.0,14.5,5.0,3.0,61.95
max,2020.0,4.9,32378.0,22.0,38.0,81.0,20.0,16.5,5.0,3.0,3404.94


In [23]:
# Shapiro-Wilk normality test on the rating column.
# `stat` and `p` stay module-level because the following cell reads `p`.
shapiro_result = shapiro(df['rating'])
stat, p = shapiro_result
print(f'stat={stat:.3f}, p={p:.3f}\n')

stat=0.943, p=0.000



In [21]:
# Read the p-value from the normality test against a 0.05 threshold.
verdict = 'Probably Gaussian' if p > 0.05 else 'Probably not Gaussian'
print(verdict)

Probably not Gaussian


In [26]:
# Pairwise Spearman rank correlations between the numeric columns.
# The numeric columns are selected explicitly: on pandas >= 2.0, DataFrame.corr
# no longer drops non-numeric columns by default and raises on string columns.
df.select_dtypes(include='number').corr(method='spearman')

Unnamed: 0,year,rating,ratings_count,country_rank,region_rank,winery_rank,global_rank,alcohol,body,acidity,price
year,1.0,-0.321934,0.014126,0.28021,0.241985,0.14913,0.290808,0.107441,0.11157,0.006951,-0.492367
rating,-0.321934,1.0,-0.133982,-0.839422,-0.698224,-0.316268,-0.854556,0.167142,0.130101,-0.027448,0.762198
ratings_count,0.014126,-0.133982,1.0,-0.288074,-0.206691,-0.088453,-0.260282,-0.044829,0.113782,0.026892,-0.178474
country_rank,0.28021,-0.839422,-0.288074,1.0,0.795295,0.391337,0.98549,-0.16198,-0.131004,0.02136,-0.598353
region_rank,0.241985,-0.698224,-0.206691,0.795295,1.0,0.547022,0.800327,-0.078282,-0.016344,0.060616,-0.447994
winery_rank,0.14913,-0.316268,-0.088453,0.391337,0.547022,1.0,0.393294,0.014065,0.041363,0.045056,-0.052677
global_rank,0.290808,-0.854556,-0.260282,0.98549,0.800327,0.393294,1.0,-0.163564,-0.130495,0.020221,-0.618869
alcohol,0.107441,0.167142,-0.044829,-0.16198,-0.078282,0.014065,-0.163564,1.0,0.199177,0.007721,0.095341
body,0.11157,0.130101,0.113782,-0.131004,-0.016344,0.041363,-0.130495,0.199177,1.0,0.112068,0.062939
acidity,0.006951,-0.027448,0.026892,0.02136,0.060616,0.045056,0.020221,0.007721,0.112068,1.0,-0.023272


In [32]:
# Spearman rank correlation between rating and price, with its p-value.
# NaN values are omitted from the calculation (nan_policy='omit').
rating_price_test = spearmanr(df['rating'], df['price'], nan_policy='omit')
corr, pvalue = rating_price_test
print(f'Correlation coefficient: {corr}')
print(f'P-value: {pvalue}')

Correlation coefficient: 0.762197851487197
P-value: 0.0


In [35]:
# Hex color used for the markers of the scatter plot below.
plot_color = '#ad1a33'

In [39]:
# Scatter plot of rating against price, saved to images/price_vs_rating.png.

# Only wines priced below this value are plotted.
# NOTE(review): presumably chosen so a few very expensive wines do not
# compress the x-axis - confirm.
PRICE_CAP = 800

fig, ax = plt.subplots(figsize=(9, 6))

# `df['rating'] != None` is True for every row, NaN included, so it never
# filtered anything; notna() actually drops rows without a rating.
# Rows with a missing price are already excluded because NaN < PRICE_CAP is False.
rate_price = df[(df['price'] < PRICE_CAP) & df['rating'].notna()]

x = rate_price['price']
y = rate_price['rating']

ax.scatter(x, y, color=plot_color, alpha=.7)

ax.set_xlabel('Price (euro)')
ax.set_ylabel('Rating')
ax.set_title('Price vs. Rating', fontsize=20)
fig.savefig('images/price_vs_rating.png')

<IPython.core.display.Javascript object>