In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
df = pd.read_csv("wines_SPA.csv")

In [63]:
df.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [65]:
df.describe()

Unnamed: 0,rating,num_reviews,price,body,acidity
count,7500.0,7500.0,7500.0,6331.0,6331.0
mean,4.254933,451.109067,60.095822,4.158427,2.946612
std,0.118029,723.001856,150.356676,0.583352,0.248202
min,4.2,25.0,4.99,2.0,1.0
25%,4.2,389.0,18.9,4.0,3.0
50%,4.2,404.0,28.53,4.0,3.0
75%,4.2,415.0,51.35,5.0,3.0
max,4.9,32624.0,3119.08,5.0,3.0


In [66]:
df.isnull().sum()

Unnamed: 0,0
winery,0
wine,0
year,2
rating,0
num_reviews,0
country,0
region,0
price,0
type,545
body,1169


In [67]:
missing_rows = df[df['year'].isnull()]
missing_rows

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
46,Vega Sicilia,Unico Reserva Especial Edicion,,4.7,12421,Espana,Ribera del Duero,423.5,Ribera Del Duero Red,5.0,3.0
851,La Unica,Fourth Edition,,4.4,131,Espana,Vino de Espana,40.0,Tempranillo,4.0,2.0


In [68]:
df['year'].unique()

array(['2013', '2018', '2009', '1999', '1996', '1998', '2010', '1995',
       '2015', '2011', '2016', '1970', '1946', '1962', '2019', '2004',
       'N.V.', '1931', '1979', '2005', '2020', '2014', '1985', '1929',
       '2007', '2012', '2017', '2008', nan, '2006', '2000', '2003',
       '2002', '1991', '1994', '1990', '1989', '1987', '1986', '1981',
       '2001', '1968', '1964', '1982', '1974', '1983', '1955', '1980',
       '1972', '1953', '1958', '1942', '1965', '1992', '1973', '1997',
       '1967', '1975', '1910', '1961', '1954', '1988', '1969', '1951',
       '1928', '1976', '1949', '2021', '1959', '1922', '1978', '1925'],
      dtype=object)

N.V. -> Non Vintage

In [69]:
df['year'] = df['year'].replace('N.V.', np.nan)

In [70]:
df['year'] = pd.to_numeric(df['year'], errors='coerce')

In [71]:
df['year'].describe()

Unnamed: 0,year
count,7210.0
mean,2013.495839
std,6.94045
min,1910.0
25%,2011.0
50%,2015.0
75%,2017.0
max,2021.0


In [72]:
df['year'].isna().sum()

np.int64(290)

In [73]:
df['year'] = df.groupby(['winery', 'wine', 'country', 'region', 'type', 'body', 'acidity'])['year'].transform(lambda x: x.fillna(x.median()))


In [74]:
df['year'].fillna(df['year'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['year'].fillna(df['year'].median(), inplace=True)


In [75]:
df['year'].isnull().sum()

np.int64(0)

Now to handle the missing values in type, body and acidity column, this is what I have done:

1. Created a temporary acidity column with the same values as in the acidity column and use this and a couple of other factors that might affect the body of the wine to fill in the missing values in the column.

2. Now that values are filled in the body tab I use these to fill in the acidity values

3. Now that i have both acidity and body I used these and the other necessary factors that I felt were responsible to affect the type of the wine.


In [77]:
df['temp_acidity'] = df['acidity']
df['temp_acidity'] = df.groupby('type')['temp_acidity'].transform(lambda x: x.fillna(x.median()))


In [78]:
df['temp_body'] = df['body']
df['temp_body'] = df.groupby(['region', 'country', 'type', 'temp_acidity'])['temp_body'].transform(lambda x: x.fillna(x.median()))


In [79]:
df['acidity'] = df.groupby(['region', 'country', 'type', 'temp_body'])['acidity'].transform(lambda x: x.fillna(x.median()))


In [80]:
df['body'] = df.groupby(['region', 'country', 'type', 'acidity'])['body'].transform(lambda x: x.fillna(x.median()))


In [81]:
df['type'] = df.groupby(['region', 'body', 'acidity'])['type'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else "Unknown"))


In [82]:
df.drop(columns=['temp_acidity', 'temp_body'], inplace=True)


In [83]:
df.isna().sum()

Unnamed: 0,0
winery,0
wine,0
year,0
rating,0
num_reviews,0
country,0
region,0
price,0
type,550
body,550


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7500 non-null   float64
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6950 non-null   object 
 9   body         6950 non-null   float64
 10  acidity      6950 non-null   float64
dtypes: float64(5), int64(1), object(5)
memory usage: 644.7+ KB


In [86]:
df['acidity'] = df.groupby('type')['acidity'].transform(lambda x: x.fillna(x.median()))
df['body'] = df.groupby('type')['body'].transform(lambda x: x.fillna(x.median()))


In [87]:
df.isna().sum()

Unnamed: 0,0
winery,0
wine,0
year,0
rating,0
num_reviews,0
country,0
region,0
price,0
type,550
body,550


In [88]:
df['acidity'].fillna(df['acidity'].median(), inplace=True)
df['body'].fillna(df['body'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['acidity'].fillna(df['acidity'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['body'].fillna(df['body'].median(), inplace=True)


In [91]:
df.isna().sum()

Unnamed: 0,0
winery,0
wine,0
year,0
rating,0
num_reviews,0
country,0
region,0
price,0
type,0
body,0


In [90]:
df['type'].fillna(df['type'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['type'].fillna(df['type'].mode()[0], inplace=True)


In [94]:
df.to_csv('cleaned_wine_data.csv', index=False)