In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer

In [98]:
# Load the raw wine dataset into a DataFrame.
# NOTE(review): this is an absolute path and presumably environment-specific — confirm before running elsewhere.
wines_df = pd.read_csv(filepath_or_buffer='/content/wines_SPA.csv')

In [99]:
# Preview the first five rows of the raw data.
wines_df.head(n=5)

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [100]:
# Summarize column dtypes and non-null counts of the raw frame to spot missing values.
wines_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [101]:
# Collect the names of every non-object column of the raw frame as a NumPy array.
numeric_cols = wines_df.select_dtypes(exclude=["object"]).columns.to_numpy()
numeric_cols

array(['rating', 'num_reviews', 'price', 'body', 'acidity'], dtype=object)

In [102]:
# Summary statistics for the non-object columns of the raw frame.
wines_df.loc[:, numeric_cols].describe()

Unnamed: 0,rating,num_reviews,price,body,acidity
count,7500.0,7500.0,7500.0,6331.0,6331.0
mean,4.254933,451.109067,60.095822,4.158427,2.946612
std,0.118029,723.001856,150.356676,0.583352,0.248202
min,4.2,25.0,4.99,2.0,1.0
25%,4.2,389.0,18.9,4.0,3.0
50%,4.2,404.0,28.53,4.0,3.0
75%,4.2,415.0,51.35,5.0,3.0
max,4.9,32624.0,3119.08,5.0,3.0


In [103]:
# Take a deep copy of the raw frame as the starting point for the cleaned data.
Cleaned_wines_df = wines_df.copy(deep=True)

In [104]:
# Convert the object-typed 'year' column to a nullable 32-bit integer for easy
# manipulation and comparisons. Values that cannot be parsed as numbers are
# coerced to missing (errors='coerce') rather than raising.
Cleaned_wines_df['year'] = (
    pd.to_numeric(wines_df['year'], errors='coerce')
    .astype(pd.Int32Dtype())
)
Cleaned_wines_df['year'].dtype

Int32Dtype()

In [105]:
# Drop every row whose 'year' or 'type' value is missing.
Cleaned_wines_df = Cleaned_wines_df.dropna(subset=['year', 'type'])
# Report the remaining null counts for the two columns that were filtered on.
print("Number of null values in column year ",Cleaned_wines_df['year'].isna().sum())
print("Number of null values in column type ",Cleaned_wines_df['type'].isna().sum())


Number of null values in column year  0
Number of null values in column type  0


In [106]:
# Refresh numeric_cols from the cleaned frame: 'year' is no longer object-typed,
# so it is now selected as well.
numeric_cols = Cleaned_wines_df.select_dtypes(exclude=["object"]).columns.to_numpy()
numeric_cols

array(['year', 'rating', 'num_reviews', 'price', 'body', 'acidity'],
      dtype=object)

In [107]:
# Summary statistics for the non-object columns of the cleaned frame.
Cleaned_wines_df.loc[:, numeric_cols].describe()

Unnamed: 0,year,rating,num_reviews,price,body,acidity
count,6672.0,6672.0,6672.0,6672.0,6070.0,6070.0
mean,2013.023531,4.257584,454.21298,64.879163,4.163756,2.947117
std,6.944037,0.121152,631.810377,158.341489,0.593981,0.242883
min,1910.0,4.2,25.0,6.26,2.0,1.0
25%,2011.0,4.2,389.0,19.98,4.0,3.0
50%,2015.0,4.2,402.0,31.63,4.0,3.0
75%,2017.0,4.2,417.0,61.5725,5.0,3.0
max,2021.0,4.9,16505.0,3119.08,5.0,3.0


In [108]:
# Replace the null values in 'acidity' and 'body' with the mean of each column.
# NOTE(review): mean imputation introduces fractional values into columns whose
# observed values look like whole-number scores — confirm this is intended.
imputer = SimpleImputer(strategy="mean")
Cleaned_wines_df[['acidity', 'body']] = imputer.fit_transform(
    Cleaned_wines_df[['acidity', 'body']]
)

In [109]:
# Confirm the imputation left no missing values in either imputed column.
# The first label previously said "price" although the count is for 'acidity'.
print("Null count in acidity ",Cleaned_wines_df['acidity'].isna().sum())
print("Null count in body ",Cleaned_wines_df['body'].isna().sum())

Null count in price  0
Null count in body  0


In [110]:
# Summary statistics of the cleaned frame after imputation, to compare against the pre-imputation summary.
Cleaned_wines_df.describe()

Unnamed: 0,year,rating,num_reviews,price,body,acidity
count,6672.0,6672.0,6672.0,6672.0,6672.0,6672.0
mean,2013.023531,4.257584,454.21298,64.879163,4.163756,2.947117
std,6.944037,0.121152,631.810377,158.341489,0.566547,0.231665
min,1910.0,4.2,25.0,6.26,2.0,1.0
25%,2011.0,4.2,389.0,19.98,4.0,3.0
50%,2015.0,4.2,402.0,31.63,4.0,3.0
75%,2017.0,4.2,417.0,61.5725,4.163756,3.0
max,2021.0,4.9,16505.0,3119.08,5.0,3.0


In [112]:
# Final check of column dtypes and non-null counts on the cleaned frame.
Cleaned_wines_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6672 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       6672 non-null   object 
 1   wine         6672 non-null   object 
 2   year         6672 non-null   Int32  
 3   rating       6672 non-null   float64
 4   num_reviews  6672 non-null   int64  
 5   country      6672 non-null   object 
 6   region       6672 non-null   object 
 7   price        6672 non-null   float64
 8   type         6672 non-null   object 
 9   body         6672 non-null   float64
 10  acidity      6672 non-null   float64
dtypes: Int32(1), float64(4), int64(1), object(5)
memory usage: 606.0+ KB
