# Importing Libraries

In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing Dataset

In [56]:
# Read the wine dataset from the CSV file into a DataFrame.
data = pd.read_csv('wines_SPA.csv')
# Preview the first five rows.
data.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [57]:
# Show each column's dtype and non-null count to spot missing data and mistyped columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7498 non-null   object 
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: float64(4), int64(1), object(6)
memory usage: 644.7+ KB


In [58]:
# Count the missing values in each column.
data.isnull().sum()

winery            0
wine              0
year              2
rating            0
num_reviews       0
country           0
region            0
price             0
type            545
body           1169
acidity        1169
dtype: int64

# Data Cleaning

## The data has null values in these columns: year, type, body, acidity

#### The `year` column holds years but is stored with the object dtype, and it contains `N.V` entries that act as null values. I think it is better to convert it to an integer dtype and fill its null values with the mean.

In [59]:
# Turn unparseable year strings into missing values, then store the column
# with the nullable integer dtype so missing entries can coexist with integers.
year_as_number = pd.to_numeric(data['year'], errors='coerce')
data['year'] = year_as_number.astype('Int64')


In [60]:
# Show the dtypes and non-null counts again.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7210 non-null   Int64  
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         6955 non-null   object 
 9   body         6331 non-null   float64
 10  acidity      6331 non-null   float64
dtypes: Int64(1), float64(4), int64(1), object(5)
memory usage: 652.0+ KB


#### With the object dtype there were only 2 null values, because `N.V` was not treated as a null/missing value. After converting to the Int64 dtype those entries are also counted as missing, so the `year` column now has 290 null values.

In [61]:
# Fill the missing years with the column mean, truncated to a whole year.
# int() truncates toward zero just like the previous `.astype(int)`, but it does
# not rely on `.mean()` returning a NumPy scalar (a plain float has no `.astype`).
year_mean = data['year'].mean()
# If every year is missing the mean is itself missing; skip the fill instead of crashing.
if pd.notna(year_mean):
    data['year'] = data['year'].fillna(int(year_mean))
print("Null values left in column year : ", data['year'].isnull().sum())

Null values left in column year :  0


### To handle the null values in the `body` and `acidity` columns, I will fill them with the mean of the respective column

In [62]:
# Fill the gaps in each of these columns with that column's own mean,
# then report how many nulls remain in each.
mean_filled_columns = ('body', 'acidity')
for column in mean_filled_columns:
    data[column] = data[column].fillna(data[column].mean())
for column in mean_filled_columns:
    print(f"Null values left in column {column} : ", data[column].isnull().sum())

Null values left in column body :  0
Null values left in column acidity :  0


### To handle the missing values in the `type` column, I will fill them with its mode

In [63]:
# Fill missing wine types with the most frequent type (first entry of the mode).
most_common_type = data['type'].mode().iloc[0]
data['type'] = data['type'].fillna(most_common_type)
remaining_type_nulls = data['type'].isnull().sum()
print("Null values left in column type : ", remaining_type_nulls)


Null values left in column type :  0


In [64]:
# Final check: print the dtypes and non-null counts for every column...
data.info()
# ...and display the per-column count of missing values.
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winery       7500 non-null   object 
 1   wine         7500 non-null   object 
 2   year         7500 non-null   Int64  
 3   rating       7500 non-null   float64
 4   num_reviews  7500 non-null   int64  
 5   country      7500 non-null   object 
 6   region       7500 non-null   object 
 7   price        7500 non-null   float64
 8   type         7500 non-null   object 
 9   body         7500 non-null   float64
 10  acidity      7500 non-null   float64
dtypes: Int64(1), float64(4), int64(1), object(5)
memory usage: 652.0+ KB


winery         0
wine           0
year           0
rating         0
num_reviews    0
country        0
region         0
price          0
type           0
body           0
acidity        0
dtype: int64

In [65]:
# Preview the first five rows of the frame.
data.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [66]:
# Write the frame to a new CSV file; index=False leaves the row index out of the file.
data.to_csv("Cleaned_wines_SPA.csv", index=False)