### Import Dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
import warnings
warnings.filterwarnings('ignore')

### Import Dataset

In [5]:
df = pd.read_csv('wines_SPA.csv')

### EDA

In [7]:
df = df.drop(columns=['winery','region','num_reviews', 'country'])

In [8]:
df = df.dropna()

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6329 entries, 0 to 7499
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   wine     6329 non-null   object 
 1   year     6329 non-null   object 
 2   rating   6329 non-null   float64
 3   price    6329 non-null   float64
 4   type     6329 non-null   object 
 5   body     6329 non-null   float64
 6   acidity  6329 non-null   float64
dtypes: float64(4), object(3)
memory usage: 395.6+ KB


#### Label Encode

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()

In [13]:
le.fit(df['wine'])
df['wine'] = le.transform(df['wine'])

In [14]:
le.fit(df['type'])
df['type'] = le.transform(df['type'])

In [15]:
df.head()

Unnamed: 0,wine,year,rating,price,type,body,acidity
0,667,2013,4.9,995.0,19,5.0,3.0
1,714,2018,4.9,313.5,18,4.0,2.0
2,683,2009,4.8,324.95,11,5.0,3.0
3,683,1999,4.8,692.96,11,5.0,3.0
4,683,1996,4.8,778.06,11,5.0,3.0


### Split Data

In [17]:
X = df.drop(columns = ['type', 'year'])

In [18]:
y = df['type']

## Handle Class Imbalances

### Using Random UnderSampler

In [29]:
from imblearn.under_sampling import RandomUnderSampler

In [40]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_resampled_RUS, y_resampled_RUS = undersample.fit_resample(X, y)

### Using Random OverSampler

In [38]:
from imblearn.over_sampling import RandomOverSampler

In [42]:
undersample = RandomOverSampler(sampling_strategy='minority')
X_resampled_ROS, y_resampled_ROS = undersample.fit_resample(X, y)

### Using ADASYN

In [45]:
from imblearn.over_sampling import ADASYN

In [49]:
adasyn = ADASYN(sampling_strategy='minority')
X_resampled_ADASYN, y_resampled_ADASYN = undersample.fit_resample(X, y)

## Split Data Using Train Test Split and Model it

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

### For Random UnderSampler

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_RUS, y_resampled_RUS, test_size=0.2, random_state=42)

In [64]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [68]:
predictions = model.predict(X_test)
f1 = f1_score(y_test, predictions, average='weighted')
print(f"F1-score on test set: {f1}")

F1-score on test set: 0.9132512970183851


### For Random Oversampler

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_ROS, y_resampled_ROS, test_size=0.2, random_state=42)

In [78]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [79]:
predictions = model.predict(X_test)
f1 = f1_score(y_test, predictions, average='weighted')
print(f"F1-score on test set: {f1}")

F1-score on test set: 0.9442309934022071


### For ADASYN

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_ADASYN, y_resampled_ADASYN, test_size=0.2, random_state=42)

In [85]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [87]:
predictions = model.predict(X_test)
f1 = f1_score(y_test, predictions, average='weighted')
print(f"F1-score on test set: {f1}")

F1-score on test set: 0.9458917646233587
