In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
red_wine = pd.read_csv(Path('Resources/Red.csv'))
red_wine.head()

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,95.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,15.5,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,7.45,2015
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,8.72,2019
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,29.15,2016


In [3]:
for i in range(len(red_wine['Price'])):
    if red_wine['Price'][i] >= 38:
        red_wine.at[i,'Price'] = 1
    else:
        red_wine.at[i,'Price'] = 0

In [4]:
red_wine.dtypes

Name                object
Country             object
Region              object
Winery              object
Rating             float64
NumberOfRatings      int64
Price              float64
Year                object
dtype: object

In [5]:
red_wine = red_wine[red_wine.Year != "N.V."]

In [6]:
red_wine['Year'] = red_wine['Year'].astype(int)
red_wine.dtypes

Name                object
Country             object
Region              object
Winery              object
Rating             float64
NumberOfRatings      int64
Price              float64
Year                 int32
dtype: object

In [7]:
red_wine_LS = red_wine.copy()

In [8]:
red_wine_LS.head(3)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,1.0,2011
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,0.0,2017
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,0.0,2015


In [9]:
red_wine_LS['Age'] = 2020 - red_wine_LS['Year']

In [10]:
red_wine_LS.head(5)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Age
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,1.0,2011,9
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,0.0,2017,3
2,Erta e China Rosso di Toscana 2015,Italy,Toscana,Renzo Masi,3.9,100,0.0,2015,5
3,Bardolino 2019,Italy,Bardolino,Cavalchina,3.5,100,0.0,2019,1
4,Ried Scheibner Pinot Noir 2016,Austria,Carnuntum,Markowitsch,3.9,100,0.0,2016,4


In [11]:
ratings_description = red_wine_LS['Rating'].describe()

ratings_description

count    8658.000000
mean        3.890148
std         0.308429
min         2.500000
25%         3.700000
50%         3.900000
75%         4.100000
max         4.800000
Name: Rating, dtype: float64

In [14]:
red_wine_LS['Price'] = red_wine_LS['Price'].astype(int)
red_wine_LS.dtypes

Name                object
Country             object
Region              object
Winery              object
Rating             float64
NumberOfRatings      int64
Price                int32
Year                 int32
Age                  int32
dtype: object

In [15]:
red_wine_LS.head(2)

Unnamed: 0,Name,Country,Region,Winery,Rating,NumberOfRatings,Price,Year,Age
0,Pomerol 2011,France,Pomerol,Château La Providence,4.2,100,0,2011,9
1,Lirac 2017,France,Lirac,Château Mont-Redon,4.3,100,0,2017,3


In [16]:
red_wine_LS.value_counts("Price")

Price
0    6516
1    2142
dtype: int64

In [17]:
y = red_wine_LS["Price"]

X = red_wine_LS.drop(columns=["Price", "Name", "Country", "Region", "Winery", "Year"])

In [18]:
X.head()

Unnamed: 0,Rating,NumberOfRatings,Age
0,4.2,100,9
1,4.3,100,3
2,3.9,100,5
3,3.5,100,1
4,3.9,100,4


In [19]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assigned a random_state of 1 to the function

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(6493, 3)

In [20]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                random_state=1)
classifier

# Fit the model using training data
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [21]:
predictions = classifier.predict(X_test)

In [22]:
balanced_accuracy_score(y_test, predictions)

0.7789597134035164

In [23]:
c_matrix = confusion_matrix(y_test, predictions)
print(c_matrix)

[[1541   88]
 [ 208  328]]


In [24]:
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1629
           1       0.79      0.61      0.69       536

    accuracy                           0.86      2165
   macro avg       0.83      0.78      0.80      2165
weighted avg       0.86      0.86      0.86      2165



## KNN Model

In [25]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy Score:",accuracy_score(y_test,y_pred))

Accuracy Score: 0.7750577367205542


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## SVC

In [26]:
from sklearn.svm import SVC
model = SVC()
model.fit(X_train,y_train)
pred_y = model.predict(X_test)

from sklearn.metrics import accuracy_score
print("Accuracy Score:",accuracy_score(y_test,pred_y))

Accuracy Score: 0.7524249422632795


## RandomOverSampler

In [29]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
resamp = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
# YOUR CODE HERE!
x_resamp, y_resamp = resamp.fit_resample(X_train, y_train)

In [30]:
# Count the distinct values of the resampled labels data
# YOUR CODE HERE!
y_resamp.value_counts()

0    4887
1    4887
Name: Price, dtype: int64

In [31]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
classifier = LogisticRegression(solver='lbfgs',
                                random_state=1)

# Fit the model using the resampled training data
# YOUR CODE HERE!
classifier.fit(x_resamp, y_resamp)

# Make a prediction using the testing data
# YOUR CODE HERE!
predictions = classifier.predict(X_test)

In [32]:
# Print the balanced_accuracy score of the model 
# YOUR CODE HERE!

resamp_ba = balanced_accuracy_score(y_test, predictions)
print(resamp_ba)

0.8403155722309263


In [33]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!
resamp_cm = confusion_matrix(y_test, predictions)
print(resamp_cm)

[[1361  268]
 [  83  453]]


In [34]:
# Print the classification report for the model
# YOUR CODE HERE!
resamp_cr = classification_report(y_test, predictions)
print(resamp_cr)

              precision    recall  f1-score   support

           0       0.94      0.84      0.89      1629
           1       0.63      0.85      0.72       536

    accuracy                           0.84      2165
   macro avg       0.79      0.84      0.80      2165
weighted avg       0.86      0.84      0.84      2165

