# Importing some libraries for EDA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so library warnings raised by later cells are hidden too.
import warnings
warnings.filterwarnings("ignore")

# Loading the Wine Dataset

In [None]:
# Read the wine dataset from the CSV file next to the notebook.
df = pd.read_csv("winedataset.csv")
df.head()  # preview the first 5 rows

Looking at the first 5 rows of data, it can be inferred that the dataset is about the amounts of the ingredients that go into the preparation of wine. Depending on the amounts of the different ingredients, the class of the wine is defined. Let us look further into this dataset to set our target variable.

In [None]:
# Let's look at the data type and non-null count of each column
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

1. There are a total of 14 columns.
2. We can see that there are no missing values in this dataset and total number of records is 178.
3. 'Class' : Categorical variable/int64 with values 1, 2 and 3.
4. 'Alcohol' : Continuous variable/float64. Mean and median are almost the same, so no outliers.
5. 'Malic acid' : Continuous variable/float64. The difference between mean and median is within an acceptable range, so no outliers.
6. 'Ash' : Continuous variable/float64. Mean and median are very close, so no outliers.
7. 'Alcalinity of ash' : Continuous variable/float64. Mean and median are very close, so no outliers.
8. 'Magnesium' : Continuous variable/int64. Mean and median are very close, so no outliers.
9. 'Total phenols' : Continuous variable/float64. Mean and median are very close, so no outliers.
10. 'Flavanoids' : Continuous variable/float64. Mean and median are very close, so no outliers.
11. 'Nonflavanoid phenols' : Continuous variable/float64. Mean and median are very close, so no outliers.
12. 'Proanthocyanins' : Continuous variable/float64. Mean and median are very close, so no outliers.
13. 'Color intensity' : Continuous variable/float64. Mean and median are very close, so no outliers.
14. 'Hue' : Continuous variable/float64. Mean and median are very close, so no outliers.
15. 'diluted wines' : Continuous variable/float64. Mean and median are very close, so no outliers.
16. 'Proline' : Continuous variable/int64. Mean and median are very close, so no outliers.

# Let's see the Heatmap for the correlation between the variables

In [None]:
# Pairwise correlation matrix of the columns; plotted as a heatmap in the next cell
corr = df.corr()

In [None]:
# Draw the annotated correlation heatmap on one wide figure.
# A single Axes is the subplots() default, so nrows/ncols need not be spelled out.
fig, axes = plt.subplots(figsize=(18, 10))
sns.heatmap(data=corr, annot=True, ax=axes)
plt.show()

For the objective of this project, let's select Class as the target variable.
We can drop the column 'Ash' since it has a very low correlation (-0.05) with 'Class'. We will keep all the remaining columns.

In [None]:
# Drop the 'Ash' column without mutating in place.
# Rebinding `df` together with errors='ignore' makes this cell safe to re-run:
# the previous in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns='Ash', errors='ignore')
df.head(2)

# Data Visualization

In [None]:
# Bar plot of every remaining feature against the wine class, one figure per feature.
# A single loop replaces twelve copy-pasted cells that differed only in the column name.
# NOTE(review): 'Proline    ' keeps its trailing spaces exactly as the original cell
# wrote it — presumably that is how the CSV header is spelled; confirm.
features_to_plot = [
    'Alcohol', 'Malic acid', 'Alcalinity of ash', 'Magnesium',
    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'diluted wines', 'Proline    ',
]

for feature in features_to_plot:
    sns.barplot(x=df['Class'], y=df[feature])
    plt.show()  # flush the current figure so each feature gets its own plot

# Splitting the Dataset into x and y

In [None]:
# Features: every column except the target.
x = df.drop(columns=['Class'])
# Target as a 1-D Series. scikit-learn estimators expect y of shape (n_samples,);
# the previous one-column DataFrame makes fit() emit a DataConversionWarning,
# which the notebook's blanket warning filter was hiding.
y = df['Class']

In [None]:
# Quick look at the first two rows of the feature matrix
x.head(2)

In [None]:
# Quick look at the first two rows of the target
y.head(2)

# Finding Best random state

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Search the train/test split seeds 30..200 (inclusive) and remember the seed whose
# 80/20 split gives the highest logistic-regression accuracy on the held-out part.
# NOTE(review): choosing the seed by test-set accuracy yields an optimistic score;
# treat the printed accuracy as a best case rather than an unbiased estimate.
rand_state = 0
accu = 0
# Fix: the original `for i in (30,200)` iterated over a 2-tuple, so only the seeds
# 30 and 200 were ever tried. range(30, 201) covers the whole interval and still
# includes both of those seeds.
for i in range(30, 201):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=i)
    lg=LogisticRegression()
    lg.fit(x_train,y_train)
    y_pred = lg.predict(x_test)
    tempaccu=accuracy_score(y_test,y_pred)
    # Strict '>' keeps the earliest seed when several seeds tie for the best accuracy.
    if tempaccu > accu:
        accu = tempaccu
        rand_state = i

print(f"Best Accuracy {accu*100} found on Random state {rand_state}")

In this run the best random state was found at 30; we will use the best random state in `train_test_split` in the next step.

In [None]:
# Re-create the train/test split with the best random state found by the search above.
# Using `rand_state` instead of a hard-coded 30 keeps this cell in sync with
# whatever the search actually reports.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=rand_state)

# Importing Classification libraries for model building

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Candidate classifiers, each trained with its default hyper-parameters.
algo = [LogisticRegression,RandomForestClassifier,SGDClassifier,DecisionTreeClassifier,
        KNeighborsClassifier,GaussianNB]

# Fixed seed so the randomised estimators produce the same report on every run.
MODEL_SEED = 0

for each in algo:
    model = each()
    # Pin the seed on every estimator that exposes a `random_state` parameter;
    # estimators without one are left untouched.
    if "random_state" in model.get_params():
        model.set_params(random_state=MODEL_SEED)
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    # The header line is the estimator's repr, so seeded models show their random_state.
    print(f"{model}\n",classification_report(y_test, y_pred))
    print("\n")

LogisticRegression(), RandomForestClassifier() and GaussianNB() gave us a good accuracy. Hence we can use any of these models for the wine classification problem.