SCIKIT LEARN - simple tool for data mining and data analysis
build on NumPy, SciPy and matplotlib


What can we achieve with sklearn? 
1. Classification / identifying with category an object belongs to (spam detection)
2. Regression / perdicting an attribute associated with an object (stock prices predictio, weather prediction)
3. CLustering / automatic grouping of similar objects into sets (customer segmentation)
4. Model Selection / compating, validating and choosing parameters and models (imporoving model accurancy via parameter tuning, compare different models)
5. Dimensionality reduction / reducing the number of random variables to consider (to increase model efficiency, less biased)
6. Preprocessing / feature extraction and normalization (transforming input data such as text for use with ML algos) 


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier #
from sklearn.svm import SVC #support vector classifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier #
# from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder # the most commonly used preprocessing - standardscaler
from sklearn.model_selection import train_test_split #for splitting data into different sections
%matplotlib inline 
#only for Jupiter notebook



In [2]:
wine = pd.read_csv('winequality-data.csv')
wine.tail()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,id
3913,6.5,0.28,0.38,7.8,0.031,54.0,216.0,0.99154,3.03,0.42,13.1,6,4894
3914,6.6,0.21,0.29,5.35,0.029,43.0,106.0,0.99112,2.93,0.43,11.5,7,4895
3915,8.6,0.18,0.28,0.8,0.032,25.0,78.0,0.99104,2.99,0.38,11.1,5,4896
3916,6.4,0.22,0.38,9.1,0.044,35.0,127.0,0.99326,2.97,0.3,11.0,7,4897
3917,7.2,0.46,0.65,10.4,0.05,76.0,192.0,0.9976,3.16,0.42,8.7,5,4898


In [3]:
wine.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,id
count,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0,3918.0
mean,6.85462,0.278027,0.333201,6.405819,0.045566,35.326187,138.395227,0.994026,3.187989,0.489704,10.518765,5.877489,2939.5
std,0.841879,0.100459,0.121375,5.107972,0.021138,17.158043,42.99682,0.003025,0.150022,0.114186,1.237195,0.890364,1131.173506
min,3.8,0.08,0.0,0.6,0.009,3.0,9.0,0.98711,2.74,0.22,8.0,3.0,981.0
25%,6.3,0.21,0.27,1.7,0.036,23.0,107.25,0.9917,3.09,0.41,9.5,5.0,1960.25
50%,6.8,0.26,0.31,5.2,0.043,34.0,134.0,0.99373,3.18,0.47,10.4,6.0,2939.5
75%,7.3,0.32,0.38,9.9,0.05,46.0,168.0,0.99616,3.28,0.55,11.4,6.0,3918.75
max,14.2,1.1,1.66,65.8,0.346,289.0,440.0,1.03898,3.8,1.08,14.2,9.0,4898.0


In [4]:
wine.dtypes

fixed.acidity           float64
volatile.acidity        float64
citric.acid             float64
residual.sugar          float64
chlorides               float64
free.sulfur.dioxide     float64
total.sulfur.dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
id                        int64
dtype: object

In [5]:
wine.columns.values

array(['fixed.acidity', 'volatile.acidity', 'citric.acid',
       'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
       'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'id'], dtype=object)

In [6]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3918 entries, 0 to 3917
Data columns (total 13 columns):
fixed.acidity           3918 non-null float64
volatile.acidity        3918 non-null float64
citric.acid             3918 non-null float64
residual.sugar          3918 non-null float64
chlorides               3918 non-null float64
free.sulfur.dioxide     3918 non-null float64
total.sulfur.dioxide    3918 non-null float64
density                 3918 non-null float64
pH                      3918 non-null float64
sulphates               3918 non-null float64
alcohol                 3918 non-null float64
quality                 3918 non-null int64
id                      3918 non-null int64
dtypes: float64(11), int64(2)
memory usage: 398.0 KB


In [7]:
wine.isnull().head(5)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,id
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
wine.isnull().values.any() # faster way

False

In [18]:
wine.isnull().sum()

fixed.acidity           0
volatile.acidity        0
citric.acid             0
residual.sugar          0
chlorides               0
free.sulfur.dioxide     0
total.sulfur.dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 5
id                      0
dtype: int64

In [10]:
#Preprocessing of the data
bins = (2,6.5,8) # quaility is between 2 and 8 (actually 3-9). 
# just 2 bins of quality / good and bad/ exact index of 8 - 0 to 8
# 6.5 - spread, we need a higher quality 
group_names= ['bad', 'good']
wine['quality']=pd.cut(wine['quality'], bins=bins, labels=group_names)
# cutting the quality and replacing it
wine['quality'].unique()

[bad, good, NaN]
Categories (2, object): [bad < good]

In [11]:
label_quality = LabelEncoder()

In [12]:
wine['quality']=label_quality.fit_transform(wine['quality'])

TypeError: '<' not supported between instances of 'str' and 'float'

In [13]:
wine['quality'].value_counts()

bad     3066
good     847
Name: quality, dtype: int64

In [16]:
wine['quality'].isnull().sum()

5

In [17]:
wine.count()

fixed.acidity           3918
volatile.acidity        3918
citric.acid             3918
residual.sugar          3918
chlorides               3918
free.sulfur.dioxide     3918
total.sulfur.dioxide    3918
density                 3918
pH                      3918
sulphates               3918
alcohol                 3918
quality                 3913
id                      3918
dtype: int64

In [23]:
wine['quality'].dropna().head()

0     bad
1    good
2     bad
3    good
4     bad
Name: quality, dtype: category
Categories (2, object): [bad < good]

In [27]:
wine['quality'].fillna(wine.mean())

0        bad
1       good
2        bad
3       good
4        bad
5        bad
6        bad
7        bad
8        bad
9       good
10       bad
11       bad
12       bad
13       bad
14       bad
15       bad
16       bad
17       bad
18       bad
19       bad
20       bad
21       bad
22       bad
23       bad
24       bad
25       bad
26       bad
27       bad
28       bad
29      good
        ... 
3888     bad
3889     bad
3890    good
3891     bad
3892     bad
3893     bad
3894     bad
3895     bad
3896     bad
3897     bad
3898    good
3899     bad
3900     bad
3901    good
3902    good
3903     bad
3904     bad
3905    good
3906     bad
3907     bad
3908     bad
3909    good
3910     bad
3911     bad
3912     bad
3913     bad
3914    good
3915     bad
3916    good
3917     bad
Name: quality, Length: 3918, dtype: category
Categories (2, object): [bad < good]

In [28]:
wine['quality'].isnull().sum()

5