In [1]:
import pandas as pd

We start off with Pandas Series, which are one dimensional array objects that can hold objects of varying data types. We, then will move onto pandas Dataframes, where we will cover loading in CSVs and all the other shit you do.

In [14]:
groceries = pd.Series(data = [30, 6, 'Yes', 'No'], index = ['eggs', 'apples', 'milk', 'bread'])
groceries

eggs       30
apples      6
milk      Yes
bread      No
dtype: object

In [15]:
print('Groceries has shape:', groceries.shape)
print('Groceries has dimension:', groceries.ndim)
print('Groceries has a total of', groceries.size, 'elements')

Groceries has shape: (4,)
Groceries has dimension: 1
Groceries has a total of 4 elements


In [16]:
print('The data in Groceries is:', groceries.values)
print('The index of Groceries is:', groceries.index)

The data in Groceries is: [30 6 'Yes' 'No']
The index of Groceries is: Index(['eggs', 'apples', 'milk', 'bread'], dtype='object')


In [17]:
print('How many eggs do we need to buy:', groceries['eggs'])

How many eggs do we need to buy: 30


In [18]:
print('Do we need milk and bread:\n', groceries[['milk', 'bread']]) 

Do we need milk and bread:
 milk     Yes
bread     No
dtype: object


Pandas Series have two attributes, .loc and .iloc to explicitly state what we mean. The attribute .loc stands for location and it is used to explicitly state that we are using a labeled index. Similarly, the attribute .iloc stands for integer location and it is used to explicitly state that we are using a numerical index. Let's see some examples:

In [19]:
print('How many eggs and apples do we need to buy:\n', groceries.loc[['eggs', 'apples']]) 

How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object


In [20]:
print('How many eggs and apples do we need to buy:\n',  groceries[[0, 1]]) 


How many eggs and apples do we need to buy:
 eggs      30
apples     6
dtype: object


In [21]:
print('Do we need bread:\n', groceries[[-1]]) 

Do we need bread:
 bread    No
dtype: object


In [22]:
print('Do we need milk and bread:\n', groceries.iloc[[2, 3]]) 


Do we need milk and bread:
 milk     Yes
bread     No
dtype: object


Pandas Series are also mutable like NumPy ndarrays, which means we can change the elements of a Pandas Series after it has been created.

In [23]:
print('Original Grocery List:\n', groceries)

Original Grocery List:
 eggs       30
apples      6
milk      Yes
bread      No
dtype: object


In [25]:
groceries['eggs'] = 2
print('Modified Grocery List:\n', groceries)

Modified Grocery List:
 eggs        2
apples      6
milk      Yes
bread      No
dtype: object


We can also delete items from a Pandas Series by using the .drop() method. The Series.drop(label) method removes the given label from the given Series.

In [28]:
groceries.drop('apples')

eggs       2
milk     Yes
bread     No
dtype: object

In [29]:
print(groceries) #Why is apples still there? It turns out when you drop, the original series is unaffected.
#to change the original series, set inplace = True

eggs        2
apples      6
milk      Yes
bread      No
dtype: object


In [None]:
groceries.drop('apples', inplace = True)

In [36]:
print(groceries)

eggs       2
milk     Yes
bread     No
dtype: object


## Arithmetic Ops ##

In [42]:
grocery_list = pd.Series(data = [30, 6, 5, 10], index = ['eggs', 'apples', 'milk', 'bread'])
grocery_list

eggs      30
apples     6
milk       5
bread     10
dtype: int64

In [46]:
#add 2 more of each to grocery_list
print(grocery_list + 2)
#multiply by 2 
print("\n Multiplied List\n",grocery_list * 2) 

eggs      32
apples     8
milk       7
bread     12
dtype: int64

 Multiplied List
 eggs      60
apples    12
milk      10
bread     20
dtype: int64


We can do the same math operations we did in numpy.

In [47]:
import numpy as np
np.power(grocery_list,2)

eggs      900
apples     36
milk       25
bread     100
dtype: int64

We can apply operations to specific elements


In [51]:
print('Original amount', grocery_list['eggs'],'\n')
print('New amount: ',grocery_list['eggs'] + 2)

Original amount 30 

New amount:  32


In [56]:
print('Amount of apples - 2 = ', groceries.iloc[1] - 2)
print('We halve the amount of eggs and apples:\n', groceries.loc[['eggs', 'apples']] / 2)


Amount of apples - 2 =  4
We halve the amount of eggs and apples:
 eggs      15
apples     3
dtype: object


In [57]:
wine_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
wine_df.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [60]:
print(wine_df.shape)
print(wine_df.ndim)
print(wine_df.size)
print(wine_df.values)
print(wine_df.index)
print(wine_df.columns)

(1599, 12)
2
19188
[[ 7.4    0.7    0.    ...  0.56   9.4    5.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    5.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    5.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     6.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    5.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     6.   ]]
RangeIndex(start=0, stop=1599, step=1)
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [8]:
wine_df['fixed acidity'].head()

0     7.4
1     7.8
2     7.8
3    11.2
4     7.4
Name: fixed acidity, dtype: float64

It is important to know that when accessing individual elements in a DataFrame, the labels should always be provided with the column label first, i.e. in the form dataframe[column][row]. 

In [67]:
wine_df['fixed acidity'][2]

1599

In [70]:
#we can modify data 
wine_df['fixed acidity'][2]=9.8 
wine_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,9.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [72]:
#create a new column. Automatically added to end of the dataframe. 
wine_df['ratings'] = wine_df['residual sugar'] + wine_df['citric acid']
wine_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,ratings
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1.90
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,2.60
2,9.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2.34
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,2.46
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,2.08
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,2.30
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,2.43
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,2.12


In [77]:
#Suppose you want to add a new row, perhaps a new wine
arbor_gold = [{'fixed acidity': 9.1, 'volatile acidity': 0.87, 'citric acid':0.54, 
               'residual sugar': 1.5, 'chlorides': 0.082, 'sulfur dioxide': 12.25, 
              'sulfur dioxide': 44, 'density': 3.26, 'pH':10.21, 'sulphates':0.56, 'alcohol':10.11,
              'quality': 7, 'ratings': 8.2}]
new_df = pd.DataFrame(arbor_gold, index=[1600])
new_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,sulfur dioxide,density,pH,sulphates,alcohol,quality,ratings
1600,9.1,0.87,0.54,1.5,0.082,44,3.26,10.21,0.56,10.11,7,8.2


In [86]:
wine_df = wine_df.append(new_df, sort = "True")
wine_df
#Notice that by appending a new row to the DataFrame, 
#the columns have been put in alphabetical order.

#also i hit append multiple times, sorry


Unnamed: 0,alcohol,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
0,9.40,0.076,0.00,0.9978,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.70
1,9.80,0.098,0.00,0.9968,7.8,25.0,3.20,5,2.60,2.6,,0.68,67.0,0.88
2,9.80,0.092,0.04,0.9970,9.8,15.0,3.26,5,2.34,2.3,,0.65,54.0,0.76
3,9.80,0.075,0.56,0.9980,11.2,17.0,3.16,6,2.46,1.9,,0.58,60.0,0.28
4,9.40,0.076,0.00,0.9978,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600,10.11,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,10.11,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,10.11,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,10.11,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87


In [89]:
#let's delete a column
wine_df.pop('alcohol') #pop only for deleting columns

In [90]:
wine_df

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
0,0.076,0.00,0.9978,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.70
1,0.098,0.00,0.9968,7.8,25.0,3.20,5,2.60,2.6,,0.68,67.0,0.88
2,0.092,0.04,0.9970,9.8,15.0,3.26,5,2.34,2.3,,0.65,54.0,0.76
3,0.075,0.56,0.9980,11.2,17.0,3.16,6,2.46,1.9,,0.58,60.0,0.28
4,0.076,0.00,0.9978,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1600,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87
1600,0.082,0.54,3.2600,9.1,,10.21,7,8.20,1.5,44.0,0.56,,0.87


In [92]:
#delete a row, use drop
wine_df = wine_df.drop([1600], axis=0)

In [93]:
wine_df

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
0,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.700
1,0.098,0.00,0.99680,7.8,25.0,3.20,5,2.60,2.6,,0.68,67.0,0.880
2,0.092,0.04,0.99700,9.8,15.0,3.26,5,2.34,2.3,,0.65,54.0,0.760
3,0.075,0.56,0.99800,11.2,17.0,3.16,6,2.46,1.9,,0.58,60.0,0.280
4,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.090,0.08,0.99490,6.2,32.0,3.45,5,2.08,2.0,,0.58,44.0,0.600
1595,0.062,0.10,0.99512,5.9,39.0,3.52,6,2.30,2.2,,0.76,51.0,0.550
1596,0.076,0.13,0.99574,6.3,29.0,3.42,6,2.43,2.3,,0.75,40.0,0.510
1597,0.075,0.12,0.99547,5.9,32.0,3.57,5,2.12,2.0,,0.71,44.0,0.645


Lets calculate how many NaN I have. We use isnull(). 
In order to count the total number of logical True values we use the .sum() method twice. We have to use it twice because the first sum returns a Pandas Series with the sums of logical True values along columns, as we see below:
<br>
If isnull is True, that has numerical value 1 and 0 if false

In [98]:
wine_df.isnull().sum() #returns a pandaseries

chlorides                  0
citric acid                0
density                    0
fixed acidity              0
free sulfur dioxide        0
pH                         0
quality                    0
ratings                    0
residual sugar             0
sulfur dioxide          1599
sulphates                  0
total sulfur dioxide       0
volatile acidity           0
dtype: int64

In [96]:
null = wine_df.isnull().sum().sum()
print(null) #matches with the sulfur dioxide column

1599


In [97]:
wine_df.isnull() #True -> null 

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
0,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,False,False,False,False,False,False,False,False,False,True,False,False,False
1595,False,False,False,False,False,False,False,False,False,True,False,False,False
1596,False,False,False,False,False,False,False,False,False,True,False,False,False
1597,False,False,False,False,False,False,False,False,False,True,False,False,False


In [104]:
#We can count the number of non-NaN values by using .count()
wine_df.count() #returns a pandas series

chlorides               1599
citric acid             1599
density                 1599
fixed acidity           1599
free sulfur dioxide     1599
pH                      1599
quality                 1599
ratings                 1599
residual sugar          1599
sulfur dioxide             0
sulphates               1599
total sulfur dioxide    1599
volatile acidity        1599
dtype: int64

In [105]:
#let's drop rows with null values
wine_df.dropna(axis = 0) #rows

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity


In [107]:
wine_df

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
0,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.700
1,0.098,0.00,0.99680,7.8,25.0,3.20,5,2.60,2.6,,0.68,67.0,0.880
2,0.092,0.04,0.99700,9.8,15.0,3.26,5,2.34,2.3,,0.65,54.0,0.760
3,0.075,0.56,0.99800,11.2,17.0,3.16,6,2.46,1.9,,0.58,60.0,0.280
4,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,,0.56,34.0,0.700
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.090,0.08,0.99490,6.2,32.0,3.45,5,2.08,2.0,,0.58,44.0,0.600
1595,0.062,0.10,0.99512,5.9,39.0,3.52,6,2.30,2.2,,0.76,51.0,0.550
1596,0.076,0.13,0.99574,6.3,29.0,3.42,6,2.43,2.3,,0.75,40.0,0.510
1597,0.075,0.12,0.99547,5.9,32.0,3.57,5,2.12,2.0,,0.71,44.0,0.645


In [109]:
wine_df.dropna(axis = 1) #drop any column with NaN

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulphates,total sulfur dioxide,volatile acidity
0,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,0.56,34.0,0.700
1,0.098,0.00,0.99680,7.8,25.0,3.20,5,2.60,2.6,0.68,67.0,0.880
2,0.092,0.04,0.99700,9.8,15.0,3.26,5,2.34,2.3,0.65,54.0,0.760
3,0.075,0.56,0.99800,11.2,17.0,3.16,6,2.46,1.9,0.58,60.0,0.280
4,0.076,0.00,0.99780,7.4,11.0,3.51,5,1.90,1.9,0.56,34.0,0.700
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.090,0.08,0.99490,6.2,32.0,3.45,5,2.08,2.0,0.58,44.0,0.600
1595,0.062,0.10,0.99512,5.9,39.0,3.52,6,2.30,2.2,0.76,51.0,0.550
1596,0.076,0.13,0.99574,6.3,29.0,3.42,6,2.43,2.3,0.75,40.0,0.510
1597,0.075,0.12,0.99547,5.9,32.0,3.57,5,2.12,2.0,0.71,44.0,0.645


In [110]:
# We create a list of Python dictionaries
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]

# We create a DataFrame  and provide the row index
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2', 'store 3'])

# We display the DataFrame
store_items

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


Let's fill in the NaN values. One way is to use fillna. This is known as forward filling. When replacing NaN values with forward filling, we can use previous values taken from columns or rows. The .fillna(method = 'ffill', axis) will use the forward filling (ffill) method to replace NaN values using the previous known value along the given axis. Let's see some examples:

In [111]:
store_items.fillna(method = 'ffill', axis = 0) 

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [114]:
store_items #for comparison

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


In [115]:
store_items.fillna(method = 'ffill', axis = 1) #fill in value with previous row value

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20.0,30.0,35.0,15.0,8.0,45.0,45.0
store 2,15.0,5.0,10.0,2.0,5.0,7.0,50.0
store 3,20.0,30.0,35.0,35.0,10.0,10.0,4.0


Similarly, you can choose to replace the NaN values with the values that go after them in the DataFrame, this is known as backward filling. The .fillna(method = 'backfill', axis) will use the backward filling (backfill) method to replace NaN values using the next known value along the given axis. Just like with forward filling we can choose to use row or column values. Let's see some examples:

In [116]:
store_items.fillna(method = 'backfill', axis = 0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,50.0
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,,10,,4.0


Notice that the NaN value in store 1 has been replaced with the next value in its column. However, notice that the two NaN values in store 3 didn't get replaced. That's because there are no next values in these columns, since these NaN values are the last values in those columns.


In [119]:
# We replace NaN values by using linear interpolation using column values
store_items.interpolate(method = 'linear', axis = 0)

Unnamed: 0,bikes,pants,watches,shirts,shoes,suits,glasses
store 1,20,30,35,15.0,8,45.0,
store 2,15,5,10,2.0,5,7.0,50.0
store 3,20,30,35,2.0,10,7.0,4.0


In [121]:
#Let's go back to the wine dataset
wine_df.isnull().any() #checks if we have any NaN values in our dataset

chlorides               False
citric acid             False
density                 False
fixed acidity           False
free sulfur dioxide     False
pH                      False
quality                 False
ratings                 False
residual sugar          False
sulfur dioxide           True
sulphates               False
total sulfur dioxide    False
volatile acidity        False
dtype: bool

In [122]:
wine_df.describe() #gives summary statistics

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,0.0,1599.0,1599.0,1599.0
mean,0.087467,0.270976,0.996747,8.320888,15.874922,3.311113,5.636023,2.809781,2.538806,,0.658149,46.467792,0.527821
std,0.047065,0.194801,0.001887,1.741441,10.460157,0.154386,0.807569,1.450763,1.409928,,0.169507,32.895324,0.17906
min,0.012,0.0,0.99007,4.6,1.0,2.74,3.0,1.2,0.9,,0.33,6.0,0.12
25%,0.07,0.09,0.9956,7.1,7.0,3.21,5.0,2.12,1.9,,0.55,22.0,0.39
50%,0.079,0.26,0.99675,7.9,14.0,3.31,6.0,2.45,2.2,,0.62,38.0,0.52
75%,0.09,0.42,0.997835,9.2,21.0,3.4,6.0,2.94,2.6,,0.73,62.0,0.64
max,0.611,1.0,1.00369,15.9,72.0,4.01,8.0,15.89,15.5,,2.0,289.0,1.58


In [123]:
#can apply describe just for one column
wine_df['chlorides'].describe()

count    1599.000000
mean        0.087467
std         0.047065
min         0.012000
25%         0.070000
50%         0.079000
75%         0.090000
max         0.611000
Name: chlorides, dtype: float64

In [124]:
print('Maximum values of each column:\n', wine_df.max())

Maximum values of each column:
 chlorides                 0.61100
citric acid               1.00000
density                   1.00369
fixed acidity            15.90000
free sulfur dioxide      72.00000
pH                        4.01000
quality                   8.00000
ratings                  15.89000
residual sugar           15.50000
sulfur dioxide                NaN
sulphates                 2.00000
total sulfur dioxide    289.00000
volatile acidity          1.58000
dtype: float64


In [125]:
print('Average value of each column:\n', wine_df.mean())


Average value of each column:
 chlorides                0.087467
citric acid              0.270976
density                  0.996747
fixed acidity            8.320888
free sulfur dioxide     15.874922
pH                       3.311113
quality                  5.636023
ratings                  2.809781
residual sugar           2.538806
sulfur dioxide                NaN
sulphates                0.658149
total sulfur dioxide    46.467792
volatile acidity         0.527821
dtype: float64


In [126]:
wine_df.corr()

Unnamed: 0,chlorides,citric acid,density,fixed acidity,free sulfur dioxide,pH,quality,ratings,residual sugar,sulfur dioxide,sulphates,total sulfur dioxide,volatile acidity
chlorides,1.0,0.203823,0.200632,0.093756,0.005562,-0.265026,-0.128907,0.081413,0.05561,,0.37126,0.0474,0.061298
citric acid,0.203823,1.0,0.364947,0.670718,-0.060978,-0.541904,0.226373,0.273811,0.143577,,0.31277,0.035533,-0.552496
density,0.200632,0.364947,1.0,0.668011,-0.021946,-0.341699,-0.174919,0.394286,0.355283,,0.148506,0.071269,0.022026
fixed acidity,0.093756,0.670718,0.668011,1.0,-0.153824,-0.683081,0.123461,0.201466,0.114632,,0.182935,-0.112994,-0.255148
free sulfur dioxide,0.005562,-0.060978,-0.021946,-0.153824,1.0,0.070377,-0.050656,0.173596,0.187049,,0.051658,0.667666,-0.010504
pH,-0.265026,-0.541904,-0.341699,-0.683081,0.070377,1.0,-0.057731,-0.156006,-0.085652,,-0.196648,-0.066495,0.234937
quality,-0.128907,0.226373,-0.174919,0.123461,-0.050656,-0.057731,1.0,0.043741,0.013732,,0.251397,-0.1851,-0.390558
ratings,0.081413,0.273811,0.394286,0.201466,0.173596,-0.156006,0.043741,1.0,0.991132,,0.047369,0.202084,-0.072322
residual sugar,0.05561,0.143577,0.355283,0.114632,0.187049,-0.085652,0.013732,0.991132,1.0,,0.005527,0.203028,0.001918
sulfur dioxide,,,,,,,,,,,,,


# Groupby #

You call .groupby() and pass the name of the column you want to group on, which is "residual sugar". Then, you use ["pH"] to specify the columns on which you want to perform the actual aggregation.


In [152]:
#let's group the residual sugar levels with pH
wine_df.groupby(['residual sugar'])['pH'].count()

residual sugar
0.9      2
1.2      8
1.3      5
1.4     35
1.5     30
        ..
13.4     1
13.8     2
13.9     1
15.4     2
15.5     1
Name: pH, Length: 91, dtype: int64