In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Reading in data

Let's read the iris dataset in. When we do so, we notice that the first row appears in the header of the data frame. This happens because by default a parameter of pd.read_csv() is pd.read_csv(header=True).

In [2]:
flowers = pd.read_csv("../Week2/iris.data.csv")
flowers.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


Original Comma Seperated File (.csv)
<img src="./flowers_without_header.png" alt="Drawing" style="width: 300px"/>

Added header to Comma Seperated File (.csv)
<img src="./flowers_with_header.png" alt="Drawing" style="width: 400px;"/>


In [3]:
flowers = pd.read_csv("../Data/iris_data.csv")

In [4]:
flowers.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
flowers['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [6]:
flowers.describe()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
flowers.isnull().sum()

Sepal Length    0
Sepal Width     0
Petal Length    0
Petal Width     0
Species         0
dtype: int64

# Loops

## There are 3 main types of loops:
- While
- For in
- List Comprehensions

### While

Most flexible, but slowest form of loop

In [8]:
counter = 0
while counter < 10:
    print(counter)
    counter = counter + 1

0
1
2
3
4
5
6
7
8
9


In [9]:
%%timeit
counter = 0
while counter < 10:
    counter = counter + 1

The slowest run took 213.38 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 585 ns per loop


### For Loops
Quite pythonic(effective use/style of python) for most applications and is faster than while loops.

In [10]:
for num in range(0,10):
    print(num)

0
1
2
3
4
5
6
7
8
9


In [11]:
%%timeit
for num in range(0,10):
    pass

The slowest run took 11.67 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 528 ns per loop


### List Comprehensions
Least flexible, but rediculously fast. Highly suggest trying to use these when you can.

In [12]:
[x for x in range(0,10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [13]:
%%timeit
[x for x in range(0,10)]

The slowest run took 4.13 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 1.46 µs per loop


In [14]:
for x in flowers:
    print(flowers[x])

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
5      5.4
6      4.6
7      5.0
8      4.4
9      4.9
10     5.4
11     4.8
12     4.8
13     4.3
14     5.8
15     5.7
16     5.4
17     5.1
18     5.7
19     5.1
20     5.4
21     5.1
22     4.6
23     5.1
24     4.8
25     5.0
26     5.0
27     5.2
28     5.2
29     4.7
      ... 
120    6.9
121    5.6
122    7.7
123    6.3
124    6.7
125    7.2
126    6.2
127    6.1
128    6.4
129    7.2
130    7.4
131    7.9
132    6.4
133    6.3
134    6.1
135    7.7
136    6.3
137    6.4
138    6.0
139    6.9
140    6.7
141    6.9
142    5.8
143    6.8
144    6.7
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: Sepal Length, dtype: float64
0      3.5
1      3.0
2      3.2
3      3.1
4      3.6
5      3.9
6      3.4
7      3.4
8      2.9
9      3.1
10     3.7
11     3.4
12     3.0
13     3.0
14     4.0
15     4.4
16     3.9
17     3.5
18     3.8
19     3.8
20     3.4
21     3.7
22     3.6
23     3.3
24     3.4
25     3.0
26     3

In [15]:
for x in flowers.iterrows():
    print(x)

(0, Sepal Length            5.1
Sepal Width             3.5
Petal Length            1.4
Petal Width             0.2
Species         Iris-setosa
Name: 0, dtype: object)
(1, Sepal Length            4.9
Sepal Width               3
Petal Length            1.4
Petal Width             0.2
Species         Iris-setosa
Name: 1, dtype: object)
(2, Sepal Length            4.7
Sepal Width             3.2
Petal Length            1.3
Petal Width             0.2
Species         Iris-setosa
Name: 2, dtype: object)
(3, Sepal Length            4.6
Sepal Width             3.1
Petal Length            1.5
Petal Width             0.2
Species         Iris-setosa
Name: 3, dtype: object)
(4, Sepal Length              5
Sepal Width             3.6
Petal Length            1.4
Petal Width             0.2
Species         Iris-setosa
Name: 4, dtype: object)
(5, Sepal Length            5.4
Sepal Width             3.9
Petal Length            1.7
Petal Width             0.4
Species         Iris-setosa
Name: 5, dtype: 

# Functions

## How to see what function are available:

In [16]:
# If you want to read documentation for a function, use the ? magic command which is a shortcut for help()
?pd.DataFrame()

In [17]:
# or also
help(pd.DataFrame())

Help on DataFrame in module pandas.core.frame object:

class DataFrame(pandas.core.generic.NDFrame)
 |  Two-dimensional size-mutable, potentially heterogeneous tabular data
 |  structure with labeled axes (rows and columns). Arithmetic operations
 |  align on both row and column labels. Can be thought of as a dict-like
 |  container for Series objects. The primary pandas data structure
 |  
 |  Parameters
 |  ----------
 |  data : numpy ndarray (structured or homogeneous), dict, or DataFrame
 |      Dict can contain Series, arrays, constants, or list-like objects
 |  index : Index or array-like
 |      Index to use for resulting frame. Will default to np.arange(n) if
 |      no indexing information part of input data and no index provided
 |  columns : Index or array-like
 |      Column labels to use for resulting frame. Will default to
 |      np.arange(n) if no column labels are provided
 |  dtype : dtype, default None
 |      Data type to force, otherwise infer
 |  copy : boolean, d

In [18]:
# A shorthand way you can look into attributes is also dir. Sometimes this is available when help() isn't.
dir(pd.DataFrame())

['T',
 '_AXIS_ALIASES',
 '_AXIS_IALIASES',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_SLICEMAP',
 '__abs__',
 '__add__',
 '__and__',
 '__array__',
 '__array_wrap__',
 '__bool__',
 '__bytes__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__invert__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__le__',
 '__len__',
 '__lt__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__round__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__

In [20]:
# Tab Completion can also be very convienient. Press tab while typing in an incomplete statement
pd.

SyntaxError: invalid syntax (<ipython-input-20-29ec83c25537>, line 2)

## Creating Functions

### The 3 major components of functions:
- name (function name)
- parameters (inputs for function)
- return statement (outputs of function)

In [21]:
def contains_iris(row): # name(parameters)
    for item in row:
        if 'Iris' in item.__repr__():
            return True # return statement
    return False # return statement

In [22]:
flowers['Iris?'] = flowers.apply(lambda row: contains_iris(row), axis = 1)

In [23]:
flowers.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species,Iris?
0,5.1,3.5,1.4,0.2,Iris-setosa,True
1,4.9,3.0,1.4,0.2,Iris-setosa,True
2,4.7,3.2,1.3,0.2,Iris-setosa,True
3,4.6,3.1,1.5,0.2,Iris-setosa,True
4,5.0,3.6,1.4,0.2,Iris-setosa,True


In [24]:
flowers['otherIris?'] = flowers.apply(lambda row: 'Iris' in row['Species'], axis = 1)
flowers.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species,Iris?,otherIris?
0,5.1,3.5,1.4,0.2,Iris-setosa,True,True
1,4.9,3.0,1.4,0.2,Iris-setosa,True,True
2,4.7,3.2,1.3,0.2,Iris-setosa,True,True
3,4.6,3.1,1.5,0.2,Iris-setosa,True,True
4,5.0,3.6,1.4,0.2,Iris-setosa,True,True


In [25]:
flowers['Iris-flipping'] = flowers.apply(lambda row: not row['otherIris?'], axis = 1)
flowers.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species,Iris?,otherIris?,Iris-flipping
0,5.1,3.5,1.4,0.2,Iris-setosa,True,True,False
1,4.9,3.0,1.4,0.2,Iris-setosa,True,True,False
2,4.7,3.2,1.3,0.2,Iris-setosa,True,True,False
3,4.6,3.1,1.5,0.2,Iris-setosa,True,True,False
4,5.0,3.6,1.4,0.2,Iris-setosa,True,True,False


In [26]:
flowers['Iris-flipping'] = flowers.apply(lambda row: not row['Iris-flipping'], axis = 1)
flowers.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species,Iris?,otherIris?,Iris-flipping
0,5.1,3.5,1.4,0.2,Iris-setosa,True,True,True
1,4.9,3.0,1.4,0.2,Iris-setosa,True,True,True
2,4.7,3.2,1.3,0.2,Iris-setosa,True,True,True
3,4.6,3.1,1.5,0.2,Iris-setosa,True,True,True
4,5.0,3.6,1.4,0.2,Iris-setosa,True,True,True
