# Pandas Practice - Part 1

Use this notebook to follow along with the tasks in the `AO3-Pandas_Part1.ipynb` notebook.

#### Task 1.1: Setup

Import `pandas` as `pd`.

In [1]:
# Numpy and pandas usage are often intertwined.
# These abbreviations are ubiquitously used.
import numpy as np
import pandas as pd

#### Task 2a: Create a `pd.Series` object
Use the practice notebook to create a series of your own design.

In [2]:
# Build a one-dimensional Series from a plain Python list.
# np.nan marks a missing value.
my_series = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)

In [3]:
# Evaluate the object on the last line of the cell to display it.
# This (mostly) calls the __repr__() method of the object.
my_series

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#### Task 2b: Create a `pd.DataFrame` object from a Python dictionary

In [4]:
# Build a DataFrame from a dictionary: each key becomes a column name
# and each list supplies that column's values.
df_1 = pd.DataFrame({
    'alpha': list(range(5)),
    'beta': list('abcde'),
})

In [5]:
# Call the object in the notebook.
# This time we get an HTML output, as the notebook shell
# sees a _repr_html_() method to call.
df_1

Unnamed: 0,alpha,beta
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [6]:
# Build a 6x4 DataFrame from a NumPy array of random numbers.
# No index or column names are given, so pandas falls back to the
# default integer labels seen in the output below.
# NOTE: no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(6, 4))

In [7]:
# Display the frame with its default integer row and column labels.
df

Unnamed: 0,0,1,2,3
0,-1.081732,0.585522,0.095659,1.231995
1,-0.8201,-0.694374,-1.047684,1.09149
2,-0.533068,1.115326,1.364141,-0.615102
3,-1.638654,0.541224,0.746881,-0.115536
4,0.032253,1.484968,0.481956,-0.044315
5,-0.214639,0.511551,-2.083994,-0.809873


#### Task 2c: Supply an index and column names when creating a `pd.DataFrame` object.


In [8]:
# Build an index of 6 consecutive daily timestamps starting 2013-01-01.
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Random 6x4 data, now labelled: the dates become the row index and
# the letters A-D become the column names.
# Note: this rebinds `df`, replacing the frame from the earlier cell.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.287814,0.033583,0.276403,-0.01871
2013-01-02,1.905545,-0.300702,-2.157351,0.524162
2013-01-03,-1.814816,-1.991993,-0.309027,-2.722638
2013-01-04,0.476406,-0.253868,0.595994,0.266397
2013-01-05,0.011803,-0.066597,0.148938,0.048033
2013-01-06,-0.088147,-0.016642,1.149568,0.003246


#### Task 3a: Download the iris.csv file.
Download the iris data and take a moment to learn about this data from the Wikipedia page cited above

#### Task 3b: Import the iris.csv file
In your own practice notebook, import the iris dataset.

In [10]:
# Load the iris data from a comma-separated file.
# The path is relative, so it is resolved against the current working directory.
iris_df = pd.read_csv('data/iris.csv')

#### Task 3c: Import a tab delimited file
Take a look at the `pd.read_csv` online documentation. Describe how you would import a tab-delimited file.

In [11]:
# Evaluating the bare function displays its signature.
# The `sep` parameter (default ',') sets the field delimiter, so a
# tab-delimited file can be read with:
#     pd.read_csv('path/to/file.tsv', sep='\t')
pd.read_csv

<function pandas.io.parsers._make_parser_function.<locals>.parser_f(filepath_or_buffer, sep=',', delimiter=None, header='infer', names=None, index_col=None, usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=False, skip_blank_lines=True, parse_dates=False, infer_datetime_format=False, keep_date_col=False, date_parser=None, dayfirst=False, iterator=False, chunksize=None, compression='infer', thousands=None, decimal=b'.', lineterminator=None, quotechar='"', quoting=0, escapechar=None, comment=None, encoding=None, dialect=None, tupleize_cols=None, error_bad_lines=True, warn_bad_lines=True, skipfooter=0, doublequote=True, delim_whitespace=False, low_memory=True, memory_map=False, float_precision=None)>

#### Task 4a: Use head, tail and sample with the iris data frame.


In [12]:
# With no argument, head() shows the first 5 rows.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [13]:
# Pass a number to control how many leading rows are shown.
iris_df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [14]:
# With no argument, tail() shows the last 5 rows.
iris_df.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [15]:
# Draw 5 rows at random. The selection changes on each run unless
# a fixed `random_state` is passed.
iris_df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
110,6.5,3.2,5.1,2.0,virginica
26,5.0,3.4,1.6,0.4,setosa
59,5.2,2.7,3.9,1.4,versicolor
131,7.9,3.8,6.4,2.0,virginica
35,5.0,3.2,1.2,0.2,setosa


#### Task 5a: Display the columns and indexes of the iris data frame.

In [16]:
# The column labels of the frame.
iris_df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [17]:
# The row labels. No index column was specified on import, so this
# is the default integer range.
iris_df.index

RangeIndex(start=0, stop=150, step=1)

#### Task 5b: Check the version of `pandas` you have, then convert a data frame to a numpy array.



In [18]:
# The installed pandas version, as a string.
pd.__version__

'0.23.4'

In [19]:
# `.values` exposes the frame's data as a NumPy array.
# NOTE(review): this prints the entire array; a slice such as
# iris_df.values[:5] would keep the output short.
# From pandas 0.24 onwards DataFrame.to_numpy() is the recommended
# spelling; check pd.__version__ before using it.
iris_df.values

array([[5.1, 3.5, 1.4, 0.2, 'setosa'],
       [4.9, 3.0, 1.4, 0.2, 'setosa'],
       [4.7, 3.2, 1.3, 0.2, 'setosa'],
       [4.6, 3.1, 1.5, 0.2, 'setosa'],
       [5.0, 3.6, 1.4, 0.2, 'setosa'],
       [5.4, 3.9, 1.7, 0.4, 'setosa'],
       [4.6, 3.4, 1.4, 0.3, 'setosa'],
       [5.0, 3.4, 1.5, 0.2, 'setosa'],
       [4.4, 2.9, 1.4, 0.2, 'setosa'],
       [4.9, 3.1, 1.5, 0.1, 'setosa'],
       [5.4, 3.7, 1.5, 0.2, 'setosa'],
       [4.8, 3.4, 1.6, 0.2, 'setosa'],
       [4.8, 3.0, 1.4, 0.1, 'setosa'],
       [4.3, 3.0, 1.1, 0.1, 'setosa'],
       [5.8, 4.0, 1.2, 0.2, 'setosa'],
       [5.7, 4.4, 1.5, 0.4, 'setosa'],
       [5.4, 3.9, 1.3, 0.4, 'setosa'],
       [5.1, 3.5, 1.4, 0.3, 'setosa'],
       [5.7, 3.8, 1.7, 0.3, 'setosa'],
       [5.1, 3.8, 1.5, 0.3, 'setosa'],
       [5.4, 3.4, 1.7, 0.2, 'setosa'],
       [5.1, 3.7, 1.5, 0.4, 'setosa'],
       [4.6, 3.6, 1.0, 0.2, 'setosa'],
       [5.1, 3.3, 1.7, 0.5, 'setosa'],
       [4.8, 3.4, 1.9, 0.2, 'setosa'],
       [5.0, 3.0, 1.6, 0.

#### Task 5c: Make selections with `loc` and `at`.

+ Select a single item with `at`.
+ Select a row slice with `loc`.
+ Select a row and column slice with `loc`.

In [21]:
# Selecting a single column with [] returns a Series.
slength = iris_df['sepal_length']
slength.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal_length, dtype: float64

In [22]:
# A plain [] slice selects rows; the end is exclusive, so 0:1 is
# only the first row.
iris_df[0:1]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [23]:
# Rows 0 through 4 (the end of a [] slice is exclusive).
iris_df[0:5]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


#### Task 5d: Make selections with `iloc` and `iat`.

+ Select a single item with `iat`.
+ Select a row slice with `iloc`.
+ Select a row and column slice with `iloc`.


In [24]:
# loc selects by label; a single row label returns that row as a Series.
iris_df.loc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

In [25]:
# loc slices are label-based and include BOTH endpoints, so 0:1
# returns two rows (compare with iris_df[0:1], which returns one).
iris_df.loc[0:1]  # Get the first two rows.

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [26]:
# Peek at the date-indexed frame to see which row labels are available.
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,1.287814,0.033583,0.276403,-0.01871
2013-01-02,1.905545,-0.300702,-2.157351,0.524162


In [27]:
# A date string is matched against the datetime index; a single
# label returns the row as a Series.
df.loc['2013-01-01']

A    1.287814
B    0.033583
C    0.276403
D   -0.018710
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
# Label slice over dates; both endpoints are included.
df.loc['2013-01-01':'2013-01-02']

Unnamed: 0,A,B,C,D
2013-01-01,1.287814,0.033583,0.276403,-0.01871
2013-01-02,1.905545,-0.300702,-2.157351,0.524162


In [29]:
# Slice rows and columns by label in one call. Both ranges are
# inclusive: rows 4-6 and columns sepal_width through petal_width.
iris_df.loc[4:6, 'sepal_width': 'petal_width']

Unnamed: 0,sepal_width,petal_length,petal_width
4,3.6,1.4,0.2
5,3.9,1.7,0.4
6,3.4,1.4,0.3


In [30]:
# iloc selects by integer position rather than by label.
iris_df.iloc[0]  # Get the first row.

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

In [31]:
# Position-based slice: rows 2-4 (end exclusive) and every column
# except the last one.
iris_df.iloc[2:5, :-1] 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


#### Task 5e: Boolean Indexing

Create subsets using boolean indexes that:
+ Use one boolean operator.
+ Use two boolean operators.


In [32]:
# Show the small example frame again before filtering it.
df_1

Unnamed: 0,alpha,beta
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [33]:
# Element-wise comparison produces a frame of booleans.
# NOTE(review): `beta` holds strings, so comparing it with 1.0 is not
# meaningful and its all-True column below should be ignored. Newer
# pandas versions likely raise a TypeError here (TODO confirm);
# restricting to the numeric column, e.g. df_1[['alpha']] > 1.0,
# avoids the problem.
df_1 > 1.0

Unnamed: 0,alpha,beta
0,False,True
1,False,True
2,True,True
3,True,True
4,True,True


In [34]:
# Build a boolean mask first, then use it to keep only the matching rows.
condition = iris_df['sepal_length'].gt(5.8)
iris_df.loc[condition].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
54,6.5,2.8,4.6,1.5,versicolor
56,6.3,3.3,4.7,1.6,versicolor


In [35]:
# The same filter as the previous cell, with the boolean mask written inline.
iris_df[iris_df['sepal_length'] > 5.8].head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
54,6.5,2.8,4.6,1.5,versicolor
56,6.3,3.3,4.7,1.6,versicolor
