# Pandas Tutorial

Pandas is a Python library providing high-performance, easy-to-use data structures and data analysis tools.

Pandas deals with the following three data structures:

    Series
    DataFrame
    Panel (removed from pandas in version 0.25; ignore this one)

### Importing the modules

In [1]:
import pandas as pd
import numpy as np

## pandas.Series
Series is a one-dimensional labeled array capable of holding data of one type.

pandas.Series(data, index, dtype, copy)

### Create empty series

In [2]:
# Pass the dtype explicitly: a bare pd.Series() emits a DeprecationWarning on
# pandas 1.x and produces dtype object (not float64) on pandas 2.x.
s = pd.Series(dtype='float64')
print(s)

Series([], dtype: float64)


### Create series from ndarray

In [4]:
# Build a Series from a NumPy array. No index is passed, so pandas labels the
# values 0..len(data)-1.
data = np.array(list('abcd'))
s = pd.Series(data=data)

print(data)
print('\n')
print(s)

['a' 'b' 'c' 'd']


0    a
1    b
2    c
3    d
dtype: object


In [5]:
# Show the index object itself, then its labels as a plain Python list.
print(s.index)
print(s.index.tolist())

RangeIndex(start=0, stop=4, step=1)
[0, 1, 2, 3]


In [6]:
# Same values as before, now with explicit string labels instead of the
# default 0..n-1 index.
s = pd.Series(data=data, index=['100', '101', '102', '103'])

print(data)
print('\n')
print(s)

['a' 'b' 'c' 'd']


100    a
101    b
102    c
103    d
dtype: object


In [7]:
# The labels as a plain Python list; here they are the custom string labels.
s.index.tolist()

['100', '101', '102', '103']

### Create series from scalar

In [8]:
# A scalar is repeated once for every label in the index.
s = pd.Series(data=5, index=[*range(6), 23])
print(s)

0     5
1     5
2     5
3     5
4     5
5     5
23    5
dtype: int64


### Create series from dictionary

In [11]:
# Dictionary keys become the index labels, dictionary values the data.
data = dict(a=0., b=1., c=2.)
s = pd.Series(data=data)

print(data)
print('\n')
print(s)

{'a': 0.0, 'b': 1.0, 'c': 2.0}


a    0.0
b    1.0
c    2.0
dtype: float64


In [12]:
# With an explicit index, values are looked up by key: labels that are not
# keys of the dictionary (1 and 2 here) are filled with NaN.
data = dict(a=0., b=1., c=2.)
s = pd.Series(data=data, index=['a', 1, 2])

print(data)
print('\n')
print(s)

{'a': 0.0, 'b': 1.0, 'c': 2.0}


a    0.0
1    NaN
2    NaN
dtype: float64


In [13]:
# Labels of mixed type (str and int) are stored in an object-dtype Index.
s.index

Index(['a', 1, 2], dtype='object')

### Accessing elements with index

In [17]:
s = pd.Series([1, 2, 3, 4, 5], index=('a', 'f', 'b', 'q', 'c'))

print(s)
print('\n')

# The labels are strings, so integer access goes through .iloc: plain s[3]
# relies on a positional fallback that pandas has deprecated.
# Positions are zero-based, so position 3 is the fourth value (4). The printed
# heading is left unchanged so it keeps matching the recorded output.
print('Third element')
print(s.iloc[3])
print('\n')

# Positional slices are half-open: this selects positions 2 and 3.
print('Elements 2-4')
print(s.iloc[2:4])
print('\n')

print('The last 3 elements')
print(s.iloc[-3:])
print('\n')

# Label slices, unlike positional ones, include both endpoints.
print('Letter indexing')
print(s.loc['a':'f'])

a    1
f    2
b    3
q    4
c    5
dtype: int64


Third element
4


Elements 2-4
b    3
q    4
dtype: int64


The last 3 elements
b    3
q    4
c    5
dtype: int64


Letter indexing
a    1
f    2
dtype: int64


## pandas.DataFrame
A two-dimensional data structure whose columns can be of different data types (and usually are :))

pandas.DataFrame(data, index, columns, dtype)

### Create an empty dataframe

In [18]:
# No data, no index, no columns: an empty frame.
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


### Create dataframe from list

In [90]:
data = [1,2,3,4,5]
# A flat list becomes a single column, which gets the default label 0.
df = pd.DataFrame(data)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


In [21]:
data = [1,2,3,4,5]
# Same data, but the single column is given an explicit name.
df = pd.DataFrame(data, columns = ['first'])
print(df)

   first
0      1
1      2
2      3
3      4
4      5


In [91]:
# Each inner list is one row; `columns` names the fields in order.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
print(df)
df

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13


In [93]:
# Fetch the value at row label 1, column 'Name' with one .loc lookup instead
# of the chained df['Name'][1], which indexes twice.
df.loc[1, 'Name']

'Bob'

### Create dataframe from a dictionary

In [35]:
# By default from_dict treats each key as a column.
data = {
    'col_1': list(range(3, -1, -1)),
    'col_2': list('abcd'),
}
pd.DataFrame.from_dict(data)

Unnamed: 0,col_1,col_2
0,3,a
1,2,b
2,1,c
3,0,d


In [37]:
?pd.DataFrame.from_dict

In [40]:
# orient='index' turns each key into a row label; the column names must then
# be supplied separately.
data = {
    'row_1': [3, 2, 1, 0],
    'row_2': ['a', 'b', 'c', 'd'],
}
df1 = pd.DataFrame.from_dict(
    data,
    orient='index',
    columns=['c1', 'c2', 'c3', 'c4'],
)
df1

Unnamed: 0,c1,c2,c3,c4
row_1,3,2,1,0
row_2,a,b,c,d


### Give column names

In [41]:
df1

Unnamed: 0,c1,c2,c3,c4
row_1,3,2,1,0
row_2,a,b,c,d


In [44]:
# Column labels as a plain Python list.
df1.columns.tolist()

['c1', 'c2', 'c3', 'c4']

In [45]:
# Assigning to .columns relabels every column at once; the list must contain
# one name per existing column.
df1.columns = list('ABCD')
df1

Unnamed: 0,A,B,C,D
row_1,3,2,1,0
row_2,a,b,c,d


### Create dataframe from a list of dictionaries

In [47]:
?pd.DataFrame

In [46]:
# Every dict is one row; keys missing from a row ('c' in the first) show up
# as NaN in that row.
data = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df1 = pd.DataFrame(data=data, index=['first', 'second'])

print(df1)

        a   b     c
first   1   2   NaN
second  5  10  20.0


In [47]:
# `columns` selects which keys to keep: 'c' is left out of the frame.
data = [
    {'a': 1, 'b': 2},
    {'a': 5, 'b': 10, 'c': 20},
]
df1 = pd.DataFrame(data=data, index=['first', 'second'], columns=['a', 'b'])

print(df1)

        a   b
first   1   2
second  5  10


In [51]:
# Request columns 'c' and 'd' as well: 'c' is absent from the first record and
# 'd' from both, so those cells come out as NaN.
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df1 = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b', 'c', 'd']) 

print (df1)

        a   b     c   d
first   1   2   NaN NaN
second  5  10  20.0 NaN


### Selecting columns

In [53]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age', 'Year'])

print(df)
print('\n')
# Indexing with a single column name yields that column as a Series.
print(df['Age'])
print(type(df['Age']))

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012


0    10
1    12
2    13
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [55]:
print(df)
# Indexing with a list of names returns a DataFrame, even for a single name,
# and the columns come back in the order listed.
print(df[['Age']])
print(df[['Age', 'Name']])

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012
   Age
0   10
1   12
2   13
   Age    Name
0   10    Alex
1   12     Bob
2   13  Clarke


### Selecting rows and columns

In [56]:
df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [57]:
# df.index[[0, 2]] converts positions 0 and 2 into their labels, so .loc can
# select those rows together with the named columns.
df.loc[df.index[[0, 2]], ['Age', 'Year']]

Unnamed: 0,Age,Year
0,10,2009
2,13,2012


In [58]:
# Same idea with a slice: the labels of the first two rows, two named columns.
df.loc[df.index[:2], ['Age', 'Year']]

Unnamed: 0,Age,Year
0,10,2009
1,12,2011


### Adding a condition on a column

In [59]:
df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [60]:
print(df)
# df['Age']==10 is a boolean Series; indexing with it keeps the rows where it
# is True.
df[df['Age']==10]

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012


Unnamed: 0,Name,Age,Year
0,Alex,10,2009


In [61]:
print(df)
# Combine conditions element-wise with &. Each comparison needs its own
# parentheses because & binds more tightly than >.
df[(df['Age']>10) & (df['Year']>2000)]

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012


Unnamed: 0,Name,Age,Year
1,Bob,12,2011
2,Clarke,13,2012


### Adding a column

In [62]:
# Start again from a fresh three-column frame.
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age', 'Year'])

df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [63]:
# Two ways to add a column. Assigning a Series supplies one value per row ...
df['Gender']=pd.Series(['male','male','male'])
# ... while assigning a scalar repeats it for every row. This second
# assignment overwrites the column created on the previous line.
df['Gender'] = 'male'
print(df)

     Name  Age  Year Gender
0    Alex   10  2009   male
1     Bob   12  2011   male
2  Clarke   13  2012   male


In [64]:
# Assigning to an existing column name replaces its contents; the scalar 1 is
# repeated for every row.
df['Gender'] = 1
df

Unnamed: 0,Name,Age,Year,Gender
0,Alex,10,2009,1
1,Bob,12,2011,1
2,Clarke,13,2012,1


In [30]:
# len() of a DataFrame is its number of rows.
len(df)

3

### Column deletion

In [65]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age', 'Year'])

print(df)
print('\n')

# `del` removes the column from the frame in place.
del df['Year']
print(df)

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012


     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


### Row selection

In [67]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'], index = ['a','b','c'])
print(df)
print('\n')

# .loc looks the row up by its label. df.iloc[0] would fetch the same row by
# position, whatever the labels are.
print(df.loc['a'].tolist())

     Name  Age
a    Alex   10
b     Bob   12
c  Clarke   13


['Alex', 10]


### Slice rows

In [68]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'], index=[2, 5, 1, 0, 3])

print(df)
print('\n')
# Row slices are positional: this takes the rows at positions 2 and 3,
# regardless of the integer labels shown in the index.
print(df.iloc[2:4])

     Name  Age
2    Alex   10
5     Bob   12
1  Clarke   13
0    Jane   16
3    Anna   10


     Name  Age
1  Clarke   13
0    Jane   16


In [71]:
# The integers 0..len(df)-1, i.e. one positional label per row.
np.arange(len(df))

array([0, 1, 2, 3, 4])

In [72]:
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13], ['Jane', 16], ['Anna', 10]]
df = pd.DataFrame(data, columns=['Name', 'Age'])

print(df)
print('\n')

df2 = pd.DataFrame([['Alice', 10], ['Tom', 12]], columns=['Name', 'Age'])

print(df2)
print('\n')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. Each frame keeps its own labels, so 0 and 1 now occur twice.
df = pd.concat([df, df2])
print(df)
print('\n')
# NOTE: because the labels are duplicated, df.loc[0] matches two rows here.

# drop() works on labels, so this removes both rows labelled 1 (Bob and Tom).
df = df.drop(1)
print('after drop',df)

# Replace the duplicated labels with a clean 0..n-1 index.
df.index = np.arange(len(df))
df

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13
3    Jane   16
4    Anna   10


    Name  Age
0  Alice   10
1    Tom   12


     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13
3    Jane   16
4    Anna   10
0   Alice   10
1     Tom   12


after drop      Name  Age
0    Alex   10
2  Clarke   13
3    Jane   16
4    Anna   10
0   Alice   10


Unnamed: 0,Name,Age
0,Alex,10
1,Clarke,13
2,Jane,16
3,Anna,10
4,Alice,10


In [32]:
# The integers 0..len(df)-1, matching the index assigned to df above.
np.arange(len(df))

array([0, 1, 2, 3, 4])

In [73]:
# NOTE(review): reindex() without arguments leaves the labels as they are; the
# 0..n-1 index shown below was already set by df.index = np.arange(len(df)).
df.reindex()

Unnamed: 0,Name,Age
0,Alex,10
1,Clarke,13
2,Jane,16
3,Anna,10
4,Alice,10


## Some series and dataframe functions

In [74]:
# Four samples from the standard normal distribution. No seed is set, so the
# values differ on every run.
np.random.randn(4)

array([-2.61846402,  0.36923   ,  0.07484757, -0.22063801])

In [77]:
# A Series has a single axis, so .axes is a one-element list holding its index.
s = pd.Series(data=np.random.randn(4))
print(s)
print('\n')
print("The axes are:")
print(s.axes)
print(list(s.axes[0]))
print(s.index)

0   -2.310921
1    1.142992
2    0.658577
3    0.613716
dtype: float64


The axes are:
[RangeIndex(start=0, stop=4, step=1)]
[0, 1, 2, 3]
RangeIndex(start=0, stop=4, step=1)


In [78]:
# .dtype reports the element type shared by all values of the Series.
s = pd.Series(data=np.random.randn(4))
print(s)
print('\n')
print("The data type is:")
print(s.dtype)

0    0.093197
1    0.515931
2    1.995522
3    0.688197
dtype: float64


The data type is:
float64


In [79]:
# Five-row example frame used by the following cells.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13
3,Jane,16
4,Anna,10


In [102]:
# .dtypes lists the dtype of each column.
print(df.dtypes)

Name    object
Age      int64
dtype: object


In [80]:
print(s)
print('\n')
print ("Is the Object empty?")
# .empty reports whether the object holds no elements.
print(s.empty)

0    0.093197
1    0.515931
2    1.995522
3    0.688197
dtype: float64


Is the Object empty?
False


In [81]:
print(s)
print('\n')
print ("The dimensions of the object:")
# .ndim is the number of axes: 1 for a Series.
print(s.ndim)

0    0.093197
1    0.515931
2    1.995522
3    0.688197
dtype: float64


The dimensions of the object:
1


In [82]:
# Rebuild the five-row frame before inspecting its dimensions.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13
3,Jane,16
4,Anna,10


In [83]:
# A DataFrame has two axes (rows and columns), so .ndim is 2.
print(df.ndim)

2


In [84]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
print("Our object is:")
print(df)
print('\n')

# .shape is a (rows, columns) tuple.
print("The shape of the object is:")
print(df.shape)

Our object is:
     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


The shape of the object is:
(3, 2)


In [86]:
print ("Our object is:")
print(df)

print('\n')
print('The first 2 rows of the dataframe:')
# head(n) returns the first n rows ...
print(df.head(2))

print('\n')
print('The last 2 rows of the dataframe:')
# ... and tail(n) the last n rows, each keeping its original label.
print(df.tail(2))

Our object is:
     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


The first 2 rows of the dataframe:
   Name  Age
0  Alex   10
1   Bob   12


The last 2 rows of the dataframe:
     Name  Age
1     Bob   12
2  Clarke   13


In [87]:
?df.head

### Count the number of values in a column

In [88]:
# 'Clarke' appears twice so that counting values has something to show.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Clarke', 18],
]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13
3,Clarke,18


In [89]:
# value_counts() returns a Series that maps each distinct value to the number
# of times it occurs, most frequent first.
x = df['Name'].value_counts()
print(type(x))
print(x)

<class 'pandas.core.series.Series'>
Clarke    2
Alex      1
Bob       1
Name: Name, dtype: int64
