# Pandas Tutorial

Pandas is a Python library providing high-performance, easy-to-use data structures and data analysis tools.

Pandas deals with the following three data structures:

    Series
    DataFrame
    Panel (removed from pandas in version 0.25; ignore this one)

### Importing the modules

In [1]:
import pandas as pd
import numpy as np

## pandas.Series
Series is a one-dimensional labeled array capable of holding data of one type.

pandas.Series(data, index, dtype, copy)

### Create empty series

In [None]:
# An empty Series is given an explicit dtype: pandas 1.x emits a warning when
# it is omitted, and the dtype chosen by default differs between versions.
s = pd.Series(dtype='float64')
print(s)

### Create series from ndarray

In [None]:
# Build a Series from a NumPy array. No index is passed, so pandas assigns
# the default integer labels 0..n-1.
data = np.array(list('abcd'))
s = pd.Series(data)

# Raw array, a blank separator, then the Series.
print(data, '\n', s, sep='\n')

In [None]:
# Inspect the labels of `s`: first the Index object, then the same labels
# converted to a plain Python list.
print(s.index)
print(s.index.tolist())

In [None]:
# Same `data` as before, now with explicit string labels supplied via `index`
# (one label per element).
s = pd.Series(
    data,
    index=['100', '101', '102', '103'],
)

print(data, '\n', s, sep='\n')

In [None]:
# The labels of `s` as a plain Python list.
s.index.tolist()

### Create series from scalar

In [None]:
# A scalar is repeated for every label in `index`, so the index determines
# the length of the resulting Series.
s = pd.Series(5, index=[*range(6), 23])
print(s)

### Create series from dictionary

In [None]:
# Dictionary keys become the index labels; dictionary values become the data.
data = dict(a=0., b=1., c=2.)
s = pd.Series(data)

print(data, '\n', s, sep='\n')

In [None]:
# When `index` is passed together with a dict, values are looked up by label:
# 'a' is a key of the dict, while 1 and 2 are not and therefore become NaN.
data = dict(a=0., b=1., c=2.)
s = pd.Series(
    data,
    index=['a', 1, 2],
)

print(data, '\n', s, sep='\n')

In [None]:
# Display the Index object of `s`.
s.index

### Accessing elements with index

In [None]:
s = pd.Series([1,2,3,4,5],index = ('a','f','b','q','c'))

print(s)
print('\n')

# Positional access goes through .iloc: a bare integer in s[...] on a
# label-indexed Series is only a deprecated fallback. Positions are 0-based,
# so the third element lives at position 2.
print('Third element')
print(s.iloc[2])
print('\n')

# Elements 2 through 4 (counted from 1, inclusive) are positions 1..3;
# the stop of a positional slice is exclusive.
print('Elements 2-4')
print(s.iloc[1:4])
print('\n')

print('The last 3 elements')
print(s.iloc[-3:])
print('\n')

# Label-based slices include BOTH endpoints, unlike positional slices.
print('Letter indexing')
print(s.loc['a':'f'])

## pandas.DataFrame
Two-dimensional data structure, columns can be of different data types (and usually are :))

pandas.DataFrame(data, index, columns, dtype)

### Create an empty dataframe

In [2]:
# A DataFrame built with no arguments has no columns and no rows.
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


### Create dataframe from list

In [3]:
# A flat list becomes a single column; with no `columns` argument the column
# receives the default label 0.
data = list(range(1, 6))
df = pd.DataFrame(data)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


In [4]:
# Same flat list, this time with an explicit name for its single column.
data = list(range(1, 6))
df = pd.DataFrame(
    data,
    columns=['first'],
)
print(df)

   first
0      1
1      2
2      3
3      4
4      5


In [5]:
# Each inner list is one row; `columns` names the fields by position.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])

# print() gives the plain-text form; the bare name on the last line
# triggers the notebook's rich display of the same frame.
print(df)
df

     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13


### Create dataframe from a dictionary

In [6]:
# With the default orientation each dictionary key becomes a column.
data = {
    'col_1': [3, 2, 1, 0],
    'col_2': list('abcd'),
}
pd.DataFrame.from_dict(data)

Unnamed: 0,col_1,col_2
0,3,a
1,2,b
2,1,c
3,0,d


In [None]:
# IPython help syntax: a leading `?` shows the docstring of the object.
?pd.DataFrame.from_dict

In [7]:
# orient='index' turns each dictionary key into a ROW label; the column
# names are then supplied separately through `columns`.
data = {
    'row_1': [3, 2, 1, 0],
    'row_2': list('abcd'),
}
df1 = pd.DataFrame.from_dict(
    data,
    orient='index',
    columns=['c1', 'c2', 'c3', 'c4'],
)
df1

Unnamed: 0,c1,c2,c3,c4
row_1,3,2,1,0
row_2,a,b,c,d


### Give column names

In [11]:
# Display the current contents of df1.
df1

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [12]:
# Column labels of df1 as a plain Python list.
df1.columns.tolist()

['a', 'b', 'c']

In [13]:
# Assigning to .columns renames every column at once. The list must contain
# exactly one name per existing column; a length mismatch raises ValueError.
df1.columns = ['A', 'B', 'C', 'D']
df1

ValueError: Length mismatch: Expected axis has 3 elements, new values have 4 elements

### Create dataframe from a list of dictionaries

In [None]:
# IPython help syntax: a leading `?` shows the docstring of the object.
?pd.DataFrame

In [10]:
# Each dict is one row and the union of all keys forms the columns. A key
# missing from a row (here 'c' in the first dict) is filled with NaN.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df1 = pd.DataFrame(data, index=['first', 'second'])

print(df1)

        a   b     c
first   1   2   NaN
second  5  10  20.0


In [14]:
# Passing `columns` selects which keys are kept: 'c' is not listed, so it is
# left out of the resulting frame.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df1 = pd.DataFrame(
    data,
    index=['first', 'second'],
    columns=['a', 'b'],
)

print(df1)

        a   b
first   1   2
second  5  10


In [15]:
# Request columns 'c' and 'd' as well: 'c' exists only in the second dict,
# and 'd' appears in neither, so that whole column is NaN.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df1 = pd.DataFrame(
    data,
    index=['first', 'second'],
    columns=['a', 'b', 'c', 'd'],
)

print(df1)

        a   b     c   d
first   1   2   NaN NaN
second  5  10  20.0 NaN


### Selecting columns

In [16]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])

# Indexing with a single column name returns that column as a Series,
# which the printed type confirms.
print(df, '\n', df['Age'], type(df['Age']), sep='\n')

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012


0    10
1    12
2    13
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [17]:
# Indexing with a LIST of names returns a DataFrame, even for a single
# column, and the columns come back in the order requested.
print(
    df,
    df[['Age']],
    df[['Age', 'Name']],
    sep='\n',
)

     Name  Age  Year
0    Alex   10  2009
1     Bob   12  2011
2  Clarke   13  2012
   Age
0   10
1   12
2   13
   Age    Name
0   10    Alex
1   12     Bob
2   13  Clarke


### Selecting rows and columns

In [18]:
# Display the current contents of df.
df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [19]:
# Fetch one cell with a single .loc lookup (row label 1, column 'Name').
# This replaces the chained form df['Name'][1], which performs two separate
# lookups and is unreliable when used for assignment.
df.loc[1, 'Name']

'Bob'

### .loc (label-based), .iloc (integer-position-based) and .ix (a mix of the two; deprecated and removed in pandas 1.0)

In [20]:
# 8x4 frame of standard-normal draws; no seed is set, so the values differ
# on every run.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)

# `:` keeps every row; the list picks columns A and B by label.
print(df, '\n', df.loc[:, ['A', 'B']], '\n', sep='\n')

          A         B         C         D
a  0.920640  0.681847 -0.041291  1.910754
b -1.424572  0.695444 -0.230115 -2.345288
c  0.001211 -1.161005  0.068430  0.694472
d -0.961605  0.549915  0.602913  1.026883
e  0.646732  1.097859  0.656254 -1.021591
f -0.045406 -1.903999 -1.432855  0.649996
g -0.139616  0.617180 -0.427537 -0.510511
h -0.132407 -0.174776  0.722189  0.591777


          A         B
a  0.920640  0.681847
b -1.424572  0.695444
c  0.001211 -1.161005
d -0.961605  0.549915
e  0.646732  1.097859
f -0.045406 -1.903999
g -0.139616  0.617180
h -0.132407 -0.174776




In [21]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)

# Both axes are given as label lists: rows a and b, columns A and B.
print(df, '\n', df.loc[['a', 'b'], ['A', 'B']], '\n', sep='\n')



          A         B         C         D
a  0.347041  2.203882  1.269888 -0.851621
b  1.377184  0.327275 -1.023827 -0.609903
c  0.671920 -1.840480 -1.858506 -0.457273
d -0.669050  1.255037  0.398748 -0.406457
e  0.699414  2.063475  0.332611  0.052577
f  0.386889 -0.207339 -0.110225 -0.843773
g  1.117984  1.572012  0.738811  0.098514
h -0.628037 -3.088793 -0.222118 -3.041073


          A         B
a  0.347041  2.203882
b  1.377184  0.327275




In [26]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)

# df.loc['a'] is row 'a' as a Series; comparing it with 0 yields one
# boolean per column.
print(df, '\n', df.loc['a'] > 0, '\n', sep='\n')

          A         B         C         D
a  0.711757  1.032390  0.629971 -0.031918
b  0.187873 -0.301208 -1.165179  0.506655
c -0.537232 -1.125397  1.324856  0.506730
d  0.435467  0.322140  1.140990 -0.370691
e -0.807435  0.204590  0.792884  0.487072
f  1.553019  0.313962  0.110580 -1.098266
g -0.730672  2.687357  1.127193 -1.548312
h  1.242458 -0.579200 -0.401448  0.263950


A     True
B     True
C     True
D    False
Name: a, dtype: bool




In [30]:
# .loc with a (row label, column label) pair assigns one cell in place.
df.loc['a','B'] = 10000
df

Unnamed: 0,one,two,three,B
a,1.73197,-0.185435,-1.758453,10000.0
b,,,,
c,-1.495824,0.838944,-2.339888,
d,,,,
e,1.359262,0.303401,-2.223096,
f,1.848048,-0.833935,0.74953,
g,,,,
h,-1.025431,-0.42256,-1.204273,


In [31]:
# Assign 10000 to the cell at row 'a', column 'B', then display the frame.
df.loc['a','B'] = 10000
df

Unnamed: 0,one,two,three,B
a,1.73197,-0.185435,-1.758453,10000.0
b,,,,
c,-1.495824,0.838944,-2.339888,
d,,,,
e,1.359262,0.303401,-2.223096,
f,1.848048,-0.833935,0.74953,
g,,,,
h,-1.025431,-0.42256,-1.204273,


In [32]:
# Assigning a scalar to a column sets that value in every row.
df['A'] = 10000
df

Unnamed: 0,one,two,three,B,A
a,1.73197,-0.185435,-1.758453,10000.0,10000
b,,,,,10000
c,-1.495824,0.838944,-2.339888,,10000
d,,,,,10000
e,1.359262,0.303401,-2.223096,,10000
f,1.848048,-0.833935,0.74953,,10000
g,,,,,10000
h,-1.025431,-0.42256,-1.204273,,10000


In [23]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)

# .iloc selects by integer position: rows 0 and 1, columns 0 and 1.
print(df, '\n', df.iloc[[0, 1], [0, 1]], '\n', sep='\n')

          A         B         C         D
a -0.454044 -0.529962  0.375148 -1.231102
b -0.007529  0.442100 -0.425603 -2.089572
c  0.012144  0.221389 -0.309095  0.441851
d -0.504378  0.676717  1.310177 -0.329823
e  2.205558  0.631460 -0.104212 -0.136577
f -0.066723 -0.663866  1.039719  1.636909
g  0.906864  1.880331  0.832925 -1.999228
h -0.313214 -1.363445  1.710916  0.548049


          A         B
a -0.454044 -0.529962
b -0.007529  0.442100




In [None]:
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=list('abcdefgh'),
    columns=list('ABCD'),
)

# A positional slice excludes its stop: 0:3 is the first three rows.
print(df, '\n', df.iloc[0:3, [0, 1]], '\n', sep='\n')

In [None]:
df = pd.DataFrame(np.random.randn(8, 4),
index = ['a','b','c','d','e','f','g','h'], columns = ['A', 'B', 'C', 'D'])


print(df)
print('\n')
# DataFrame.ix no longer exists (removed in pandas 1.0), so this selection
# uses its label-based replacement: every row of column 'A' via .loc.
print(df.loc[:,'A'])

### Filtering by a column condition

In [33]:
# Small three-row frame used by the filtering examples.
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])
df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [None]:
print(df)
# Boolean mask: keep only the rows whose Age equals 10.
df[df['Age']==10]

In [None]:
print(df)
# Combine two masks element-wise with &. Each comparison needs its own
# parentheses because & binds more tightly than > in Python.
df[(df['Age']>10) & (df['Year']>2000)]

### Adding a column

In [34]:
# Fresh three-column frame for the column-adding examples.
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])
df

Unnamed: 0,Name,Age,Year
0,Alex,10,2009
1,Bob,12,2011
2,Clarke,13,2012


In [35]:
# Two ways to add the same column: from a Series (its values are matched to
# rows by index label) or from a scalar (repeated for every row). The second
# assignment overwrites the first, so only its result is kept.
df['Gender']=pd.Series(['male','male','male'])
df['Gender'] = 'male'
print(df)

     Name  Age  Year Gender
0    Alex   10  2009   male
1     Bob   12  2011   male
2  Clarke   13  2012   male


In [36]:
# Set every row of the Gender column to the scalar 1.
df['Gender'] = 1
df

Unnamed: 0,Name,Age,Year,Gender
0,Alex,10,2009,1
1,Bob,12,2011,1
2,Clarke,13,2012,1


In [None]:
# len() of a DataFrame is its number of rows.
len(df)

### Column deletion

In [None]:
data = [
    ['Alex', 10, 2009],
    ['Bob', 12, 2011],
    ['Clarke', 13, 2012],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'Year'])

# Frame before the deletion, followed by the blank separator lines.
print(df, end='\n\n\n')

# `del` removes the column from df in place.
del df['Year']
print(df)

### Slice rows

In [None]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(
    data,
    columns=['Name', 'Age'],
    index=[2, 5, 1, 0, 3],
)

# A plain slice inside [] picks rows by POSITION (here the third and fourth
# rows), not by the integer labels stored in the index.
print(df, '\n', df[2:4], sep='\n')

## Some series and dataframe functions

In [None]:
# Four draws from the standard normal distribution; no seed is set, so the
# values differ on every run.
np.random.randn(4)

In [37]:
s = pd.Series(np.random.randn(4))

# .axes is a list holding the single row-label axis of the Series; that
# axis is the same object exposed as .index.
print(
    s,
    '\n',
    "The axes are:",
    s.axes,
    list(s.axes[0]),
    s.index,
    sep='\n',
)

0    0.551598
1    0.698987
2    1.609443
3   -1.066240
dtype: float64


The axes are:
[RangeIndex(start=0, stop=4, step=1)]
[0, 1, 2, 3]
RangeIndex(start=0, stop=4, step=1)


In [None]:
s = pd.Series(np.random.randn(4))

# .dtype reports the single element type shared by the whole Series.
print(s, '\n', "The data type is:", s.dtype, sep='\n')

In [None]:
# Five-row frame with one text column and one integer column.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# .dtypes lists the data type of each column.
print(df.dtypes)

In [38]:
print(s)
print('\n')
print ("Is the Object empty?")
# .empty is a boolean telling whether the object holds no elements.
print(s.empty)

0    0.551598
1    0.698987
2    1.609443
3   -1.066240
dtype: float64


Is the Object empty?
False


In [None]:
print(s)
print('\n')
print ("The dimensions of the object:")
# .ndim is the number of dimensions (axes) of the object.
print(s.ndim)

In [None]:
# Rebuild the five-row Name/Age frame for the dimensionality check.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Jane', 16],
    ['Anna', 10],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

In [None]:
# .ndim is the number of dimensions (axes) of the object.
print(df.ndim)

In [42]:
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])

# .shape is a (number of rows, number of columns) tuple.
print(
    "Our object is:",
    df,
    '\n',
    "The shape of the object is:",
    df.shape,
    sep='\n',
)

Our object is:
     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13


The shape of the object is:
(3, 2)


In [None]:
print ("Our object is:")
print(df)

print('\n')
print('The first 2 rows of the dataframe:')
# head(n) returns the first n rows.
print(df.head(2))

print('\n')
print('The last 2 rows of the dataframe:')
# tail(n) returns the last n rows.
print(df.tail(2))

In [None]:
# IPython help syntax: a leading `?` shows the docstring of the object.
?df.head

### Count the occurrences of each value in a column

In [39]:
# 'Clarke' appears twice so that the counting example has a repeated value.
data = [
    ['Alex', 10],
    ['Bob', 12],
    ['Clarke', 13],
    ['Clarke', 18],
]
df = pd.DataFrame(data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Alex,10
1,Bob,12
2,Clarke,13
3,Clarke,18


In [40]:
# value_counts() tallies how often each distinct Name occurs; the result is
# itself a Series, as the printed type shows.
x = df['Name'].value_counts()
print(type(x))
print(x)

<class 'pandas.core.series.Series'>
Clarke    2
Alex      1
Bob       1
Name: Name, dtype: int64


In [None]:
# reindex() called without any new labels.
# NOTE(review): presumably this returns the frame unchanged - confirm.
df.reindex()

# Missing values

In [27]:
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)

# Reindexing onto a longer label list keeps the existing rows and inserts
# all-NaN rows for the labels that were absent (b, d and g).
df = df.reindex(list('abcdefgh'))

print(df)

        one       two     three
a  1.731970 -0.185435 -1.758453
b       NaN       NaN       NaN
c -1.495824  0.838944 -2.339888
d       NaN       NaN       NaN
e  1.359262  0.303401 -2.223096
f  1.848048 -0.833935  0.749530
g       NaN       NaN       NaN
h -1.025431 -0.422560 -1.204273


In [28]:
# isnull() marks each missing (NaN) entry of the column with True.
df['one'].isnull()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [29]:
# notnull() is the inverse of isnull(): True where a value is present.
df['one'].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

# Replacing the missing data

In [None]:
# Display df before any missing values are replaced.
df

In [None]:
# Back-fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
df1 = df.bfill()
df1

### See the other options that fillna() provides in the pandas documentation :)

In [None]:
# IPython help syntax: a leading `?` shows the docstring of the object.
?df.fillna

### Dropping the missing data

In [None]:
# dropna() with default arguments removes every row that contains at least
# one NaN and returns a new frame; df itself is left unchanged.
df2 = df.dropna()
df2

### Replacing regular values

In [None]:
df = pd.DataFrame({
    'one': [10, 20, 30, 40, 50, 2000],
    'two': [1000, 0, 30, 40, 50, 60],
})

print(df, '\n', sep='\n')

# The dict maps old value -> new value and is applied across all columns:
# 1000 becomes 10 and 2000 becomes 60. replace() returns a new frame.
df1 = df.replace({1000: 10, 2000: 60})
df1

# Getting unique values in a column

In [None]:
# pandas is already imported as `pd` in the import cell at the top of the
# notebook, so it is not imported again here.
# 'Clarke' and the age 10 each appear twice, giving the uniqueness examples
# below something to deduplicate.
data = [['Alex',10],['Bob',12],['Clarke',10],['Clarke',18]]
df = pd.DataFrame(data,columns=['Name','Age'])
df

In [None]:
# The Name column, duplicates included.
df['Name']

In [None]:
# unique() returns the distinct values found in the column.
df['Name'].unique()

In [None]:
# nunique() returns how many distinct values the column contains.
df['Name'].nunique()

# Loading data from a file into a dataframe

In [41]:
# Read the file with read_csv's default settings (no `sep` or `index_col`
# is passed here). The path is relative, so it is resolved against the
# current working directory.
# NOTE(review): the recorded output is FileNotFoundError - confirm where
# the data file lives relative to the notebook.
s = pd.read_csv('Practical/data/gender.txt')
s

FileNotFoundError: [Errno 2] File b'Practical/data/gender.txt' does not exist: b'Practical/data/gender.txt'

In [None]:
# IPython help syntax: a leading `?` shows the docstring of the object.
?pd.read_csv

In [None]:
# Same file, now parsed as pipe-separated, with the `user_id` column used
# as the row index.
s = pd.read_csv(
    'Practical/data/gender.txt',
    sep='|',
    index_col='user_id',
)
s