# Pandas

Installation of pandas --

conda install pandas

In [31]:
import numpy as np
import pandas as pd
pd.__version__

'0.21.0'

## DataStructures

### Series - is a one-dimensional array of indexed data. 

In [5]:
# A Series can be built directly from a list (or a NumPy array).

test_series = pd.Series(data=[1, 2, 3, 4])
test_series

0    1
1    2
2    3
3    4
dtype: int64

In [6]:
test_series.index   # Returns Index objects. 

RangeIndex(start=0, stop=4, step=1)

In [8]:
test_series.values  # Returns the values of the Series

array([1, 2, 3, 4], dtype=int64)

In [9]:
# The optional `index` argument supplies the row labels for the values.
test_series = pd.Series(
    data=[1, 2, 3, 4],
    index=['first', 'second', 'third', 'fourth'],
)
test_series

first     1
second    2
third     3
fourth    4
dtype: int64

In [10]:
# Index objects are immutable, so item assignment on an index raises a TypeError.
# The error is caught and printed so that "Run All" is not interrupted by this cell.
try:
    test_series.index[1] = '2nd'
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [14]:
# A dict can seed a Series: the keys become the index labels, the values become the data.

dict_weight = dict(person1=80, person2=72, person3=65)

series_weight = pd.Series(data=dict_weight)
series_weight

person1    80
person2    72
person3    65
dtype: int64

In [15]:
series_weight.index

Index(['person1', 'person2', 'person3'], dtype='object')

In [16]:
series_weight.values

array([80, 72, 65], dtype=int64)

#### Accessing Data

Since a Series behaves like a list, positional indexing works as usual.

This also follows zero-based position index.

In [19]:
test_series[:4]  # First 4 elements

first     1
second    2
third     3
fourth    4
dtype: int64

In [20]:
test_series[1:4]  # Range index on position

second    2
third     3
fourth    4
dtype: int64

In [21]:
test_series[:-1]   # Access all elements except the last one.

first     1
second    2
third     3
dtype: int64

In [22]:
test_series[['first','second']]  # A list of labels selects several elements at once (dict-like label lookup)

first     1
second    2
dtype: int64

In [23]:
# Boolean index
test_series[test_series < 3]

first     1
second    2
dtype: int64

In [24]:
# Range Index on label
test_series['second': 'fourth']

second    2
third     3
fourth    4
dtype: int64

In [25]:
# Looking up a label that is not present in the index raises a KeyError.
# The error is caught and printed so that "Run All" is not interrupted by this cell.
try:
    test_series['fifth']
except KeyError as err:
    print('KeyError:', err)

KeyError: 'fifth'

#### loc and iloc

In [26]:
# Pandas provides another way of accessing data using loc and iloc.

test_series.loc['first']   # We need to mention the label index

1

In [27]:
test_series.iloc[1]    # Mention positional index

2

### DataFrame -- 2-D representation. Similar to a table.

A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered
collection of columns, each of which can be a different value type (numeric,
string, boolean, etc.). The DataFrame has both a row and a column index; it can be
thought of as a dict of Series, all sharing the same index.

In [32]:
# 4x4 frame holding the integers 0..15, with generated row and column labels.
test_dataframe = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['row{}'.format(i) for i in range(4)],
    columns=['col{}'.format(i) for i in range(4)],
)

test_dataframe

Unnamed: 0,col0,col1,col2,col3
row0,0,1,2,3
row1,4,5,6,7
row2,8,9,10,11
row3,12,13,14,15


#### Creating DataFrame from a dictionary

In [33]:
# Seed NumPy's global random generator so the frame is identical on every run;
# without a seed each "Restart & Run All" would show different numbers.
np.random.seed(42)

# Each dict key becomes a column of the resulting 5-row frame.
dict_random_data = {'first': np.random.random(5),         # floats in [0, 1)
        'second': np.random.randint(0,10,5),              # integers in [0, 10)
        'third': np.random.randn(5)}                      # standard-normal floats
dict_dataframe = pd.DataFrame(dict_random_data)
dict_dataframe

Unnamed: 0,first,second,third
0,0.468424,1,0.289779
1,0.637968,6,-0.134224
2,0.641681,1,1.495191
3,0.009077,8,0.258397
4,0.845548,3,-1.368392


In [34]:
dict_dataframe.columns

Index(['first', 'second', 'third'], dtype='object')

#### Accessing Data

In [36]:
test_dataframe['col1']   # By default, it checks the column index and not row index.

row0     1
row1     5
row2     9
row3    13
Name: col1, dtype: int32

In [35]:
test_dataframe[['col1', 'col2', 'col3']]

Unnamed: 0,col1,col2,col3
row0,1,2,3
row1,5,6,7
row2,9,10,11
row3,13,14,15


**_Accessing data using loc and iloc._**

In [37]:
# Label-based selection: rows 'row1' through 'row3' (both ends inclusive) of column 'col1'.
# 'row3' is the last label in the index; slicing up to a label that does not exist
# (such as 'row4') only works by accident on a sorted index.
test_dataframe.loc['row1':'row3', 'col1']

row1     5
row2     9
row3    13
Name: col1, dtype: int32

In [42]:
# .loc is label-based, so the integer position 0 is not a valid column label here.
# Older pandas raises TypeError and newer releases raise KeyError; either one is
# caught and printed so that "Run All" is not interrupted by this cell.
try:
    test_dataframe.loc[:, 0]
except (TypeError, KeyError) as err:
    print(type(err).__name__ + ':', err)

TypeError: cannot do label indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [0] of <class 'int'>

In [41]:
test_dataframe.loc[:, 'col1'] 

row0     1
row1     5
row2     9
row3    13
Name: col1, dtype: int32

In [39]:
# .loc needs row labels, not positions: an integer slice on a string index raises TypeError.
# The error is caught and printed so that "Run All" is not interrupted by this cell.
try:
    test_dataframe.loc[1:4, 'col1']
except TypeError as err:
    print('TypeError:', err)

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>

In [43]:
# Correct form: slice by row labels. 'row3' is the last label that exists in the index.
test_dataframe.loc['row1':'row3', 'col1']

row1     5
row2     9
row3    13
Name: col1, dtype: int32

In [44]:
# Position-based selection: rows at positions 1-3 and the column at position 1 ('col1').
# The column slice 1:2 (rather than a bare 1) keeps the result a one-column DataFrame.
test_dataframe.iloc[1:4, 1:2]

Unnamed: 0,col1
row1,5
row2,9
row3,13


In [45]:
# First 3 rows by position, then the columns 'col1' and 'col3' by name/label
test_dataframe.iloc[0:3].loc[: , ['col1','col3']]

Unnamed: 0,col1,col3
row0,1,3
row1,5,7
row2,9,11


In [46]:
# Slicing using logical indices
test_dataframe[test_dataframe['col1'] > 5] 

Unnamed: 0,col0,col1,col2,col3
row2,8,9,10,11
row3,12,13,14,15


In [47]:
# Assigning values using a condition: every cell smaller than 5 is set to 0.
# Note: this modifies test_dataframe in place.

test_dataframe[test_dataframe < 5] = 0
test_dataframe

Unnamed: 0,col0,col1,col2,col3
row0,0,0,0,0
row1,0,5,6,7
row2,8,9,10,11
row3,12,13,14,15


#### Editing the DataFrame

In [53]:
# Adding a new column: assigning a scalar fills every row with that value.
test_dataframe['col4'] = 5
test_dataframe

Unnamed: 0,col0,col1,col2,col3,col4
row0,0,0,0,0,5
row1,0,5,6,7,5
row2,8,9,10,11,5
row3,12,13,14,15,5


In [54]:
# Removing a column using del (delete)
del test_dataframe['col4']

In [55]:
# Show the frame after the deletion: 'col4' is gone.
test_dataframe

Unnamed: 0,col0,col1,col2,col3
row0,0,0,0,0
row1,0,5,6,7
row2,8,9,10,11
row3,12,13,14,15


In [None]:
# Removing using drop

test_dataframe.drop(['row1', 'row2'])

In [None]:
test_dataframe.drop(['col1', 'col3'], axis=1)   # axis = 1 means columns and 0 means rows (default)

#### A few common functions used in data analysis

In [57]:
# Rebuild the original 4x4 frame (0..15), since earlier cells modified test_dataframe in place.
test_dataframe = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['row%d' % i for i in range(4)],
    columns=['col%d' % i for i in range(4)],
)

test_dataframe

Unnamed: 0,col0,col1,col2,col3
row0,0,1,2,3
row1,4,5,6,7
row2,8,9,10,11
row3,12,13,14,15


#### Apply function

In [58]:
def func_inv_sum(x):
    """Return the reciprocal of the sum of the values in ``x``.

    Intended to be passed to ``DataFrame.apply``, which calls it once per
    column (default) or once per row (``axis=1``).
    """
    return 1/np.sum(x)

In [59]:
# Column-wise operation/across rows -> Finding inverse of sum of values.
test_dataframe.apply(func_inv_sum)

col0    0.041667
col1    0.035714
col2    0.031250
col3    0.027778
dtype: float64

In [60]:
# Row-wise operation/across columns -> Finding inverse of sum of values.
test_dataframe.apply(func_inv_sum, axis=1)

row0    0.166667
row1    0.045455
row2    0.026316
row3    0.018519
dtype: float64

#### Sorting

In [62]:
temp_series = pd.Series([14,18,91,12], index=[2,3,4,1])
temp_series

2    14
3    18
4    91
1    12
dtype: int64

In [63]:
temp_series.sort_index()

1    12
2    14
3    18
4    91
dtype: int64

In [64]:
temp_series.sort_index(ascending=False)

4    91
3    18
2    14
1    12
dtype: int64

In [65]:
temp_series.sort_values()

1    12
2    14
3    18
4    91
dtype: int64

In [None]:
# Same 0..15 values, but with row labels that are not in alphabetical order.
test_dataframe = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=list('pram'),
    columns=['first', 'second', 'third', 'fourth'],
)

test_dataframe

In [None]:
test_dataframe.sort_index()

In [None]:
test_dataframe.sort_values(by='fourth', ascending=False)

#### Statistical functions

In [66]:
# Back to the row0..row3 / col0..col3 frame used for the statistics examples.
row_and_col_count = 4
test_dataframe = pd.DataFrame(
    data=np.arange(16).reshape((row_and_col_count, row_and_col_count)),
    index=['row' + str(pos) for pos in range(row_and_col_count)],
    columns=['col' + str(pos) for pos in range(row_and_col_count)],
)
del row_and_col_count

test_dataframe

Unnamed: 0,col0,col1,col2,col3
row0,0,1,2,3
row1,4,5,6,7
row2,8,9,10,11
row3,12,13,14,15


In [69]:
test_dataframe.sum()    # Column wise / Across rows

col0    24
col1    28
col2    32
col3    36
dtype: int64

In [68]:
test_dataframe.sum(axis=1)   # Row-wise / Across columns

row0     6
row1    22
row2    38
row3    54
dtype: int64

In [70]:
test_dataframe.mean(axis=1)

row0     1.5
row1     5.5
row2     9.5
row3    13.5
dtype: float64

In [72]:
test_dataframe.idxmax()   # Find out the index which has the max value

col0    row3
col1    row3
col2    row3
col3    row3
dtype: object

#### Concatenation  -- Row wise (Adding Rows)  - Similar to the SQL UNION ALL operation (duplicate rows are kept)

In [74]:
df_student_1 = pd.DataFrame({ 'ID' : [1,2],
                            'Name': ['student1','student2'],
                           'Score': [75, 95]},
                           index = [0,1])

df_student_1

Unnamed: 0,ID,Name,Score
0,1,student1,75
1,2,student2,95


In [75]:
df_student_2 = pd.DataFrame({ 'ID' : [3,4],
                            'Name': ['student3','student4'],
                           'Score': [85, 97]},
                           index = [2,3])

df_student_2

Unnamed: 0,ID,Name,Score
2,3,student3,85
3,4,student4,97


In [78]:
df_student_concat = pd.concat([df_student_1, df_student_2])
df_student_concat

Unnamed: 0,ID,Name,Score
0,1,student1,75
1,2,student2,95
2,3,student3,85
3,4,student4,97


In [79]:
# Concatenation  -- Column wise (Adding columns)

df_course_1 = pd.DataFrame({ 'Course_Name' : ['Maths', 'Science', 'Language', 'Maths']},
                           index = [0,1,2,3])

In [80]:
pd.concat([df_student_concat, df_course_1], axis=1)

Unnamed: 0,ID,Name,Score,Course_Name
0,1,student1,75,Maths
1,2,student2,95,Science
2,3,student3,85,Language
3,4,student4,97,Maths


#### Merging/Joining

In [83]:
# Merge DataFrame objects by performing a database-style join operation by columns or indexes.

# Students: ID 6 has no matching row in the course frame below.
df_student_1 = pd.DataFrame({ 'ID' : [1,2,3,4,6],
                            'Name': ['student1','student2','student3','student4','student5'],
                           'Score': [75, 95,85,97,100]},
                           index = [0,1,2,3,4])

# Courses: ID 5 has no matching row in the student frame above.
df_course_1 = pd.DataFrame({ 'ID' : [1,2,3,4,5],
                            'Course_Name' : ['Maths', 'Science', 'Language', 'Maths','Economics']},
                           index = [0,1,2,3,5])

# Only the last expression of a cell is displayed, so a single frame is shown here.
df_course_1

Unnamed: 0,Course_Name,ID
0,Maths,1
1,Science,2
2,Language,3
3,Maths,4
5,Economics,5


In [84]:
# If we want to show data which has common ID in both the dataframe
pd.merge(df_student_1,df_course_1,how='inner',on='ID')

Unnamed: 0,ID,Name,Score,Course_Name
0,1,student1,75,Maths
1,2,student2,95,Science
2,3,student3,85,Language
3,4,student4,97,Maths


In [85]:
# If we want to show data which has all IDs as per the student dataframe
pd.merge(df_student_1,df_course_1,how='left',on='ID')

Unnamed: 0,ID,Name,Score,Course_Name
0,1,student1,75,Maths
1,2,student2,95,Science
2,3,student3,85,Language
3,4,student4,97,Maths
4,6,student5,100,


In [86]:
# If we want to show data which has all IDs as per the course dataframe (the right-hand one)
pd.merge(df_student_1,df_course_1,how='right',on='ID')

Unnamed: 0,ID,Name,Score,Course_Name
0,1,student1,75.0,Maths
1,2,student2,95.0,Science
2,3,student3,85.0,Language
3,4,student4,97.0,Maths
4,5,,,Economics


In [None]:
# If we want to show data which has all IDs irrespective of either student or  course dataframe

pd.merge(df_student_1,df_course_1,how='outer',on='ID')

DataFrames also provide the `pandas.DataFrame.join()` method as a convenient way to access the capabilities of `pandas.merge()`.

### Group By

In [88]:
df_sales = pd.DataFrame(
            {'Category':['Electronics','Electronics','Electronics','Furniture','Furniture','Kids'],
           'Product':['Mobiles','TV','Laptops','Office Chairs','Toys','School Bags'],
           'Sales':[1000,300,500,97,49,39]})

df_sales

Unnamed: 0,Category,Product,Sales
0,Electronics,Mobiles,1000
1,Electronics,TV,300
2,Electronics,Laptops,500
3,Furniture,Office Chairs,97
4,Furniture,Toys,49
5,Kids,School Bags,39


In [93]:
# Group the rows by 'Category'; the result is a DataFrameGroupBy object, not a DataFrame.
sales_grpby_category = df_sales.groupby('Category')
sales_grpby_category   # groupby object

<pandas.core.groupby.DataFrameGroupBy object at 0x000001E85B00B550>

In [94]:
# Total Sales per category.
# Only the numeric 'Sales' column is selected: a bare .sum() relies on the string
# 'Product' column being dropped silently, which pandas >= 2.0 no longer does.
sales_grpby_category[['Sales']].sum()

Unnamed: 0_level_0,Sales
Category,Unnamed: 1_level_1
Electronics,1800
Furniture,146
Kids,39


In [None]:
# Average Sales per category.
# Only the numeric 'Sales' column is selected: a bare .mean() raises a TypeError on
# pandas >= 2.0 because the string 'Product' column cannot be averaged.
sales_grpby_category[['Sales']].mean()

## Reading data from csv

pandas can also read data from Excel and other supported file types (SAS, Stata files, etc.).

In [97]:
# Read the file once and display the result.
# A forward slash is used as the path separator because it works on every operating
# system, whereas a backslash is Windows-only and starts an escape sequence inside a
# normal Python string.
df_sales = pd.read_csv('data/product_sales.csv', header=0)
df_sales

Unnamed: 0,Category,Product,Brand,Sales
0,Electronics,Mobiles,Samsung,1200
1,Electronics,Mobiles,Apple,900
2,Electronics,Mobiles,Oppo,2000
3,Electronics,TV,Mi,900
4,Electronics,TV,Sony,600
5,Electronics,Laptops,HP,800
6,Electronics,Laptops,Acer,500
7,Furniture,Chairs,Featherlite,400
8,Furniture,Chairs,Nilkamal,300
9,Furniture,Sofas,UrbanLiving,100


In [98]:
# Quick exploration of the data
df_sales.head()

Unnamed: 0,Category,Product,Brand,Sales
0,Electronics,Mobiles,Samsung,1200
1,Electronics,Mobiles,Apple,900
2,Electronics,Mobiles,Oppo,2000
3,Electronics,TV,Mi,900
4,Electronics,TV,Sony,600


In [101]:
sales_grpby_category = df_sales.groupby('Category')
sales_grpby_category

<pandas.core.groupby.DataFrameGroupBy object at 0x000001E85B029E80>

In [102]:
# Total sales per category, returned as a Series.
sales_grpby_category['Sales'].sum()

Category
Electronics    6900
Furniture       840
Kids           1400
Name: Sales, dtype: int64

In [None]:
sales_grpby_product = df_sales.groupby('Product')

# Average sales per product. Only the numeric 'Sales' column is aggregated; the other
# columns of df_sales hold strings, for which a mean is not meaningful.
sales_grpby_product['Sales'].mean()

## Missing data

In most real-life use cases, we don't get complete or neat data. We will encounter missing data either because the
system capturing the data was not available while collecting it, because the source could not send the data, or because a user forgot/missed
to enter the data.

We have to handle such data, as these are legitimate cases.

In [None]:
# pandas uses the floating point value NaN (Not a Number) to represent missing data in both floating as well as
# in non-floating point arrays.

# The built-in Python "None" value is also treated as NA in object arrays.    

In [107]:
temp_series = pd.Series([1,2,3,np.nan])
temp_series    # dtype float. The integers are cast to float because NaN is a floating point value.

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [108]:
temp_series = pd.Series([1,2,3,None])
temp_series    # None is converted to NaN. Float Series

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [109]:
temp_series = pd.Series(['a','b','c',None])
temp_series   # None itself. Object Series.

0       a
1       b
2       c
3    None
dtype: object

### Detecting Nulls

In [110]:
temp_series.isnull()   # Checks each element for null. Returns a boolean Series.

0    False
1    False
2    False
3     True
dtype: bool

In [111]:
temp_series.notnull()  # Negation of isnull. checking whether it is not null

0     True
1     True
2     True
3    False
dtype: bool

__Once we have missing data, we can retain it as is, remove the rows/columns which have null values, or impute/fill them with some logical value.__

In [112]:
# Filtering Out using dropna()

In [114]:
temp_series = pd.Series([5, np.nan,10, 17, np.nan, 27])
temp_series.dropna()

0     5.0
2    10.0
3    17.0
5    27.0
dtype: float64

In [113]:
temp_df = pd.DataFrame([[10, 45, 30], 
                     [13, np.nan, np.nan],
                     [np.nan, np.nan, np.nan], 
                     [np.nan, 35, 43]], index=np.arange(4),
                    columns=['col1','col2','col3'])
    

temp_df

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,,
2,,,
3,,35.0,43.0


In [117]:
drop_all = temp_df.dropna()  # Default behaviour: drops every row that contains at least one null.
drop_all

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0


In [118]:
drop_columns = temp_df.dropna(axis="columns")   # axis=1: drops every column containing a null (here, all of them)
drop_columns

0
1
2
3


In [119]:
drop_rows = temp_df.dropna(axis='rows')  # axis=0
drop_rows   # Drops all rows with at least one null value

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0


In [120]:
drop_rows = temp_df.dropna(how='all')  # Drops a row only if all of its cells are null
drop_rows

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,,
3,,35.0,43.0


In [121]:
drop_rows = temp_df.dropna(how='any')  # Drops a row if any of its cells is null
drop_rows

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0


In [122]:
drop_rows = temp_df.dropna(thresh=2)  # Keeps only the rows that have at least 2 non-null values.
drop_rows

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
3,,35.0,43.0


#### Another way of handling missing data is to impute/fill with logical values

In [123]:
temp_df.fillna(0)   # Filling all missing values with 0

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,35.0,43.0


In [124]:
temp_df.fillna({'col1': 12, 'col2': 40})

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,40.0,
2,12.0,40.0,
3,12.0,35.0,43.0


In [125]:
# We can also forward fill the null value with the last known value encountered before null value(s).
# ffill() is used because fillna(method='ffill') is deprecated in recent pandas releases.
temp_df.ffill()

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,45.0,30.0
2,13.0,45.0,30.0
3,13.0,35.0,43.0


In [126]:
# Backfill the null value with the first known value encountered after null value(s).
# bfill() is used because fillna(method='backfill') is deprecated in recent pandas releases.
temp_df.bfill()

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,35.0,43.0
2,,35.0,43.0
3,,35.0,43.0


In [129]:
# By default, fillna does not overwrite the existing dataframe; it returns a new object.
# To overwrite the same dataframe, we can pass the inplace=True parameter.

temp_df.fillna(0, inplace=True)
temp_df

# Frequency data  - value_counts()

Unnamed: 0,col1,col2,col3
0,10.0,45.0,30.0
1,13.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,35.0,43.0
