# Pandas Basics

**Importing the packages **

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Creating a dataframe **

In [35]:
df=pd.DataFrame(np.random.rand(10,4),columns=['A','B','C','D'])
print(df)

          A         B         C         D
0  0.447241  0.092563  0.390666  0.165740
1  0.078025  0.487562  0.015646  0.710851
2  0.202473  0.840701  0.356354  0.938117
3  0.880699  0.703532  0.367800  0.817343
4  0.145671  0.455787  0.318556  0.713309
5  0.732368  0.856005  0.123772  0.329510
6  0.366721  0.549770  0.968029  0.988155
7  0.511858  0.138775  0.469083  0.283919
8  0.180528  0.817463  0.415193  0.468177
9  0.108047  0.851526  0.206427  0.811078


**Creating a dataframe with dictionary **

In [3]:
df2 = pd.DataFrame({'A' : [10,20],'B' : [20,100],'C' : [30,40],'D' : [40,50]})
print(df2)

    A    B   C   D
0  10   20  30  40
1  20  100  40  50


**Looking at the variable types **

In [4]:
df2.dtypes

A    int64
B    int64
C    int64
D    int64
dtype: object

**looking at the top few values, default 5 **

In [5]:
df.head()

Unnamed: 0,A,B,C,D
0,0.812481,0.828163,0.829133,0.573248
1,0.554275,0.242232,0.248367,0.943396
2,0.042636,0.431814,0.493361,0.071007
3,0.269038,0.723922,0.990761,0.565874
4,0.4372,0.782153,0.839201,0.399984


**Looking at the last few values **

In [6]:
df.tail()

Unnamed: 0,A,B,C,D
5,0.03009,0.174562,0.291636,0.575884
6,0.806333,0.600426,0.372151,0.839255
7,0.640829,0.716863,0.982205,0.426393
8,0.400137,0.803911,0.014164,0.950382
9,0.097463,0.925163,0.814594,0.710517


**To get the indexes **


In [7]:
df.index

RangeIndex(start=0, stop=10, step=1)

**To get the column names **

In [8]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

**To get the values **

In [9]:
df.values

array([[0.81248056, 0.82816313, 0.82913336, 0.57324829],
       [0.55427537, 0.24223182, 0.24836657, 0.94339608],
       [0.04263634, 0.43181445, 0.49336144, 0.0710075 ],
       [0.26903837, 0.72392245, 0.99076074, 0.56587419],
       [0.43720015, 0.78215301, 0.83920077, 0.39998391],
       [0.03009044, 0.1745619 , 0.2916358 , 0.57588372],
       [0.80633301, 0.60042552, 0.37215123, 0.83925486],
       [0.64082949, 0.71686317, 0.98220509, 0.42639253],
       [0.40013717, 0.80391111, 0.01416427, 0.95038238],
       [0.09746345, 0.92516258, 0.8145935 , 0.71051732]])

**To get quick statistics about the data **

In [10]:
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.409048,0.622921,0.587557,0.605594
std,0.296429,0.256816,0.346125,0.271267
min,0.03009,0.174562,0.014164,0.071007
25%,0.140357,0.473967,0.311765,0.461263
50%,0.418669,0.720393,0.653977,0.574566
75%,0.619191,0.798472,0.836684,0.80707
max,0.812481,0.925163,0.990761,0.950382


**For sorting the index **

In [11]:
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
9,0.097463,0.925163,0.814594,0.710517
8,0.400137,0.803911,0.014164,0.950382
7,0.640829,0.716863,0.982205,0.426393
6,0.806333,0.600426,0.372151,0.839255
5,0.03009,0.174562,0.291636,0.575884
4,0.4372,0.782153,0.839201,0.399984
3,0.269038,0.723922,0.990761,0.565874
2,0.042636,0.431814,0.493361,0.071007
1,0.554275,0.242232,0.248367,0.943396
0,0.812481,0.828163,0.829133,0.573248


**For sorting by a column **

In [12]:
df.sort_values(by='A')

Unnamed: 0,A,B,C,D
5,0.03009,0.174562,0.291636,0.575884
2,0.042636,0.431814,0.493361,0.071007
9,0.097463,0.925163,0.814594,0.710517
3,0.269038,0.723922,0.990761,0.565874
8,0.400137,0.803911,0.014164,0.950382
4,0.4372,0.782153,0.839201,0.399984
1,0.554275,0.242232,0.248367,0.943396
7,0.640829,0.716863,0.982205,0.426393
6,0.806333,0.600426,0.372151,0.839255
0,0.812481,0.828163,0.829133,0.573248


## Selecting Values


**Selecting columns **

Note - This is also called a pandas series

In [38]:
df['A']

0    0.447241
1    0.078025
2    0.202473
3    0.880699
4    0.145671
5    0.732368
6    0.366721
7    0.511858
8    0.180528
9    0.108047
Name: A, dtype: float64

**Selecting range of rows **


In [14]:
df[0:2]

Unnamed: 0,A,B,C,D
0,0.812481,0.828163,0.829133,0.573248
1,0.554275,0.242232,0.248367,0.943396


**Selecting a particular row **


In [15]:
df.loc[0]

A    0.812481
B    0.828163
C    0.829133
D    0.573248
Name: 0, dtype: float64

**Selecting a row and column **

In [16]:
df.loc[0,['A']]

A    0.812481
Name: 0, dtype: float64

In [17]:
df.loc[:,['A']]

Unnamed: 0,A
0,0.812481
1,0.554275
2,0.042636
3,0.269038
4,0.4372
5,0.03009
6,0.806333
7,0.640829
8,0.400137
9,0.097463


**Using at instead for faster access , can't be used for a range **

In [18]:
df.at[0,'A']

0.81248056142876

**Selecting by position **

In [19]:
df.iloc[0]

A    0.812481
B    0.828163
C    0.829133
D    0.573248
Name: 0, dtype: float64

**Selecting by condition  **


In [20]:
df[df.A>0]

Unnamed: 0,A,B,C,D
0,0.812481,0.828163,0.829133,0.573248
1,0.554275,0.242232,0.248367,0.943396
2,0.042636,0.431814,0.493361,0.071007
3,0.269038,0.723922,0.990761,0.565874
4,0.4372,0.782153,0.839201,0.399984
5,0.03009,0.174562,0.291636,0.575884
6,0.806333,0.600426,0.372151,0.839255
7,0.640829,0.716863,0.982205,0.426393
8,0.400137,0.803911,0.014164,0.950382
9,0.097463,0.925163,0.814594,0.710517


## Missing Data

In [21]:
df2=df[df<.5]
print(df2)

          A         B         C         D
0       NaN       NaN       NaN       NaN
1       NaN  0.242232  0.248367       NaN
2  0.042636  0.431814  0.493361  0.071007
3  0.269038       NaN       NaN       NaN
4  0.437200       NaN       NaN  0.399984
5  0.030090  0.174562  0.291636       NaN
6       NaN       NaN  0.372151       NaN
7       NaN       NaN       NaN  0.426393
8  0.400137       NaN  0.014164       NaN
9  0.097463       NaN       NaN       NaN


**Drop all rows which have NaN **


In [22]:
df2.dropna()

Unnamed: 0,A,B,C,D
2,0.042636,0.431814,0.493361,0.071007


**Replace NaN with other values **


In [23]:
df2.fillna(value=1)

Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,1.0
1,1.0,0.242232,0.248367,1.0
2,0.042636,0.431814,0.493361,0.071007
3,0.269038,1.0,1.0,1.0
4,0.4372,1.0,1.0,0.399984
5,0.03009,0.174562,0.291636,1.0
6,1.0,1.0,0.372151,1.0
7,1.0,1.0,1.0,0.426393
8,0.400137,1.0,0.014164,1.0
9,0.097463,1.0,1.0,1.0


## Statistical Operations

**Taking mean **

In [24]:
df.mean()

A    0.409048
B    0.622921
C    0.587557
D    0.605594
dtype: float64

**Mean in other axis **

In [25]:
df.mean(1)

0    0.760756
1    0.497067
2    0.259705
3    0.637399
4    0.614634
5    0.268043
6    0.654541
7    0.691573
8    0.542149
9    0.636934
dtype: float64

**groupby function **

In [26]:
#Creating a dataframe
d = {'one':[1,1,2,1,1,1],
     'two':[2,2,2,2,2,2],
     'letter':['a','a','a','b','b','c']}

df = pd.DataFrame(d)
df

Unnamed: 0,letter,one,two
0,a,1,2
1,a,1,2
2,a,2,2
3,b,1,2
4,b,1,2
5,c,1,2


In [27]:
# Create group object
one = df.groupby('letter')

# Apply sum function
one.sum()

Unnamed: 0_level_0,one,two
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,6
b,2,4
c,1,2


In [28]:
one.sum().index

Index(['a', 'b', 'c'], dtype='object', name='letter')

In [29]:
letterone = df.groupby(['letter','one']).sum()
letterone

Unnamed: 0_level_0,Unnamed: 1_level_0,two
letter,one,Unnamed: 2_level_1
a,1,4
a,2,2
b,1,4
c,1,2


**To get more info about how data is grouped by **

In [30]:
letterone.index

MultiIndex(levels=[['a', 'b', 'c'], [1, 2]],
           labels=[[0, 0, 1, 2], [0, 1, 0, 0]],
           names=['letter', 'one'])

**You may want to not have the columns you are grouping by become your index, this can be easily achieved as shown below. **

In [31]:
letterone = df.groupby(['letter','one'], as_index=False).sum()
letterone

Unnamed: 0,letter,one,two
0,a,1,4
1,a,2,2
2,b,1,4
3,c,1,2


In [32]:
letterone.index

Int64Index([0, 1, 2, 3], dtype='int64')