# Pandas Basics

**Importing the packages**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

**Creating a dataframe**

In [2]:
# Seed NumPy's global generator so the "random" frame is identical on every
# Restart & Run All; without it each run produces different numbers and the
# values shown further down cannot be reproduced.
np.random.seed(42)
# 10 rows x 4 columns of uniform samples from [0, 1), labelled A-D.
df = pd.DataFrame(np.random.rand(10, 4), columns=['A', 'B', 'C', 'D'])
print(df)

          A         B         C         D
0  0.799568  0.503289  0.885045  0.453443
1  0.231304  0.012980  0.102424  0.176636
2  0.323899  0.481525  0.090690  0.892677
3  0.247167  0.879233  0.032647  0.440945
4  0.084348  0.402155  0.257733  0.677410
5  0.440435  0.498661  0.569819  0.247461
6  0.022900  0.628090  0.608290  0.580947
7  0.115451  0.036736  0.534478  0.060111
8  0.397987  0.285723  0.372840  0.393917
9  0.882694  0.598450  0.391790  0.484944


**Creating a dataframe from a dictionary**

In [3]:
# Each dictionary key becomes a column label and each list supplies that
# column's values, top to bottom.
df2 = pd.DataFrame(
    data={
        'A': [10, 20],
        'B': [20, 100],
        'C': [30, 40],
        'D': [40, 50],
    }
)
print(df2)

    A    B   C   D
0  10   20  30  40
1  20  100  40  50


**Looking at the variable types**

In [4]:
# One dtype per column; every input list holds only ints, so each column is int64.
df2.dtypes

A    int64
B    int64
C    int64
D    int64
dtype: object

**Looking at the first few rows (default 5)**

In [5]:
# No argument given, so head() falls back to its default of 5 rows.
df.head()

Unnamed: 0,A,B,C,D
0,0.799568,0.503289,0.885045,0.453443
1,0.231304,0.01298,0.102424,0.176636
2,0.323899,0.481525,0.09069,0.892677
3,0.247167,0.879233,0.032647,0.440945
4,0.084348,0.402155,0.257733,0.67741


**Looking at the last few rows**

In [6]:
# With no argument, tail() returns the last 5 rows.
df.tail()

Unnamed: 0,A,B,C,D
5,0.440435,0.498661,0.569819,0.247461
6,0.0229,0.62809,0.60829,0.580947
7,0.115451,0.036736,0.534478,0.060111
8,0.397987,0.285723,0.37284,0.393917
9,0.882694,0.59845,0.39179,0.484944


**To get the index**


In [7]:
# No index was passed when df was built, so this is the default RangeIndex (0 to 9).
df.index

RangeIndex(start=0, stop=10, step=1)

**To get the column names**

In [8]:
# The column labels as an Index, in the order they were given at construction.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

**To get the values**

In [9]:
# to_numpy() is the accessor the pandas documentation recommends over .values;
# for this all-float frame it yields the same 2-D float64 array (rows x columns).
df.to_numpy()

array([[0.79956765, 0.50328863, 0.88504532, 0.45344291],
       [0.23130372, 0.01298   , 0.10242414, 0.17663606],
       [0.32389932, 0.48152532, 0.09068981, 0.89267703],
       [0.24716701, 0.87923266, 0.03264696, 0.44094514],
       [0.08434782, 0.40215547, 0.25773331, 0.67741042],
       [0.44043475, 0.49866138, 0.56981945, 0.24746106],
       [0.02290036, 0.62809018, 0.60828953, 0.58094722],
       [0.11545064, 0.03673577, 0.53447755, 0.06011148],
       [0.39798749, 0.28572269, 0.37283958, 0.39391733],
       [0.88269401, 0.59845035, 0.39179022, 0.48494441]])

**To get quick statistics about the data**

In [10]:
# Per-column summary: count, mean, std, min, the 25%/50%/75% quantiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.354575,0.432684,0.384576,0.440849
std,0.289457,0.265047,0.271501,0.244088
min,0.0229,0.01298,0.032647,0.060111
25%,0.144414,0.314831,0.141251,0.284075
50%,0.285533,0.490093,0.382315,0.447194
75%,0.429823,0.57466,0.560984,0.556947
max,0.882694,0.879233,0.885045,0.892677


**For sorting by the index**

In [11]:
# Order the rows by their index labels, highest label first.
df.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D
9,0.882694,0.59845,0.39179,0.484944
8,0.397987,0.285723,0.37284,0.393917
7,0.115451,0.036736,0.534478,0.060111
6,0.0229,0.62809,0.60829,0.580947
5,0.440435,0.498661,0.569819,0.247461
4,0.084348,0.402155,0.257733,0.67741
3,0.247167,0.879233,0.032647,0.440945
2,0.323899,0.481525,0.09069,0.892677
1,0.231304,0.01298,0.102424,0.176636
0,0.799568,0.503289,0.885045,0.453443


**For sorting by a column**

In [12]:
# Reorder the rows by the values in column A, smallest first (the default direction).
df.sort_values(by='A', ascending=True)

Unnamed: 0,A,B,C,D
6,0.0229,0.62809,0.60829,0.580947
4,0.084348,0.402155,0.257733,0.67741
7,0.115451,0.036736,0.534478,0.060111
1,0.231304,0.01298,0.102424,0.176636
3,0.247167,0.879233,0.032647,0.440945
2,0.323899,0.481525,0.09069,0.892677
8,0.397987,0.285723,0.37284,0.393917
5,0.440435,0.498661,0.569819,0.247461
0,0.799568,0.503289,0.885045,0.453443
9,0.882694,0.59845,0.39179,0.484944


## Selecting Values


**Selecting a column**

In [13]:
# Bracket access with a single column label returns that column as a Series.
df['A']

0    0.799568
1    0.231304
2    0.323899
3    0.247167
4    0.084348
5    0.440435
6    0.022900
7    0.115451
8    0.397987
9    0.882694
Name: A, dtype: float64

**Selecting a range of rows**


In [14]:
# Plain df[...] selects columns for a label but rows for a slice, which is easy
# to misread; .iloc states explicitly that this is a positional row slice
# (positions 0 and 1, end-exclusive) and returns the same two rows.
df.iloc[0:2]

Unnamed: 0,A,B,C,D
0,0.799568,0.503289,0.885045,0.453443
1,0.231304,0.01298,0.102424,0.176636


**Selecting a particular row**


In [15]:
# .loc is label-based: this is the row whose index label is 0, returned as a Series.
df.loc[0]

A    0.799568
B    0.503289
C    0.885045
D    0.453443
Name: 0, dtype: float64

**Selecting a row and column**

In [16]:
# Passing the column as a list (['A']) keeps the result a one-element Series, not a bare scalar.
df.loc[0,['A']]

A    0.799568
Name: 0, dtype: float64

In [17]:
# ':' selects every row; the list selector keeps the result a one-column DataFrame.
df.loc[:,['A']]

Unnamed: 0,A
0,0.799568
1,0.231304
2,0.323899
3,0.247167
4,0.084348
5,0.440435
6,0.0229
7,0.115451
8,0.397987
9,0.882694


**Using `at` instead for faster access to a single value; it can't be used for a range**

In [18]:
# .at takes exactly one row label and one column label and returns the scalar itself.
df.at[0,'A']

0.7995676500056373

**Selecting by position**

In [19]:
# .iloc is position-based: the row at position 0 (the same row as label 0 here, since the index is 0..9).
df.iloc[0]

A    0.799568
B    0.503289
C    0.885045
D    0.453443
Name: 0, dtype: float64

**Selecting by condition**


In [20]:
# Boolean-mask row filter: keep only the rows whose value in column A is positive.
df.loc[df['A'] > 0]

Unnamed: 0,A,B,C,D
0,0.799568,0.503289,0.885045,0.453443
1,0.231304,0.01298,0.102424,0.176636
2,0.323899,0.481525,0.09069,0.892677
3,0.247167,0.879233,0.032647,0.440945
4,0.084348,0.402155,0.257733,0.67741
5,0.440435,0.498661,0.569819,0.247461
6,0.0229,0.62809,0.60829,0.580947
7,0.115451,0.036736,0.534478,0.060111
8,0.397987,0.285723,0.37284,0.393917
9,0.882694,0.59845,0.39179,0.484944


## Missing Data

In [21]:
# Keep each value that is below 0.5 and turn every other cell into NaN, giving
# a frame with missing data to work on. Note that this rebinds the name df2.
df2 = df.where(df < 0.5)
print(df2)

          A         B         C         D
0       NaN       NaN       NaN  0.453443
1  0.231304  0.012980  0.102424  0.176636
2  0.323899  0.481525  0.090690       NaN
3  0.247167       NaN  0.032647  0.440945
4  0.084348  0.402155  0.257733       NaN
5  0.440435  0.498661       NaN  0.247461
6  0.022900       NaN       NaN       NaN
7  0.115451  0.036736       NaN  0.060111
8  0.397987  0.285723  0.372840  0.393917
9       NaN       NaN  0.391790  0.484944


**Drop all rows which have NaN**


In [22]:
# Shows only the rows that contain no NaN (the result is not assigned back to df2).
df2.dropna()

Unnamed: 0,A,B,C,D
1,0.231304,0.01298,0.102424,0.176636
8,0.397987,0.285723,0.37284,0.393917


**Replace NaN with other values**


In [23]:
# Shows the frame with every NaN replaced by 1 (the result is not assigned back to df2).
df2.fillna(value=1)

Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,0.453443
1,0.231304,0.01298,0.102424,0.176636
2,0.323899,0.481525,0.09069,1.0
3,0.247167,1.0,0.032647,0.440945
4,0.084348,0.402155,0.257733,1.0
5,0.440435,0.498661,1.0,0.247461
6,0.0229,1.0,1.0,1.0
7,0.115451,0.036736,1.0,0.060111
8,0.397987,0.285723,0.37284,0.393917
9,1.0,1.0,0.39179,0.484944


## Statistical Operations

**Taking the mean**

In [24]:
# With no axis argument the mean is taken down each column, giving one value per column.
df.mean()

A    0.354575
B    0.432684
C    0.384576
D    0.440849
dtype: float64

**Mean along the other axis**

In [25]:
# axis=1 averages across the columns, producing one mean per row.
df.mean(axis=1)

0    0.660336
1    0.130836
2    0.447198
3    0.399998
4    0.355412
5    0.439094
6    0.460057
7    0.186694
8    0.362617
9    0.589470
dtype: float64