In [1]:
import pandas as pd
import numpy as np

## Data Loading & Writing Data

In [2]:
# Load the CSV with default settings: the file's first line becomes the column names.
df = pd.read_csv('example.csv')

In [3]:
# Display the loaded frame: numeric columns a-d plus a text column `message`.
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo
3,13,14,15,16,test
4,5,6,7,8,world
5,13,14,15,16,test
6,5,6,7,8,world


### If you don't want the first line used as the column header

In [5]:
# header=None: no line is treated as a header, so columns are numbered 0..4
# and the file's own header line (a,b,c,d,message) shows up as data row 0.
pd.read_csv('example.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo
4,13,14,15,16,test
5,5,6,7,8,world
6,13,14,15,16,test
7,5,6,7,8,world


### If you want to supply your own column names

In [6]:
# names=[...] sets the column labels. NOTE: because header=0 is not passed, the
# file's own header line (a,b,c,d,message) is kept as data row 0 in the result;
# pass header=0 as well if the existing header should be replaced instead.
pd.read_csv('example.csv', names=['one','two','three','four','message'])

Unnamed: 0,one,two,three,four,message
0,a,b,c,d,message
1,1,2,3,4,hello
2,5,6,7,8,world
3,9,10,11,12,foo
4,13,14,15,16,test
5,5,6,7,8,world
6,13,14,15,16,test
7,5,6,7,8,world


### Skipping rows and limiting how many rows are read

In [7]:
# skiprows takes 0-based line numbers of the file (line 0 is the header line),
# so [2, 4] removes the 2nd and 4th data rows before parsing.
pd.read_csv('example.csv', skiprows=[2,4])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,9,10,11,12,foo
2,5,6,7,8,world
3,13,14,15,16,test
4,5,6,7,8,world


In [8]:
# nrows=2: read only the first two data rows (the header line is not counted).
pd.read_csv('example.csv', nrows=2)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world


In [9]:
# Column labels of the frame, returned as a pandas Index.
df.columns

Index(['a', 'b', 'c', 'd', 'message'], dtype='object')

In [10]:
# Selecting a single column by label yields a one-dimensional Series.
df['message']

0    hello
1    world
2      foo
3     test
4    world
5     test
6    world
Name: message, dtype: object

In [11]:
# Confirm the type of a single selected column.
type(df['message'])

pandas.core.series.Series

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max); only the numeric
# columns a-d are summarized, the text column `message` is left out.
df.describe()

Unnamed: 0,a,b,c,d
count,7.0,7.0,7.0,7.0
mean,7.285714,8.285714,9.285714,10.285714
std,4.535574,4.535574,4.535574,4.535574
min,1.0,2.0,3.0,4.0
25%,5.0,6.0,7.0,8.0
50%,5.0,6.0,7.0,8.0
75%,11.0,12.0,13.0,14.0
max,13.0,14.0,15.0,16.0


In [13]:
# Frequency of each distinct value in `message`, most frequent first.
df['message'].value_counts()

world    3
test     2
foo      1
hello    1
Name: message, dtype: int64

In [14]:
# Largest value in column `c`.
df['c'].max()

15

In [15]:
# Smallest value in column `c`.
df['c'].min()

3

## Handling Missing Values

In [16]:
# Load a second CSV whose empty fields are parsed as NaN (see the display below).
df2 = pd.read_csv('example2.csv')

In [17]:
# Display the frame: the numeric columns are floats and contain missing values.
df2

Unnamed: 0,a,b,c,d,message
0,1.0,10.0,3.0,7.0,hello
1,5.0,6.0,,8.0,world
2,,,,,foo
3,13.0,,15.0,16.0,test


In [18]:
# Boolean mask of the same shape: True wherever a value is missing.
df2.isnull()

Unnamed: 0,a,b,c,d,message
0,False,False,False,False,False
1,False,False,True,False,False
2,True,True,True,True,False
3,False,True,False,False,False


In [19]:
# isna() gives the same missing-value mask as isnull().
df2.isna()

Unnamed: 0,a,b,c,d,message
0,False,False,False,False,False
1,False,False,True,False,False
2,True,True,True,True,False
3,False,True,False,False,False


In [20]:
# Inverse mask: True wherever a value is present.
df2.notnull()

Unnamed: 0,a,b,c,d,message
0,True,True,True,True,True
1,True,True,False,True,True
2,False,False,False,False,True
3,True,False,True,True,True


In [21]:
# notna() gives the same present-value mask as notnull().
df2.notna()

Unnamed: 0,a,b,c,d,message
0,True,True,True,True,True
1,True,True,False,True,True
2,False,False,False,False,True
3,True,False,True,True,True


In [22]:
# Default dropna(): drop every row containing at least one missing value;
# only row 0 is complete, so it is the only row left.
df2.dropna()

Unnamed: 0,a,b,c,d,message
0,1.0,10.0,3.0,7.0,hello


In [23]:
# how='all': drop a row only if ALL of its values are missing. Nothing is dropped
# here because even row 2 still has a non-null `message`.
df2.dropna(how='all')

Unnamed: 0,a,b,c,d,message
0,1.0,10.0,3.0,7.0,hello
1,5.0,6.0,,8.0,world
2,,,,,foo
3,13.0,,15.0,16.0,test


In [24]:
# Small 4x4 numeric frame with deliberately placed gaps:
# row 0 is complete, row 2 is entirely NaN, rows 1 and 3 are partially missing.
data = pd.DataFrame(
    [
        [1, 3, 6, 7],
        [2, np.nan, 5, np.nan],
        [np.nan] * 4,
        [np.nan, 6.4, 2.8, 4],
    ]
)

In [25]:
# Display the frame; columns and rows are both labelled 0..3.
data

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,,5.0,
2,,,,
3,,6.4,2.8,4.0


In [26]:
# Drop rows with any NaN: only the fully populated row 0 remains.
data.dropna()

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0


In [27]:
# Drop only rows that are entirely NaN: row 2 is removed, partial rows are kept.
data.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,,5.0,
3,,6.4,2.8,4.0


In [28]:
# axis=1 drops COLUMNS with any NaN. Every column has at least one NaN (row 2 is
# all-NaN), so the result has no columns left and only the row index is shown.
data.dropna(axis=1)

0
1
2
3


In [29]:
# Drop only columns that are entirely NaN; no column qualifies, so nothing changes.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,,5.0,
2,,,,
3,,6.4,2.8,4.0


In [30]:
# Replace every NaN with 0. This returns a new frame; `data` itself is not modified.
data.fillna(0)

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,0.0,5.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,6.4,2.8,4.0


In [31]:
# Check: `data` still contains its NaNs, i.e. fillna(0) did not modify it.
data

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,,5.0,
2,,,,
3,,6.4,2.8,4.0


In [32]:
# A dict maps column label -> fill value, so each column gets its own replacement
# (column 0 -> 2, column 1 -> 2.5, column 2 -> 6.4, column 3 -> 3.33).
data.fillna({0:2,1:2.5,2:6.4,3:3.33})

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,2.5,5.0,3.33
2,2.0,2.5,6.4,3.33
3,2.0,6.4,2.8,4.0


In [33]:
# Check again: the per-column fill also returned a new frame and left `data` as it was.
data

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,,5.0,
2,,,,
3,,6.4,2.8,4.0


In [34]:
# Same per-column fill, but inplace=True modifies `data` itself, and the cell
# displays nothing. Non-mutating alternative: data = data.fillna({...}).
data.fillna({0:2,1:2.5,2:6.4,3:3.33}, inplace=True)

In [35]:
# `data` now holds the filled values because the previous cell mutated it in place.
data

Unnamed: 0,0,1,2,3
0,1.0,3.0,6.0,7.0
1,2.0,2.5,5.0,3.33
2,2.0,2.5,6.4,3.33
3,2.0,6.4,2.8,4.0


In [36]:
# Float Series with missing values at positions 1, 3 and 5.
ds = pd.Series(
    [1, np.nan, 3.5, np.nan, 7, np.nan, 11]
)

In [37]:
# Display the Series; the integer inputs appear as floats alongside the NaNs.
ds

0     1.0
1     NaN
2     3.5
3     NaN
4     7.0
5     NaN
6    11.0
dtype: float64

In [38]:
# Impute missing entries with the mean of the present values (5.625 here).
# This returns a new Series; `ds` is left unchanged.
ds.fillna(ds.mean())

0     1.000
1     5.625
2     3.500
3     5.625
4     7.000
5     5.625
6    11.000
dtype: float64

In [39]:
# Check: `ds` still has its NaNs after the non-inplace fill above.
ds

0     1.0
1     NaN
2     3.5
3     NaN
4     7.0
5     NaN
6    11.0
dtype: float64

In [40]:
# Same mean imputation, but inplace=True modifies `ds` itself, and the cell
# displays nothing. Non-mutating alternative: ds = ds.fillna(ds.mean()).
ds.fillna(ds.mean(), inplace=True)

In [41]:
# `ds` now contains the imputed values because the previous cell mutated it in place.
ds

0     1.000
1     5.625
2     3.500
3     5.625
4     7.000
5     5.625
6    11.000
dtype: float64