In [1]:
import numpy as np
import pandas as pd

### Renaming a column
[source](https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas)

In [2]:
# Small two-column frame whose column labels carry a '$' prefix
dollar_columns = {
    '$a': [1, 2],
    '$b': [10, 20],
}
df = pd.DataFrame(dollar_columns)
df

Unnamed: 0,$a,$b
0,1,10
1,2,20


In [3]:
# Map every old label to its replacement; rename() hands back a new frame
label_map = {'$a': 'a', '$b': 'b'}
df = df.rename(columns=label_map)
df

Unnamed: 0,a,b
0,1,10
1,2,20


### Selecting rows based on column value
[source](https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas)

In [4]:
# Eight-row example frame: A and B are split from space-separated text,
# C counts 0..7 and D holds the same counter multiplied by two
a_values = 'foo bar foo bar foo bar foo foo'.split()
b_values = 'one one two three two two one three'.split()
df = pd.DataFrame({
    'A': a_values,
    'B': b_values,
    'C': np.arange(8),
    'D': np.arange(8) * 2,
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [5]:
# Boolean mask that is True wherever column A equals 'foo'
is_foo = df['A'] == 'foo'
df.loc[is_foo]

Unnamed: 0,A,B,C,D
0,foo,one,0,0
2,foo,two,2,4
4,foo,two,4,8
6,foo,one,6,12
7,foo,three,7,14


In [6]:
# Keep the rows whose B value is one of the accepted labels
accepted = ['one', 'three']
b_matches = df['B'].isin(accepted)
df.loc[b_matches]

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
3,bar,three,3,6
6,foo,one,6,12
7,foo,three,7,14


### Iterating over the rows of a DataFrame
[source](https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas)

In [7]:
# One dict per row; the dict keys become the column labels
records = [
    {'c1': 10, 'c2': 100},
    {'c1': 11, 'c2': 110},
    {'c1': 12, 'c2': 120},
]
df = pd.DataFrame(records)
df

Unnamed: 0,c1,c2
0,10,100
1,11,110
2,12,120


In [8]:
# iterrows() yields (index label, row) pairs; only the row is used here
for row_label, record in df.iterrows():
    print(record['c1'], record['c2'])

10 100
11 110
12 120


### Delete a specific column of a DataFrame
[source](https://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe-by-column-name)

In [9]:
# Recreate the eight-row example frame (string columns A/B, integer columns C/D)
text_a = 'foo bar foo bar foo bar foo foo'
text_b = 'one one two three two two one three'
frame_data = {
    'A': text_a.split(),
    'B': text_b.split(),
    'C': np.arange(8),
    'D': np.arange(8) * 2,
}
df = pd.DataFrame(frame_data)
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [10]:
# Remove column 'A' by name. The `columns=` keyword replaces the positional
# axis argument (`df.drop(['A'], 1)`), which pandas deprecated in 1.3 and
# rejects with a TypeError from 2.0 onwards.
df = df.drop(columns=['A'])
df

Unnamed: 0,B,C,D
0,one,0,0
1,one,1,2
2,two,2,4
3,three,3,6
4,two,4,8
5,two,5,10
6,one,6,12
7,three,7,14


### Get DataFrame columns as a list
[source](https://stackoverflow.com/questions/19482970/get-list-from-pandas-dataframe-column-headers)

In [11]:
# Three-row frame built from a list of per-row dicts
row_dicts = [
    {'c1': 10, 'c2': 100},
    {'c1': 11, 'c2': 110},
    {'c1': 12, 'c2': 120},
]
df = pd.DataFrame(row_dicts)
df

Unnamed: 0,c1,c2
0,10,100
1,11,110
2,12,120


In [12]:
# .values exposes the column labels as an array; list() turns it into a plain list
label_array = df.columns.values
list(label_array)

['c1', 'c2']

In [13]:
# Same result through the Index's own to_list() method
column_index = df.columns
column_index.to_list()

['c1', 'c2']

### Adding new column to existing DataFrame
[source](https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas)

In [14]:
# Fresh copy of the eight-row example frame to add columns to
col_a = 'foo bar foo bar foo bar foo foo'.split()
col_b = 'one one two three two two one three'.split()
col_c = np.arange(8)
col_d = np.arange(8) * 2
df = pd.DataFrame({'A': col_a, 'B': col_b, 'C': col_c, 'D': col_d})
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [15]:
# One standard-normal draw per row, wrapped in a Series that shares df's index.
# No seed is set, so the values differ on every run.
noise = np.random.randn(len(df))
df['E'] = pd.Series(noise, index=df.index)
df

Unnamed: 0,A,B,C,D,E
0,foo,one,0,0,-0.316905
1,bar,one,1,2,0.939153
2,foo,two,2,4,-0.873642
3,bar,three,3,6,1.223048
4,foo,two,4,8,-2.210357
5,bar,two,5,10,0.393299
6,foo,one,6,12,-0.423915
7,foo,three,7,14,-1.60596


In [16]:
# Assigning a scalar fills every row of the new column with that value
filler = 'new column'
df['F'] = filler
df

Unnamed: 0,A,B,C,D,E,F
0,foo,one,0,0,-0.316905,new column
1,bar,one,1,2,0.939153,new column
2,foo,two,2,4,-0.873642,new column
3,bar,three,3,6,1.223048,new column
4,foo,two,4,8,-2.210357,new column
5,bar,two,5,10,0.393299,new column
6,foo,one,6,12,-0.423915,new column
7,foo,three,7,14,-1.60596,new column


### Selecting multiple columns in a DataFrame
[source](https://stackoverflow.com/questions/11285613/selecting-multiple-columns-in-a-pandas-dataframe)

In [17]:
# Eight-row example frame used for the column-selection cells
df = pd.DataFrame(
    {
        'A': 'foo bar foo bar foo bar foo foo'.split(),
        'B': 'one one two three two two one three'.split(),
        'C': np.arange(8),
        'D': np.arange(8) * 2,
    }
)
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [18]:
# Indexing with a list of labels returns a frame holding just those columns
wanted_columns = ['A', 'B']
df[wanted_columns]

Unnamed: 0,A,B
0,foo,one
1,bar,one
2,foo,two
3,bar,three
4,foo,two
5,bar,two
6,foo,one
7,foo,three


In [19]:
# Keep the two-column selection under its own name
subset_labels = ['A', 'B']
df_new = df[subset_labels]
df_new

Unnamed: 0,A,B
0,foo,one
1,bar,one
2,foo,two
3,bar,three
4,foo,two
5,bar,two
6,foo,one
7,foo,three


### Get the row count of a DataFrame
[source](https://stackoverflow.com/questions/15943769/how-do-i-get-the-row-count-of-a-pandas-dataframe)

In [20]:
# Eight-row example frame whose size is inspected in the following cells
counter = np.arange(8)
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': counter,
    'D': counter * 2,
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [21]:
# shape is a (row count, column count) tuple
n_rows, n_cols = df.shape
(n_rows, n_cols)

(8, 4)

In [22]:
# The length of the row index is the number of rows
row_index = df.index
len(row_index)

8

### Add one row to a DataFrame
[source](https://stackoverflow.com/questions/10715965/add-one-row-to-pandas-dataframe)

In [23]:
# Start from a frame that has column labels but no rows yet
column_labels = ['lib', 'qty1', 'qty2']
df = pd.DataFrame(columns=column_labels)
df

Unnamed: 0,lib,qty1,qty2


In [24]:
# Assigning to a row label that does not exist yet appends that row.
# Each row gets three random integers drawn from {-1, 0}.
for row_label in range(5):
    df.loc[row_label] = [np.random.randint(-1, 1) for _ in range(3)]
df

Unnamed: 0,lib,qty1,qty2
0,-1,-1,-1
1,-1,0,0
2,-1,-1,0
3,-1,0,0
4,-1,-1,-1


### Change the order of DataFrame columns
[source](https://stackoverflow.com/questions/13148429/how-to-change-the-order-of-dataframe-columns)

In [25]:
# 10x5 frame of uniform random numbers plus a per-row average column
random_block = np.random.rand(10, 5)
df = pd.DataFrame(random_block)
df['mean'] = df.mean(axis=1)  # axis=1 averages across the columns of each row
df

Unnamed: 0,0,1,2,3,4,mean
0,0.422634,0.325947,0.049343,0.343954,0.464668,0.321309
1,0.430274,0.984123,0.818809,0.807032,0.273269,0.662701
2,0.930127,0.943614,0.04518,0.679608,0.988362,0.717378
3,0.347032,0.33329,0.174626,0.8176,0.458351,0.42618
4,0.125476,0.5111,0.075933,0.604488,0.301831,0.323766
5,0.258546,0.940074,0.899509,0.971707,0.903345,0.794636
6,0.633083,0.329916,0.974874,0.836983,0.626724,0.680316
7,0.83977,0.50992,0.447709,0.452522,0.339687,0.517922
8,0.206143,0.333361,0.793817,0.522222,0.866288,0.544366
9,0.605034,0.140023,0.639174,0.192502,0.766322,0.468611


In [26]:
# Pull the column labels out as a plain Python list so they can be rearranged
column_index = df.columns
cols = column_index.tolist()
cols

[0, 1, 2, 3, 4, 'mean']

In [27]:
# Move the last label to the front and keep the rest in their current order
last_label = cols[-1:]
leading_labels = cols[:-1]
cols = last_label + leading_labels
cols

['mean', 0, 1, 2, 3, 4]

In [28]:
# Selecting with the rearranged label list returns the columns in that order
df = df.loc[:, cols]
df

Unnamed: 0,mean,0,1,2,3,4
0,0.321309,0.422634,0.325947,0.049343,0.343954,0.464668
1,0.662701,0.430274,0.984123,0.818809,0.807032,0.273269
2,0.717378,0.930127,0.943614,0.04518,0.679608,0.988362
3,0.42618,0.347032,0.33329,0.174626,0.8176,0.458351
4,0.323766,0.125476,0.5111,0.075933,0.604488,0.301831
5,0.794636,0.258546,0.940074,0.899509,0.971707,0.903345
6,0.680316,0.633083,0.329916,0.974874,0.836983,0.626724
7,0.517922,0.83977,0.50992,0.447709,0.452522,0.339687
8,0.544366,0.206143,0.333361,0.793817,0.522222,0.866288
9,0.468611,0.605034,0.140023,0.639174,0.192502,0.766322


In [29]:
# Spell out the desired order explicitly.
# Column 4 is not in the list, so it is absent from the resulting frame.
explicit_order = [0, 'mean', 1, 2, 3]
df = df[explicit_order]
df

Unnamed: 0,0,mean,1,2,3
0,0.422634,0.321309,0.325947,0.049343,0.343954
1,0.430274,0.662701,0.984123,0.818809,0.807032
2,0.930127,0.717378,0.943614,0.04518,0.679608
3,0.347032,0.42618,0.33329,0.174626,0.8176
4,0.125476,0.323766,0.5111,0.075933,0.604488
5,0.258546,0.794636,0.940074,0.899509,0.971707
6,0.633083,0.680316,0.329916,0.974874,0.836983
7,0.83977,0.517922,0.50992,0.447709,0.452522
8,0.206143,0.544366,0.333361,0.793817,0.522222
9,0.605034,0.468611,0.140023,0.639174,0.192502


### Change data type of columns
[source](https://stackoverflow.com/questions/15891038/change-data-type-of-columns-in-pandas)

In [30]:
# Every cell is a string, including the ones that look numeric
string_rows = [
    ['a', '1.2', '4.2'],
    ['b', '70', '0.03'],
    ['x', '5', '0'],
]
df = pd.DataFrame(string_rows)
print(df.dtypes)
df

0    object
1    object
2    object
dtype: object


Unnamed: 0,0,1,2
0,a,1.2,4.2
1,b,70.0,0.03
2,x,5.0,0.0


In [31]:
# Parse the strings of column 1 into numbers and store them back in place
parsed_column = pd.to_numeric(df[1])
df[1] = parsed_column
print(df.dtypes)
df

0     object
1    float64
2     object
dtype: object


Unnamed: 0,0,1,2
0,a,1.2,4.2
1,b,70.0,0.03
2,x,5.0,0.0


In [32]:
# Cast column 2 to float with astype() instead of to_numeric()
as_float = df[2].astype(float)
df[2] = as_float
print(df.dtypes)
df

0     object
1    float64
2    float64
dtype: object


Unnamed: 0,0,1,2
0,a,1.2,4.2
1,b,70.0,0.03
2,x,5.0,0.0


### Drop rows of a pandas DataFrame whose value in certain columns is NaN
[source](https://stackoverflow.com/questions/13413590/how-to-drop-rows-of-pandas-dataframe-whose-value-in-certain-columns-is-nan)

In [33]:
# 10x3 frame of standard-normal values with NaNs punched in at regular strides:
# every 2nd row of column 0, every 4th row of column 1, every 3rd row of column 2
df = pd.DataFrame(np.random.randn(10, 3))
for column_pos, stride in enumerate((2, 4, 3)):
    df.iloc[::stride, column_pos] = np.nan
df

Unnamed: 0,0,1,2
0,,,
1,-0.937624,1.197154,1.050182
2,,0.33306,-0.173356
3,-0.584068,0.49228,
4,,,-1.563644
5,-0.626405,0.738002,0.655903
6,,0.742421,
7,-0.781651,-0.946265,-0.79095
8,,,-1.96169
9,-1.401922,-0.591301,


In [34]:
# Discard every row that contains at least one NaN ('any' is the default mode)
df.dropna(how='any')

Unnamed: 0,0,1,2
1,-0.937624,1.197154,1.050182
5,-0.626405,0.738002,0.655903
7,-0.781651,-0.946265,-0.79095


In [35]:
# Discard a row only when every one of its columns is NaN
drop_mode = 'all'
df.dropna(how=drop_mode)

Unnamed: 0,0,1,2
1,-0.937624,1.197154,1.050182
2,,0.33306,-0.173356
3,-0.584068,0.49228,
4,,,-1.563644
5,-0.626405,0.738002,0.655903
6,,0.742421,
7,-0.781651,-0.946265,-0.79095
8,,,-1.96169
9,-1.401922,-0.591301,


In [36]:
# Keep only the rows that hold at least two non-NaN values
min_non_nan = 2
df.dropna(thresh=min_non_nan)

Unnamed: 0,0,1,2
1,-0.937624,1.197154,1.050182
2,,0.33306,-0.173356
3,-0.584068,0.49228,
5,-0.626405,0.738002,0.655903
7,-0.781651,-0.946265,-0.79095
9,-1.401922,-0.591301,


In [37]:
# Discard a row only when the listed column (label 1) is NaN
required_columns = [1]
df.dropna(subset=required_columns)

Unnamed: 0,0,1,2
1,-0.937624,1.197154,1.050182
2,,0.33306,-0.173356
3,-0.584068,0.49228,
5,-0.626405,0.738002,0.655903
6,,0.742421,
7,-0.781651,-0.946265,-0.79095
9,-1.401922,-0.591301,
