In [2]:
import pandas as pd
import numpy.random as rand
import numpy as np

In [3]:
# Fixed seed so the random frame (and every output derived from it) is
# reproducible after Restart Kernel -> Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# 4 rows x 5 columns of standard-normal samples, labelled 'a'..'e'.
test_df = pd.DataFrame(rng.standard_normal((4, 5)), columns=['a', 'b', 'c', 'd', 'e'])
test_df


Unnamed: 0,a,b,c,d,e
0,-1.269977,-1.17737,1.158399,-0.016085,0.203176
1,-0.519371,-0.260523,-0.215049,-0.487501,1.459452
2,0.428743,0.699861,-0.112447,0.096921,-0.79886
3,-2.234911,-0.776609,-0.083039,0.412606,-0.8475


In [89]:
# The `columns` attribute holds the column labels as a pandas Index.
test_df.columns

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

How to create new columns derived from existing columns in Pandas?

In [90]:
# New column 'f' is the element-wise sum of columns 'a' and 'e'.
test_df['f'] = test_df['a'] + test_df['e']
test_df

Unnamed: 0,a,b,c,d,e,f
0,-0.898224,-1.221238,0.972089,1.408249,0.305752,-0.592472
1,0.326077,-0.664637,-1.171334,0.581604,-0.095679,0.230398
2,1.263796,0.225799,0.359943,-0.235585,-1.416931,-0.153135
3,0.996124,0.641902,-0.937678,-0.968739,0.235575,1.231699


A column in a df has boolean True/False values, but for further calculations, we need 1/0 representation. How would you transform it?

In [91]:
# Flag the rows whose 'b' value is strictly positive (True/False column).
test_df['bool'] = test_df['b'].gt(0)
test_df

Unnamed: 0,a,b,c,d,e,f,bool
0,-0.898224,-1.221238,0.972089,1.408249,0.305752,-0.592472,False
1,0.326077,-0.664637,-1.171334,0.581604,-0.095679,0.230398,False
2,1.263796,0.225799,0.359943,-0.235585,-1.416931,-0.153135,True
3,0.996124,0.641902,-0.937678,-0.968739,0.235575,1.231699,True


In [92]:
# Convert True/False to 1/0 by replacing the whole column.
# Plain column assignment swaps in the new int64 column; writing through
# `.loc[:, 'bool']` instead tries to store the integers inside the existing
# bool column, which pandas 2.x reports as an incompatible-dtype assignment.
# 'int64' is spelled out so the dtype matches the 'int64' filter used later
# in this notebook regardless of platform.
test_df['bool'] = test_df['bool'].astype('int64')

Describe how you would get the names of the columns of a DataFrame in Pandas.

In [93]:
# Materialise the column labels as a plain Python list.
list(test_df.columns)

['a', 'b', 'c', 'd', 'e', 'f', 'bool']

In [94]:
# Same labels, returned as an alphabetically sorted list.
sorted(test_df.columns.tolist())

['a', 'b', 'bool', 'c', 'd', 'e', 'f']

How are `iloc[]` and `loc[]` different?

In [79]:
# .iloc selects by integer position; like any Python slice the end is
# exclusive, so this is rows at positions 1-2 and the first two columns.
# A notebook cell only renders its last expression, so this result is shown
# explicitly with display() (provided by the IPython kernel).
display(test_df.iloc[1:3, 0:2])

# .loc selects by index label and column name; a label slice includes its
# end point, so `:2` returns the rows labelled 0, 1 and 2.
test_df.loc[:2, ['a', 'e']]

Unnamed: 0,a,e
0,0.817888,-0.985605
1,-1.201152,-0.205126
2,2.479945,-0.360293


 How can you sort the DataFrame?

In [97]:
# Order rows by 'a', breaking ties with 'b'; the result is displayed, not stored.
test_df.sort_values(by=['a', 'b'])

Unnamed: 0,a,b,c,d,e,f,bool
0,-0.898224,-1.221238,0.972089,1.408249,0.305752,-0.592472,0
1,0.326077,-0.664637,-1.171334,0.581604,-0.095679,0.230398,0
3,0.996124,0.641902,-0.937678,-0.968739,0.235575,1.231699,1
2,1.263796,0.225799,0.359943,-0.235585,-1.416931,-0.153135,1


How can you find the row for which the value of a specific column is max or min?

In [101]:
# Keep the rows where 'a' equals the column maximum, then take the first
# matching index label.
test_df.loc[test_df['a'].eq(test_df['a'].max())].index[0]

2

In [102]:
# Shortcut for the mask-based lookup: index label of the row holding the
# maximum of 'a' (use idxmin() for the minimum).
test_df['a'].idxmax()

2

How can you get a list of Pandas DataFrame columns based on data type?

In [108]:
# `dtypes` is a Series mapping column name -> dtype; keep the int64 entries
# and read the column names off its index.
test_df.dtypes.loc[lambda dtype_by_col: dtype_by_col == 'int64'].index

Index(['bool'], dtype='object')

In [111]:
# select_dtypes keeps only the float64 columns; list their names in sorted order.
sorted(test_df.select_dtypes(include=['float64']).columns)

['a', 'b', 'c', 'd', 'e', 'f']

How does the groupby() method work in Pandas?

In [115]:
# groupby follows the split-apply-combine pattern:
#   SPLIT   - partition the rows by the value of the 'bool' column
#   APPLY   - compute the mean of 'a' inside each partition
#   COMBINE - assemble the per-group results, indexed by the group keys
test_df.groupby(by='bool')['a'].mean()

bool
0   -0.286074
1    1.129960
Name: a, dtype: float64

How to split a string column in a DataFrame into two columns?

In [30]:
# Splitting on the double space yields four two-token strings
# ('f d', 's d', 'w e', 'l k'), one per row. The list is assigned directly;
# wrapping it in a 2-D (4, 1) NumPy array is unnecessary for a single column.
test_df['str_col'] = 'f d  s d  w e  l k'.split('  ')

In [28]:
# Display the frame to inspect the string column.
test_df

Unnamed: 0,a,b,c,d,e,str_col
0,-1.269977,-1.17737,1.158399,-0.016085,0.203176,f d
1,-0.519371,-0.260523,-0.215049,-0.487501,1.459452,s d
2,0.428743,0.699861,-0.112447,0.096921,-0.79886,w e
3,-2.234911,-0.776609,-0.083039,0.412606,-0.8475,l k


In [18]:
# Split each string on whitespace into one column per token.
# Indexing the raw string (x[0], x[1]) picks single characters, so x[1]
# would be the space separator rather than the second token.
tokens = test_df['str_col'].str.split(expand=True)

# Second token goes to a new column; the first token overwrites 'str_col'.
# Both are taken from `tokens`, computed before 'str_col' is overwritten.
test_df['str_col_2'] = tokens[1]
test_df['str_col'] = tokens[0]

In [22]:
# Remove the helper column again.
test_df = test_df.drop(columns=['str_col_2'])

In [32]:
# Vectorised split on whitespace (pat=None); expand=True returns one column
# per token instead of a column of lists.
test_df['str_col'].str.split(pat=None, expand=True)

Unnamed: 0,0,1
0,f,d
1,s,d
2,w,e
3,l,k


How to check whether a Pandas DataFrame is empty?

In [34]:
# A DataFrame built without any data has no rows and no columns.
empt_df = pd.DataFrame()

In [44]:
# The `empty` attribute answers the question directly; it is True for the
# frame created without data.
empt_df.empty

True

How would you iterate over rows in a DataFrame in Pandas?

In [45]:
# Display the frame that the loop in the next cell iterates over.
test_df

Unnamed: 0,a,b,c,d,e,str_col
0,-1.269977,-1.17737,1.158399,-0.016085,0.203176,f d
1,-0.519371,-0.260523,-0.215049,-0.487501,1.459452,s d
2,0.428743,0.699861,-0.112447,0.096921,-0.79886,w e
3,-2.234911,-0.776609,-0.083039,0.412606,-0.8475,l k


In [58]:
# iterrows() yields (index label, row-as-Series) pairs; the label is unused here.
for _, row in test_df.iterrows():
    print(row['a'], row['str_col'])

-1.2699773996930952 f d
-0.5193712753001253 s d
0.4287425536418248 w e
-2.234911469783308 l k


What are the operations that the Pandas groupby() method is based on?

In [None]:
# Split, Apply, Combine

What do the percentile values in the describe() output tell us about our data?

In [59]:
# Summary statistics for the numeric columns. The percentiles are spelled out
# (these are the defaults): 25% / 50% / 75% are the values below which a
# quarter, half and three quarters of the observations fall.
test_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,a,b,c,d,e
count,4.0,4.0,4.0,4.0,4.0
mean,-0.898879,-0.378661,0.186966,0.001485,0.004067
std,1.129794,0.811062,0.65009,0.37307,1.084381
min,-2.234911,-1.17737,-0.215049,-0.487501,-0.8475
25%,-1.511211,-0.8768,-0.138097,-0.133939,-0.81102
50%,-0.894674,-0.518566,-0.097743,0.040418,-0.297842
75%,-0.282343,-0.020427,0.227321,0.175842,0.517245
max,0.428743,0.699861,1.158399,0.412606,1.459452


 Compare the Pandas methods: map(), applymap(), apply()

In [61]:
# Series.map: substitute each value through the mapping; values that are not
# keys of the mapping become NaN.
test_df['str_col'].map({'f d': 'val1', 'w e': 'val2'})

0    val1
1     NaN
2    val2
3     NaN
Name: str_col, dtype: object

In [64]:
# Series.apply: call the function once per value of the selected column.
test_df['a'].apply(lambda value: value + 1)

0   -0.269977
1    0.480629
2    1.428743
3   -1.234911
Name: a, dtype: float64

In [66]:
# DataFrame.applymap: call the function on every individual cell of the
# selected columns (element-wise over the whole frame).
# NOTE(review): recent pandas releases rename this method to DataFrame.map
# and deprecate applymap - confirm against the installed version.
test_df.loc[:, ['a', 'b', 'c']].applymap(lambda value: value + 1)

Unnamed: 0,a,b,c
0,-0.269977,-0.17737,2.158399
1,0.480629,0.739477,0.784951
2,1.428743,1.699861,0.887553
3,-1.234911,0.223391,0.916961
