### Introduction to Pandas (Python DataFrames)

In [1]:
# how to create a dataframe
import pandas as pd
import numpy  as np
df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))
print(df)
print(df.shape)
print(df.index)
df.columns=["a","b","c"]
print(df.columns.values)
print(df)
indexlist = ["row_"+str(i) for i in range(0,2)]
df.index = indexlist
print(df)

   0  1  2
0  1  2  3
1  4  5  6
(2, 3)
RangeIndex(start=0, stop=2, step=1)
['a' 'b' 'c']
   a  b  c
0  1  2  3
1  4  5  6
       a  b  c
row_0  1  2  3
row_1  4  5  6


In [2]:
# how to access a row or column by name
print(df.loc['row_1',:])
print(df.loc[:,'b'])
# or by index
print(df.iloc[1,:])
print(df.iloc[:,1])

a    4
b    5
c    6
Name: row_1, dtype: int32
row_0    2
row_1    5
Name: b, dtype: int32
a    4
b    5
c    6
Name: row_1, dtype: int32
row_0    2
row_1    5
Name: b, dtype: int32


In [3]:
# accessing individual elements
# by row and column name
print(df.loc['row_1','b'])
# or by index
print(df.iloc[0,1])

5
2


In [4]:
# adding a column
df['d']=[89,90]
print(df)

       a  b  c   d
row_0  1  2  3  89
row_1  4  5  6  90


In [9]:
# adding a row
one=pd.DataFrame(np.array([[33,34,35,36]]))
one.columns=df.columns
one.index=["row_2"]
df2=pd.concat([df,one])
print(df2)

        a   b   c   d
row_0   1   2   3  89
row_1   4   5   6  90
row_2  33  34  35  36


In [72]:
# dropping columns
df2.loc[:,'e'] = pd.Series(['51', '61','71'], index=df2.index)
df2.drop('c', axis=1, inplace=True)
print(df2)
df2.drop(df2.columns[1],axis=1,inplace=True)
print(df2)

        a   b   d   e
row_0   1   2  89  51
row_1   4   5  90  61
row_2  33  34  36  71
        a   d   e
row_0   1  89  51
row_1   4  90  61
row_2  33  36  71


In [74]:
# dropping rows
print(df2)
df3=df2.drop(df2.index[1])
print(df3)

        a   d   e
row_0   1  89  51
row_1   4  90  61
row_2  33  36  71
        a   d   e
row_0   1  89  51
row_2  33  36  71


In [75]:
# renaming columns and index
df3.columns=['A','B','C']
df3.index=[0,1]
print(df3)

    A   B   C
0   1  89  51
1  33  36  71


In [89]:
z = pd.DataFrame(np.array([[1, 2, 3],[4, 5, 6],[7,8,9]]))
z.columns=["a","b","c"]
z.index = [i for i in range(0,3)]
print(z)
# select rows based on selected elements of a column
sel = z['a']>=4
z[sel]

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9


Unnamed: 0,a,b,c
1,4,5,6
2,7,8,9


In [151]:
# create a data frame with NaN's
df = pd.DataFrame(index=range(0,4),columns=['A','B'], dtype='float')
print(df)
# create a data frame with value 1.0
df = pd.DataFrame(1,index=range(0,4),columns=['A','B'], dtype='float')
# set row 1 to [2,3]
df.loc[1]=[2,3]
print(df)
# and apply a lambda to multiply all values with 2
df.apply(lambda x: x*2)

    A   B
0 NaN NaN
1 NaN NaN
2 NaN NaN
3 NaN NaN
     A    B
0  1.0  1.0
1  2.0  3.0
2  1.0  1.0
3  1.0  1.0


Unnamed: 0,A,B
0,2.0,2.0
1,4.0,6.0
2,2.0,2.0
3,2.0,2.0


In [90]:
# writing a data frame to a (tab-delimited) file
LETTERS = [chr(i) for i in range(65,65+26)]
# letters = [chr(i) for i in range(97,97+26)]
# print(LETTERS)
# print(letters)
x = np.random.random((100,26))
z = pd.DataFrame(data=x, columns=LETTERS)
# print(z)
z.to_csv('z100x26.txt', sep='\t')


           A         B         C         D         E         F         G  \
0   0.278071  0.740804  0.925906  0.367537  0.214324  0.262283  0.543779   
1   0.679101  0.353416  0.965890  0.288304  0.836071  0.214951  0.444700   
2   0.945715  0.279901  0.438485  0.224500  0.171580  0.516699  0.892480   
3   0.282437  0.293052  0.146134  0.290180  0.268977  0.566643  0.558605   
4   0.589821  0.273693  0.286441  0.015402  0.123820  0.817337  0.282559   
5   0.392515  0.227871  0.530259  0.946504  0.907158  0.706957  0.546290   
6   0.372466  0.036393  0.406320  0.402558  0.151619  0.022904  0.560205   
7   0.988781  0.988428  0.307309  0.357146  0.343865  0.498124  0.121418   
8   0.283421  0.718084  0.244823  0.492925  0.854933  0.260757  0.058170   
9   0.074840  0.267225  0.794192  0.043674  0.477426  0.684972  0.755390   
10  0.805960  0.937869  0.322819  0.199484  0.822056  0.873693  0.072740   
11  0.048046  0.894130  0.780798  0.399268  0.593345  0.539359  0.505126   
12  0.139786

In [98]:
# reading a dataframe from a file
mydf = pd.read_csv('z100x26.txt',sep="\t")
mydf.shape

(100, 27)