In [1]:
# all imports go at the top, even though python is a very open language
import pandas as pd
import numpy as np

## Series

In [2]:
# Normal series and series with custom index
s1 = pd.Series([1,2,3])
s2 = pd.Series([1,2,3, np.nan], index=list("abcd"))
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [3]:
# changing dtype: astype
s2.astype("str")
print(s2)

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [4]:
# shape, size, dtype, index, values
print(s1.shape,s2.shape)
print(s1.size,s2.size)
print(s1.dtype,s2.dtype)
print(s1.index,s2.index)
print(s1.values,s2.values)
print(type(s2.values))

(3,) (4,)
3 4
int64 float64
RangeIndex(start=0, stop=3, step=1) Index(['a', 'b', 'c', 'd'], dtype='object')
[1 2 3] [ 1.  2.  3. nan]
<class 'numpy.ndarray'>


In [5]:
# aggregate operations: min, max, unique, sum etc.
print(s1.min(),s2.min())
print(s1.max(),s2.max())
print(s1.unique(),s2.unique())
print(s1.sum(),s2.sum())

1 1.0
3 3.0
[1 2 3] [ 1.  2.  3. nan]
6 6.0


In [6]:
print(s1+s2)
print(s1-s2)
print(s1*s2)

0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64
0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64
0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64


In [7]:
# arithmetic operations(new copy) and broadcast like numpy
print(s1.copy()*2)

0    2
1    4
2    6
dtype: int64


In [8]:
# indexing and slicing
print(s1)

print()
print(s1[1:2])

print()
print(s1[[1,2]])

print()
sc=s1%3==0
print(s1[sc])

0    1
1    2
2    3
dtype: int64

1    2
dtype: int64

1    2
2    3
dtype: int64

2    3
dtype: int64


In [9]:
# result of operations as index
sc=s1.copy()
sc[0:]=100
print(sc)
sc[0:1]=200
print(sc)

0    100
1    100
2    100
dtype: int64
0    200
1    100
2    100
dtype: int64


In [10]:
# frequency : value_counts
print(s1.value_counts())

3    1
2    1
1    1
dtype: int64


## DataFrame

In [45]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"])
print(df1)
print()
print(df2)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [46]:
# shape, size, dtype, index, columns, unique, T
print(df1.shape,df2.shape)
print(df1.size,df2.size)
print(df1.dtypes,df2.dtypes)
print(df1.index,df2.index)
print(df1.columns,df2.columns)
print(df1.Age.unique())
print(df2.T)

(10, 3) (4, 2)
30 8
Name       object
Age       float64
Gender    float64
dtype: object Name      object
Number     int64
dtype: object
RangeIndex(start=0, stop=10, step=1) Int64Index([0, 11, 2, 3], dtype='int64')
Index(['Name', 'Age', 'Gender'], dtype='object') Index(['Name', 'Number'], dtype='object')
[ 0.  1.  2.  3. nan]
            0         11       2          3 
Name    Gaurav  Abhiskek  Krishna  Abhishek2
Number       1         2        3          4


In [62]:
# Rename index and columns on df2.
# Inplace vs normal operation
df2.rename(columns={"Name":'Employee',"Number":'EmpId'},inplace=True)
df2.rename(index={0:'a', 11:'b', 2:'c', 3:'d'},inplace=True)
df2

Unnamed: 0,Employee,EmpId
a,Gaurav,1
b,Abhiskek,2
c,Krishna,3
d,Abhishek2,4


In [75]:
# Indexing columns directly and via index.
print(df2["Employee"])
print(df2.Employee)

a       Gaurav
b     Abhiskek
c      Krishna
d    Abhishek2
Name: Employee, dtype: object
a       Gaurav
b     Abhiskek
c      Krishna
d    Abhishek2
Name: Employee, dtype: object


In [76]:
# Series in a DataFrame
print(type(df2.Employee))

<class 'pandas.core.series.Series'>


In [81]:
# describe, top/bottom rows of data
print(df1.describe())

print()
print(df1.head())

print()
print(df1.tail())

            Age    Gender
count  9.000000  9.000000
mean   1.555556  0.333333
std    0.881917  0.500000
min    0.000000  0.000000
25%    1.000000  0.000000
50%    2.000000  0.000000
75%    2.000000  1.000000
max    3.000000  1.000000

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0

     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0


In [106]:
# indexing and slicing
# loc, iloc [rows, cols]
print(df1.loc[:,"Age"])
print()
print(df2.loc['b':'b'])
print()
print(df1.iloc[3:,[0,2]])
print()
print(df2.iloc[:1,[0,1]])

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    NaN
9    2.0
Name: Age, dtype: float64

   Employee  EmpId
b  Abhiskek      2

        Name  Gender
3  Abhishek2     0.0
4   Harshita     1.0
5       Joey     1.0
6     Shweta     1.0
7       na-1     NaN
8       na-2     0.0
9        NaN     0.0

  Employee  EmpId
a   Gaurav      1


In [18]:
print(df1)
print(df1.loc[1:1])
df1["NameLen"]=df1.Name.str.len()
print(dir(df1.reindex))
df1.reindex(columns=["Name","Age"])
print(df1)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0
        Name  Age  Gender
1  Abhishek1  1.0     0.0
['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__func__', '__ge__', '__get__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__self__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__wrapped__']
        Name  Age  Gender  NameLen
0     Gaurav  0.0     0.0      6.0
1  Abhishek1  1.0     0.0      9.0
2    Krishna  2.0     0.0      7.0
3  Abhishek2  2.0     0.0      9.0
4   Harshita  1.0     1.0      8.0
5       Joey  2.0     1.0      4.0
6     Shweta  3.0     1.0      6.0
7       na-1  1.

In [19]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
print(df1)
print()
print(df1.NameLen.max())
print(df1.NameLen.min())
print(df1.count())

        Name  Age  Gender  NameLen
0     Gaurav  0.0     0.0      6.0
1  Abhishek1  1.0     0.0      9.0
2    Krishna  2.0     0.0      7.0
3  Abhishek2  2.0     0.0      9.0
4   Harshita  1.0     1.0      8.0
5       Joey  2.0     1.0      4.0
6     Shweta  3.0     1.0      6.0
7       na-1  1.0     NaN      4.0
8       na-2  NaN     0.0      4.0
9        NaN  2.0     0.0      NaN

9.0
4.0
Name       9
Age        9
Gender     9
NameLen    9
dtype: int64


In [107]:
# arithmetic operations
df2.EmpId*2

a    2
b    4
c    6
d    8
Name: EmpId, dtype: int64

In [121]:
# Na Values: isna, fillna, dropna
print(df1.Name.isna())
print()
print(df1.Name.fillna(0))
print()
print(df1.dropna(thresh=2))

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9     True
Name: Name, dtype: bool

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9            0
Name: Name, dtype: object

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [127]:
# any, all
print(df1.all())
print()
print(df1.any())

Name       True
Age       False
Gender    False
dtype: bool

Name      True
Age       True
Gender    True
dtype: bool


In [132]:
# ordering data, sort_values
print(df1.sort_values(by=["Age","Gender"]))
print(df1.sort_index(ascending=False))

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
4   Harshita  1.0     1.0
7       na-1  1.0     NaN
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
9        NaN  2.0     0.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
8       na-2  NaN     0.0
        Name  Age  Gender
9        NaN  2.0     0.0
8       na-2  NaN     0.0
7       na-1  1.0     NaN
6     Shweta  3.0     1.0
5       Joey  2.0     1.0
4   Harshita  1.0     1.0
3  Abhishek2  2.0     0.0
2    Krishna  2.0     0.0
1  Abhishek1  1.0     0.0
0     Gaurav  0.0     0.0


In [138]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))
print(tmp)
print(tmp.apply(np.sum))
print()
print(tmp.applymap(lambda x:x**2))

    0   1   2   3
0  17  13  13  15
1  16  17  19  18
2  14  16  14  17
3  10  14  18  19
4  11  19  16  14
0    68
1    79
2    80
3    83
dtype: int64

     0    1    2    3
0  289  169  169  225
1  256  289  361  324
2  196  256  196  289
3  100  196  324  361
4  121  361  256  196


In [144]:
# str submodule (replace etc)
print(df2)
print()
print(df2.Employee.str.replace("a",'='))
print()
print(df2.Employee.replace("a",'-'))

    Employee  EmpId
a     Gaurav      1
b   Abhiskek      2
c    Krishna      3
d  Abhishek2      4

a       G=ur=v
b     Abhiskek
c      Krishn=
d    Abhishek2
Name: Employee, dtype: object

a       Gaurav
b     Abhiskek
c      Krishna
d    Abhishek2
Name: Employee, dtype: object


In [150]:
# load and save data
tmp.to_csv('csv_files/tmp.csv')
t=pd.read_csv('csv_files/tmp.csv',index_col=0)
print(t)

    0   1   2   3
0  17  13  13  15
1  16  17  19  18
2  14  16  14  17
3  10  14  18  19
4  11  19  16  14


In [155]:
# groups: groupby
df1.groupby(df1.Gender).count()

Unnamed: 0_level_0,Name,Age
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5,5
1.0,3,3


In [158]:
# groups and aggregates
df1.groupby("Gender").groups

{0.0: Int64Index([0, 1, 2, 3, 8, 9], dtype='int64'),
 1.0: Int64Index([4, 5, 6], dtype='int64')}