In [1]:
# all imports go at the top, even though python is a very open language
import pandas as pd
import numpy as np

In [2]:
print(dir(pd))

['Categorical', 'CategoricalDtype', 'CategoricalIndex', 'DataFrame', 'DateOffset', 'DatetimeIndex', 'DatetimeTZDtype', 'ExcelFile', 'ExcelWriter', 'Float64Index', 'Grouper', 'HDFStore', 'Index', 'IndexSlice', 'Int16Dtype', 'Int32Dtype', 'Int64Dtype', 'Int64Index', 'Int8Dtype', 'Interval', 'IntervalDtype', 'IntervalIndex', 'MultiIndex', 'NaT', 'NamedAgg', 'Period', 'PeriodDtype', 'PeriodIndex', 'RangeIndex', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseDtype', 'SparseSeries', 'Timedelta', 'TimedeltaIndex', 'Timestamp', 'UInt16Dtype', 'UInt32Dtype', 'UInt64Dtype', 'UInt64Index', 'UInt8Dtype', '__builtins__', '__cached__', '__doc__', '__docformat__', '__file__', '__getattr__', '__git_version__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_config', '_hashtable', '_lib', '_libs', '_np_version_under1p14', '_np_version_under1p15', '_np_version_under1p16', '_np_version_under1p17', '_tslib', '_typing', '_version', 'api', 'array', 'arrays', 'bdate_rang

## Series

In [3]:
# Normal series and series with custom index
s1 = pd.Series([1,2,3])
s2 = pd.Series([1,2,3, np.nan], index=list("abcd"))
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [4]:
# changing dtype: astype
r = s1.astype('str')
print(s1)
print()
print(r)

0    1
1    2
2    3
dtype: int64

0    1
1    2
2    3
dtype: object


In [5]:
# shape, size, dtype, index, values
print(s1.shape)
print(s1.size)
print(s1.dtype)
print(s1.index)
print(s1.values)

print()
print(s2.index, type(s2.index))
print(s2.values)

(3,)
3
int64
RangeIndex(start=0, stop=3, step=1)
[1 2 3]

Index(['a', 'b', 'c', 'd'], dtype='object') <class 'pandas.core.indexes.base.Index'>
[ 1.  2.  3. nan]


In [6]:
# aggregate operations: min, max, unique, sum etc.
print(s2.min())
print(s2.max())
print(s2.unique())
print(s2.sum())

1.0
3.0
[ 1.  2.  3. nan]
6.0


In [7]:
# arithmetic operations(new copy) and broadcast like numpy
r = s1 * 10
print(r)

print()
print(s1)

# series and series
r = s1 * s2
print(r)

print()
s3  = pd.Series([10,20,30,40])
print(s3)
print(s1)
r = s1 * s3
print(r)

0    10
1    20
2    30
dtype: int64

0    1
1    2
2    3
dtype: int64
0   NaN
1   NaN
2   NaN
a   NaN
b   NaN
c   NaN
d   NaN
dtype: float64

0    10
1    20
2    30
3    40
dtype: int64
0    1
1    2
2    3
dtype: int64
0    10.0
1    40.0
2    90.0
3     NaN
dtype: float64


In [8]:
# indexing and slicing
print(s1[0])
print(s2['a'])

#  slice
print()
print(s1[0:2])
print(s2['a':'b'])

1
1.0

0    1
1    2
dtype: int64
a    1.0
b    2.0
dtype: float64


In [9]:

s4 = pd.Series([1,2,3, np.nan], index=[1,2,1,3])
print(s4)

print()
print(s4[1])

print()
print(s4.iloc[1])

1    1.0
2    2.0
1    3.0
3    NaN
dtype: float64

1    1.0
1    3.0
dtype: float64

2.0


In [10]:
# iloc: positional index based
print(s2)
print()
print(s2.iloc[0])
print()
print(s2.iloc[::-1])

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64

1.0

d    NaN
c    3.0
b    2.0
a    1.0
dtype: float64


In [11]:
# loc: normal indexing
print(s2)
print()
print(s2.loc["a"])
print()
print(s2.loc['a':'b'])

print()
print(s2.loc['a':'c'])

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64

1.0

a    1.0
b    2.0
dtype: float64

a    1.0
b    2.0
c    3.0
dtype: float64


In [12]:
# result of operations as index
s3[0] = 100
print(s3)

print(id(s3))
s3[:] = s3*-1
print(s3)
print(id(s3))

print()
print(id(s3))
s3 = s3*-1
print(s3)
print(id(s3))

0    100
1     20
2     30
3     40
dtype: int64
2402393341704
0   -100
1    -20
2    -30
3    -40
dtype: int64
2402393341704

2402393341704
0    100
1     20
2     30
3     40
dtype: int64
2402393714760


In [13]:
# frequency : value_counts
print(s2)

r = s2.value_counts()
print(r)

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64
3.0    1
2.0    1
1.0    1
dtype: int64


## DataFrame

In [14]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([["Gaurav",1], ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"]
)
print(df1)
print()
print(df2)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [15]:
# shape, size, dtypes, index, columns, T
print(df1.shape)

print()
print(df1.size)

print()
print(df1.dtypes)
print(type(df1.dtypes))

print()
print(df1.index)

print()
print(df1.columns)

print()
print(df1.T)

(10, 3)

30

Name       object
Age       float64
Gender    float64
dtype: object
<class 'pandas.core.series.Series'>

RangeIndex(start=0, stop=10, step=1)

Index(['Name', 'Age', 'Gender'], dtype='object')

             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


In [16]:
print(type(df1.Name))
print(df1.Name)
print(type(df1.Name.values))
print(df1.Name.values)
print(df1.Name.values.dtype)

<class 'pandas.core.series.Series'>
0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object
<class 'numpy.ndarray'>
['Gaurav' 'Abhishek1' 'Krishna' 'Abhishek2' 'Harshita' 'Joey' 'Shweta'
 'na-1' 'na-2' nan]
object


In [17]:
print(df2.Name)

0        Gaurav
11     Abhiskek
2       Krishna
3     Abhishek2
Name: Name, dtype: object


In [18]:
[1,2,3]

[1, 2, 3]

In [19]:
# Rename index and columns on df2.
# Inplace vs normal operation
r = df1.rename({"Age":"A", "Gender":"G"}, axis=1)
print(r)

print()
print(df1)

        Name    A    G
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [20]:
r = df1.rename(lambda s: s.upper(), axis=1)
print(r)

print()
print(df1)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [21]:
r = df1.rename(columns=lambda s: s.upper()) # axis not needed when using index or columns keyword args
print(r)

print()
print(df1)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [22]:
r = df1.rename(columns=lambda s: s.upper(), inplace=True) # axis not needed when using index or columns keyword args
print(r)

print()
print(df1)

None

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [23]:
# Indexing columns directly and via index. 
print(df1.NAME)

print()
print(df1["NAME"]) # used to index columns

print()
print(df1.AGE * 10)

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: NAME, dtype: object

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: NAME, dtype: object

0     0.0
1    10.0
2    20.0
3    20.0
4    10.0
5    20.0
6    30.0
7    10.0
8     NaN
9    20.0
Name: AGE, dtype: float64


In [24]:
# Series in a DataFrame
print(type(df1.NAME))
print(df1.NAME)
print(df1.NAME.unique())

<class 'pandas.core.series.Series'>
0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: NAME, dtype: object
['Gaurav' 'Abhishek1' 'Krishna' 'Abhishek2' 'Harshita' 'Joey' 'Shweta'
 'na-1' 'na-2' nan]


In [25]:
# describe, top/bottom rows of data
print(df1.head(6))

print()
print(df1.tail())

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0

     NAME  AGE  GENDER
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0


In [26]:
# indexing and slicing
# loc, iloc [rows, cols]

In [27]:
print(df1)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [28]:
# loc: [row , col]
print(df1.loc[0]) # 

print()
print(df1.loc[:, "NAME"])

print()
print(df1.loc[3::2, "NAME":"AGE"])

NAME      Gaurav
AGE            0
GENDER         0
Name: 0, dtype: object

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: NAME, dtype: object

        NAME  AGE
3  Abhishek2  2.0
5       Joey  2.0
7       na-1  1.0
9        NaN  2.0


In [29]:
# iloc
print(df1.iloc[0]) # 

print()
print(df1.iloc[:, 0])

print()
print(df1.iloc[3::2, 0:2])

NAME      Gaurav
AGE            0
GENDER         0
Name: 0, dtype: object

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: NAME, dtype: object

        NAME  AGE
3  Abhishek2  2.0
5       Joey  2.0
7       na-1  1.0
9        NaN  2.0


In [30]:
print(df1)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [31]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, mean, sum, ...
print(df1.sum())

print()
print(df1.sum(axis=0))

print()
print(df1.sum(axis=1))

AGE       14.0
GENDER     3.0
dtype: float64

AGE       14.0
GENDER     3.0
dtype: float64

0    0.0
1    1.0
2    2.0
3    2.0
4    2.0
5    3.0
6    4.0
7    1.0
8    0.0
9    2.0
dtype: float64


In [32]:
print(df1.min(axis=1))


0    0.0
1    0.0
2    0.0
3    0.0
4    1.0
5    1.0
6    1.0
7    1.0
8    0.0
9    0.0
dtype: float64


In [33]:
# arithmetic operations
print(df1 * 2)

print()
#print(df1 + 2)

print()
print(df1.AGE + 2)

                 NAME  AGE  GENDER
0        GauravGaurav  0.0     0.0
1  Abhishek1Abhishek1  2.0     0.0
2      KrishnaKrishna  4.0     0.0
3  Abhishek2Abhishek2  4.0     0.0
4    HarshitaHarshita  2.0     2.0
5            JoeyJoey  4.0     2.0
6        ShwetaShweta  6.0     2.0
7            na-1na-1  2.0     NaN
8            na-2na-2  NaN     0.0
9                 NaN  4.0     0.0


0    2.0
1    3.0
2    4.0
3    4.0
4    3.0
5    4.0
6    5.0
7    3.0
8    NaN
9    4.0
Name: AGE, dtype: float64


In [34]:
# Na Values: isna, fillna, dropna
print(df1.isna())

    NAME    AGE  GENDER
0  False  False   False
1  False  False   False
2  False  False   False
3  False  False   False
4  False  False   False
5  False  False   False
6  False  False   False
7  False  False    True
8  False   True   False
9   True  False   False


In [35]:
# any, all
print(df1.isna())

print()
print(df1.isna().any())

print()
print(df1.isna().all())

print()
print(df1.isna().any(axis=1))

    NAME    AGE  GENDER
0  False  False   False
1  False  False   False
2  False  False   False
3  False  False   False
4  False  False   False
5  False  False   False
6  False  False   False
7  False  False    True
8  False   True   False
9   True  False   False

NAME      True
AGE       True
GENDER    True
dtype: bool

NAME      False
AGE       False
GENDER    False
dtype: bool

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
9     True
dtype: bool


In [36]:
print(df1)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [37]:
#  fillna
r =  df1.fillna(method='ffill')
print(r)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     1.0
8       na-2  1.0     0.0
9       na-2  2.0     0.0


In [38]:
r =  df1.fillna(method='bfill')
print(r)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     0.0
8       na-2  2.0     0.0
9        NaN  2.0     0.0


In [39]:
# fill with values
r =  df1.fillna("abcd")
print(r)

print()
print(r.dtypes)

        NAME   AGE GENDER
0     Gaurav     0      0
1  Abhishek1     1      0
2    Krishna     2      0
3  Abhishek2     2      0
4   Harshita     1      1
5       Joey     2      1
6     Shweta     3      1
7       na-1     1   abcd
8       na-2  abcd      0
9       abcd     2      0

NAME      object
AGE       object
GENDER    object
dtype: object


In [40]:
r =  df1.fillna({"NAME":"abcd", "AGE":df1.AGE.max()})
print(r)

print()
print(r.dtypes)

        NAME  AGE  GENDER
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  3.0     0.0
9       abcd  2.0     0.0

NAME       object
AGE       float64
GENDER    float64
dtype: object


In [41]:
df1.dropna() # default order is row wise

Unnamed: 0,NAME,AGE,GENDER
0,Gaurav,0.0,0.0
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0


In [42]:
df1.dropna(axis=1)

0
1
2
3
4
5
6
7
8
9


In [43]:

df1.AGE[0] = np.nan
print(df1)

        NAME  AGE  GENDER
0     Gaurav  NaN     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [44]:
df1.dropna(thresh=9, axis=1) # drop columns which don't have atleast 9 values

Unnamed: 0,NAME,GENDER
0,Gaurav,0.0
1,Abhishek1,0.0
2,Krishna,0.0
3,Abhishek2,0.0
4,Harshita,1.0
5,Joey,1.0
6,Shweta,1.0
7,na-1,
8,na-2,0.0
9,,0.0


In [45]:
df1.dropna(thresh=8, axis=1) # drop columns which don't have atleast 9 values

Unnamed: 0,NAME,AGE,GENDER
0,Gaurav,,0.0
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0
7,na-1,1.0,
8,na-2,,0.0
9,,2.0,0.0


In [46]:
# ordering data, sort_values
df1.sort_values(by="AGE")

Unnamed: 0,NAME,AGE,GENDER
1,Abhishek1,1.0,0.0
4,Harshita,1.0,1.0
7,na-1,1.0,
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
5,Joey,2.0,1.0
9,,2.0,0.0
6,Shweta,3.0,1.0
0,Gaurav,,0.0
8,na-2,,0.0


In [47]:

df1.sort_values(by=["AGE", "NAME"])

Unnamed: 0,NAME,AGE,GENDER
1,Abhishek1,1.0,0.0
4,Harshita,1.0,1.0
7,na-1,1.0,
3,Abhishek2,2.0,0.0
5,Joey,2.0,1.0
2,Krishna,2.0,0.0
9,,2.0,0.0
6,Shweta,3.0,1.0
0,Gaurav,,0.0
8,na-2,,0.0


In [48]:
print(df2)

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [49]:
df2.sort_index()

Unnamed: 0,Name,Number
0,Gaurav,1
2,Krishna,3
3,Abhishek2,4
11,Abhiskek,2


In [50]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))

In [51]:
# str submodule (replace etc)
df1.NAME.str.upper()

0       GAURAV
1    ABHISHEK1
2      KRISHNA
3    ABHISHEK2
4     HARSHITA
5         JOEY
6       SHWETA
7         NA-1
8         NA-2
9          NaN
Name: NAME, dtype: object

In [52]:
df1.NAME.str.len()

0    6.0
1    9.0
2    7.0
3    9.0
4    8.0
5    4.0
6    6.0
7    4.0
8    4.0
9    NaN
Name: NAME, dtype: float64

In [53]:
print(dir(df1.NAME.str))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__frozen', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_doc_args', '_freeze', '_get_series_list', '_inferred_dtype', '_is_categorical', '_make_accessor', '_orig', '_parent', '_validate', '_wrap_result', 'capitalize', 'casefold', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace',

In [54]:
df1.AGE.str.upper()

AttributeError: Can only use .str accessor with string values!

In [None]:
# load and save data
df = pd.read_csv(r'https://raw.githubusercontent.com/leangaurav/dsc_weekday_2020_09_24/master/CODE/titanic_dataset.csv', index_col=0)
print(df.head())



In [None]:
print(dir(df))
df.to_csv('titanic_copy.csv')

In [61]:
df1

Unnamed: 0,NAME,AGE,GENDER
0,Gaurav,,0.0
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0
7,na-1,1.0,
8,na-2,,0.0
9,,2.0,0.0


In [68]:
r = df1.apply(lambda x: x.isna())  # x will be a series. x.isna() generates a series again
r

Unnamed: 0,NAME,AGE,GENDER
0,False,True,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,True
8,False,True,False
9,True,False,False


In [69]:
# apply , applymap
r = df1.apply(lambda x: x.count())
r

NAME      9
AGE       8
GENDER    9
dtype: int64

In [70]:
r = df1.apply(lambda x: x.count(), axis=1)
r

0    2
1    3
2    3
3    3
4    3
5    3
6    3
7    2
8    2
9    2
dtype: int64

In [72]:
r = df1.applymap(lambda x: type(x))
r

Unnamed: 0,NAME,AGE,GENDER
0,<class 'str'>,<class 'float'>,<class 'float'>
1,<class 'str'>,<class 'float'>,<class 'float'>
2,<class 'str'>,<class 'float'>,<class 'float'>
3,<class 'str'>,<class 'float'>,<class 'float'>
4,<class 'str'>,<class 'float'>,<class 'float'>
5,<class 'str'>,<class 'float'>,<class 'float'>
6,<class 'str'>,<class 'float'>,<class 'float'>
7,<class 'str'>,<class 'float'>,<class 'float'>
8,<class 'str'>,<class 'float'>,<class 'float'>
9,<class 'float'>,<class 'float'>,<class 'float'>


In [73]:
r = df1.apply(lambda x: type(x))
r

NAME      <class 'pandas.core.series.Series'>
AGE       <class 'pandas.core.series.Series'>
GENDER    <class 'pandas.core.series.Series'>
dtype: object

In [None]:
# groups: groupby

In [None]:
# groups and aggregates