In [1]:
# all imports go at the top, even though python is a very open language
import numpy as np
import pandas as pd

## Series

In [8]:
list('abcd')

['a', 'b', 'c', 'd']

In [2]:
# Normal series and series with custom index
s1 = pd.Series([1,2,3], dtype='int8')
s2 = pd.Series([1,2,3, np.nan], index=list("abcd"))
s3 = pd.Series([1,2,3,4,5], dtype='int8')
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int8

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [3]:
# changing dtype: astype
print(s1.astype('float64'))
print(s1.astype('str'))
print(s1.dtype)

0    1.0
1    2.0
2    3.0
dtype: float64
0    1
1    2
2    3
dtype: object
int8


In [4]:
# shape, size, dtype, index, values
print(s1.shape)
print(s1.size)
print(s1.dtype)
print(s1.index)
print(s1.values)

(3,)
3
int8
RangeIndex(start=0, stop=3, step=1)
[1 2 3]


In [6]:
print(s1, "\n", s2)


0    1
1    2
2    3
dtype: int8 
 a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [8]:
a1 = np.array([1,2,np.nan])

In [10]:
s1 = pd.Series([1,2,np.nan])

print(a1.sum())

s1.sum()

nan


3.0

In [7]:
# aggregate operations: min, max, unique, sum etc.
print(s1.min(), s2.min())
print(s1.max(), s2.max())
print(s1.unique(), s2.unique())
print(s1.sum(), s2.sum())
print(s1.any(), s2.any())
print(s1.all(), s2.all())

1 1.0
3 3.0
[1 2 3] [ 1.  2.  3. nan]
6 6.0
True True
True True


In [13]:
# arithmetic operations(new copy) and broadcast like numpy
print(s1*10)
print(s1)
print(s1-s3)

0    10
1    20
2    30
dtype: int8
0    1
1    2
2    3
dtype: int8
0    0.0
1    0.0
2    0.0
3    NaN
4    NaN
dtype: float64


In [14]:
# indexing and slicing
# indexing operations results in a scalar
# slicing gives a series
print(s1[1])
print(s2['a':])

# for slicing a integer index - last slice is not taken
# for slicing a object index - last slice is taken
print(s1[1:3])
print(s2['a':'c'])

2
a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64
1    2
2    3
dtype: int8
a    1.0
b    2.0
c    3.0
dtype: float64


In [15]:
# result of operations as index
#Can use numpy array, pandas series or just a list inside an index 
r = s2%2!=0
print(r)
s2[r]=999
print(s2)

a     True
b    False
c     True
d     True
dtype: bool
a    999.0
b      2.0
c    999.0
d    999.0
dtype: float64


In [11]:
# frequency : value_counts
print(s1)

print(s1.value_counts)

0    1.0
1    2.0
2    NaN
dtype: float64
<bound method IndexOpsMixin.value_counts of 0    1.0
1    2.0
2    NaN
dtype: float64>


## DataFrame

In [2]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=["Name", "Number"])
print(df1)
print()
print(df2)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

         Name  Number
0      Gaurav       1
11   Abhiskek       2
2     Krishna       3
3   Abhishek2       4


In [32]:
df1.rename(columns = dict(zip(df1.columns, ['N', 'A', 'G'])))

Unnamed: 0,N,A,G
0,Gaurav,0.0,0.0
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0
7,na-1,1.0,
8,na-2,,0.0
9,,2.0,0.0


In [13]:
# shape, size, dtype, index, columns, unique, T
print(df1.shape, df2.shape)
print(df1.size, df2.size)
print(df1.dtypes,df2.dtypes)
print(df1.index, df2.index)
print(df1.columns, df2.columns)
# print(df1.unique)
print(df1.T, df2.T)

(10, 3) (4, 2)
30 8
Name       object
Age       float64
Gender    float64
dtype: object Name      object
Number     int64
dtype: object
RangeIndex(start=0, stop=10, step=1) Int64Index([0, 11, 2, 3], dtype='int64')
Index(['Name', 'Age', 'Gender'], dtype='object') Index(['Name', 'Number'], dtype='object')
             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0               0         11       2          3 
Name    Gaurav  Abhiskek  Krishna  Abhishek2
Number       1         2        3          4


In [23]:
type(df2.index)

pandas.core.indexes.numeric.Int64Index

In [19]:
# Rename index and columns on df2.
# Inplace vs normal operation


In [34]:
# Indexing columns directly and via index. 
print(df1.Name)
df1['Name']

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object


0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: Name, dtype: object

In [36]:
# Series in a DataFrame

print(df1.Name.unique())
print(type(df1.Age))
print(df1.Age.value_counts())

['Gaurav' 'Abhishek1' 'Krishna' 'Abhishek2' 'Harshita' 'Joey' 'Shweta'
 'na-1' 'na-2' nan]
<class 'pandas.core.series.Series'>
2.0    4
1.0    3
3.0    1
0.0    1
Name: Age, dtype: int64


In [39]:
# describe, top/bottom rows of data

print(df1.describe())
print(df1.describe(include = 'all'))

print(df1.head())
print(df1.tail())

            Age    Gender
count  9.000000  9.000000
mean   1.555556  0.333333
std    0.881917  0.500000
min    0.000000  0.000000
25%    1.000000  0.000000
50%    2.000000  0.000000
75%    2.000000  1.000000
max    3.000000  1.000000
          Name       Age    Gender
count        9  9.000000  9.000000
unique       9       NaN       NaN
top     Gaurav       NaN       NaN
freq         1       NaN       NaN
mean       NaN  1.555556  0.333333
std        NaN  0.881917  0.500000
min        NaN  0.000000  0.000000
25%        NaN  1.000000  0.000000
50%        NaN  2.000000  0.000000
75%        NaN  2.000000  1.000000
max        NaN  3.000000  1.000000
        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
     Name  Age  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0


In [51]:
# indexing and slicing
# loc, iloc [rows, cols]

# Loc
# df1.loc[0:1, 'Name'::2]
# df1.iloc[1:, 2:]
df1.loc[0:7,'Name':'Age']
df2.loc[11:3,'Name':'Number']

Unnamed: 0,Name,Number
11,Abhiskek,2
2,Krishna,3
3,Abhishek2,4


In [165]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
# print(df1)
# print(df1.min(axis=1))
# df1.max()
# print(df1.count(axis=0))
df1['Name'][9]=np.nan
df1['Name'][9]
# print(df1['Name'].min())
df1.loc[0:9][0:1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,Name,Age,Gender
0,Gaurav,0.0,0.0


In [3]:
# arithmetic operations
# print(df1.dtypes)
print(df1*2) #this works fine because * operator works on both strings and numbers
print(df1-2) # this gives error 
df1[df1.loc[0:9,['Age','Gender']]>0]
df1.count(axis=1)

                 Name  Age  Gender
0        GauravGaurav  0.0     0.0
1  Abhishek1Abhishek1  2.0     0.0
2      KrishnaKrishna  4.0     0.0
3  Abhishek2Abhishek2  4.0     0.0
4    HarshitaHarshita  2.0     2.0
5            JoeyJoey  4.0     2.0
6        ShwetaShweta  6.0     2.0
7            na-1na-1  2.0     NaN
8            na-2na-2  NaN     0.0
9                 NaN  4.0     0.0


TypeError: unsupported operand type(s) for -: 'str' and 'int'

In [12]:
#isna testing

df1.isna()

df1.isna().any(axis=1)

df1.isna().sum(axis=1).sort_values(ascending=False).head()

9    1
8    1
7    1
6    0
5    0
dtype: int64

In [25]:
import numpy as np
arr = [[1]*6]*6

In [14]:
# Na Values: isna, fillna, dropna

df1.fillna(method='ffill')

# print(df1.isnull())
# print(df1.isnull().sum(axis=0))
# print(df1.mean(axis=1))
df2=df1.copy()
# df2.mean(axis=1)
# df2.fillna((df2.mean(axis=0)))
df2.dropna(axis=0)


Unnamed: 0,Name,Age,Gender
0,Gaurav,0.0,0.0
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0
7,na-1,1.0,1.0
8,na-2,1.0,0.0
9,na-2,2.0,0.0


In [27]:
# dropna
print(df1)
r = df1.dropna()
print(r)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0
        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0


In [28]:
# any, all

In [245]:
# ordering data, sort_values
# print(df1.sort_values(by=['Age','Name'], ascending=[True, False]))
print(df1.sort_values(by=['Age','Name'], ascending=[False,False]))
print(df1)

        Name  Age  Gender
6     Shweta  3.0     1.0
2    Krishna  2.0     0.0
5       Joey  2.0     1.0
3  Abhishek2  2.0     0.0
9        NaN  2.0     0.0
7       na-1  1.0     NaN
4   Harshita  1.0     1.0
1  Abhishek1  1.0     0.0
0     Gaurav  0.0     0.0
8       na-2  NaN     0.0
        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [249]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))
print(tmp)


    0   1   2   3
0  18  17  15  14
1  16  16  10  11
2  17  18  19  13
3  16  19  16  10
4  15  13  15  16
0    64
1    53
2    67
3    61
4    59
dtype: int64


In [253]:
print(tmp.apply(lambda x: x*x, axis=1))

     0    1    2    3
0  324  289  225  196
1  256  256  100  121
2  289  324  361  169
3  256  361  256  100
4  225  169  225  256


In [262]:
# str submodule (replace etc)
df1['Name'].str.lower()
print(dir(df1['Name'].str))

['__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__frozen', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_doc_args', '_freeze', '_get_series_list', '_inferred_dtype', '_is_categorical', '_is_string', '_make_accessor', '_orig', '_parent', '_validate', '_wrap_result', 'capitalize', 'casefold', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'r

In [None]:
# load and save data

In [269]:
# groups: groupby
grp = df1.groupby(by=['Age'])
grp.groups

{0.0: Int64Index([0], dtype='int64'),
 1.0: Int64Index([1, 4, 7], dtype='int64'),
 2.0: Int64Index([2, 3, 5, 9], dtype='int64'),
 3.0: Int64Index([6], dtype='int64')}

In [34]:
grp = df1.groupby(by=['Age','Gender'])#.agg=['sum', 'count']
grp.groups

{(0.0, 0.0): Int64Index([0], dtype='int64'),
 (1.0, 0.0): Int64Index([1], dtype='int64'),
 (1.0, 1.0): Int64Index([4], dtype='int64'),
 (1.0, nan): Int64Index([7], dtype='int64'),
 (2.0, 0.0): Int64Index([2, 3, 9], dtype='int64'),
 (2.0, 1.0): Int64Index([5], dtype='int64'),
 (3.0, 1.0): Int64Index([6], dtype='int64'),
 (nan, 0.0): Int64Index([8], dtype='int64')}

In [273]:
print(dir(grp))

['Age', 'Gender', 'Name', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_accessors', '_add_numeric_operations', '_agg_examples_doc', '_agg_see_also_doc', '_aggregate', '_aggregate_frame', '_aggregate_item_by_item', '_aggregate_multiple_funcs', '_apply_filter', '_apply_to_column_groupbys', '_apply_whitelist', '_assure_grouper', '_bool_agg', '_builtin_table', '_choose_path', '_concat_objects', '_constructor', '_cumcount_array', '_cython_agg_blocks', '_cython_agg_general', '_cython_table', '_cython_transform', '_define_paths', '_deprecations', '_dir_additions', '_dir_deletions', '_ensure_type', '_fill', '_get_cython_func', '

In [35]:
# groups and aggregates
dir(globals())

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [37]:
vars().keys()

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1', 'np', 'pd', '_i2', 'df1', 'df2', '_i3', '_i4', '_4', '_i5', '_5', '_i6', '_6', '_i7', '_7', '_i8', '_i9', '_9', '_i10', '_i11', '_11', '_i12', '_12', '_i13', '_13', '_i14', '_14', '_i15', 'a', '_i16', '_i17', '_17', '_i18', '_18', '_i19', 'item', '_i20', '_20', '_i21', '_i22', '_i23', 'arr', '_i24', 'i', 'j', '_i25', '_i26', '_26', '_i27', '_27', '_i28', '_i29', '_i30', '_i31', '_31', '_i32', '_i33', 'grp', '_i34', '_34', '_i35', '_35', '_i36', '_36', '_i37'])

In [38]:
print(df1)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0
