In [1]:
# all imports go at the top, even though python is a very open language
import pandas as pd
import numpy as np

## Series

In [39]:
# Normal series and series with custom index
s1 = pd.Series([1,2,3])
s2 = pd.Series([1,2,3, np.nan], index=list("abcd"))
print(s1)
print()
print(s2)

0    1
1    2
2    3
dtype: int64

a    1.0
b    2.0
c    3.0
d    NaN
dtype: float64


In [40]:
# changing dtype: astype
s1.astype('float64')

0    1.0
1    2.0
2    3.0
dtype: float64

In [45]:
s = pd.Series(["1", "$2", "3.5"])
ss = s.str.replace('$','')
ss.astype('float64')

0    1.0
1    2.0
2    3.5
dtype: float64

In [4]:
# shape, size, dtype, index, values

In [5]:
# aggregate operations: min, max, unique, sum etc.

In [6]:
# arithmetic operations(new copy) and broadcast like numpy

In [7]:
# indexing and slicing

In [8]:
# result of operations as index

In [9]:
# frequency : value_counts

## DataFrame

In [10]:
# dataframe from dict or list of rows
df1 = pd.DataFrame(
    {
        "Name":   ["Gaurav", "Abhishek1", "Krishna", "Abhishek2", "Harshita", "Joey", "Shweta", "na-1" , "na-2"  , np.nan],
        "Age":    [ 0      , 1          , 2        , 2          , 1         , 2     , 3       , 1      ,  np.nan , 2     ],
        "Gender": [ 0      , 0          , 0        , 0          , 1         , 1     , 1       , np.nan ,  0      , 0     ]
    }
)
df2 = pd.DataFrame([("Gaurav",1), ("Abhiskek",2), ("Krishna",3), ("Abhishek2",4)],
                    index=[0,11,2,3], 
                    columns=[10, 200])
print(df1)
print()
print(df2)
print(df2.columns)

        Name  Age  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0

          10   200
0      Gaurav    1
11   Abhiskek    2
2     Krishna    3
3   Abhishek2    4
Int64Index([10, 200], dtype='int64')


In [11]:
# shape, size, dtype, index, columns, unique, T
print(df1.shape)
print(df1.size)

print()
print(df1.dtypes, type(df1.dtypes))

print()
print(df1.index)

print()
print(df1.columns)
#print(df1.unique)

print()
print(df1.T)

(10, 3)
30

Name       object
Age       float64
Gender    float64
dtype: object <class 'pandas.core.series.Series'>

RangeIndex(start=0, stop=10, step=1)

Index(['Name', 'Age', 'Gender'], dtype='object')

             0          1        2          3         4     5       6     7  \
Name    Gaurav  Abhishek1  Krishna  Abhishek2  Harshita  Joey  Shweta  na-1   
Age          0          1        2          2         1     2       3     1   
Gender       0          0        0          0         1     1       1   NaN   

           8    9  
Name    na-2  NaN  
Age      NaN    2  
Gender     0    0  


In [15]:
# Rename index and columns on df2.
# Inplace vs normal operation
t = df1.rename({"Name":  "N", "Age": "A"}, axis = 1)
print(t.head(2))
print()
print(df1.head(2))
print();print()

t = df1.rename({"Name":  "N", "Age": "A"}, axis = 1, inplace=True)
print(t)
print()
print(df1)

t = df1.rename(lambda x: len(x), axis = 1) # col
print(t)

t = df1.rename(lambda x: x*10, axis =0) # row wise
print(t)


           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0


None

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0
           1    1    6
0     Gaurav  0.0  0.0
1  Abhishek1  1.0  0.0
2    Krishna  2.0  0.0
3  Abhishek2  2.0  0.0
4   Harshita  1.0  1.0
5       Joey  2.0  1.0
6     Shweta  3.0  1.0
7       na-1  1.0  NaN
8       na-2  NaN  0.0
9        NaN  2.0  0.0
            N    A  Gender
0      Gaurav  0.0     0.0
10  Abhishek1  1.0     0.0
20    Krishna  2.0     0.0
30  Abhishek2  2.0     0.0
40   Harshita  1.0     1.0
50       Joey  2.0     1.0
60     Shweta  3.0     1.0
70       na-1  1.0     NaN
80       na-2  NaN     0.0
90        NaN  2.0     0.0

In [16]:
# Indexing columns directly and via index. 
print(df1["Age"])

KeyError: 'Age'

In [14]:
# Series in a DataFrame
print(df1.Name)
print()
print(type(df1.Name))

AttributeError: 'DataFrame' object has no attribute 'Name'

In [17]:
# describe, top/bottom rows of data
print(df1.describe(include = 'all'))
print()
print(df1.head()) # top few rows
print()
print(df1.tail()) # top few rows


           N         A    Gender
count      9  9.000000  9.000000
unique     9       NaN       NaN
top     na-1       NaN       NaN
freq       1       NaN       NaN
mean     NaN  1.555556  0.333333
std      NaN  0.881917  0.500000
min      NaN  0.000000  0.000000
25%      NaN  1.000000  0.000000
50%      NaN  2.000000  0.000000
75%      NaN  2.000000  1.000000
max      NaN  3.000000  1.000000

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0

        N    A  Gender
5    Joey  2.0     1.0
6  Shweta  3.0     1.0
7    na-1  1.0     NaN
8    na-2  NaN     0.0
9     NaN  2.0     0.0


In [18]:
# indexing and slicing
# loc, iloc [rows, cols]
print(df1.loc[0]) # rows  0     Gaurav  0.0     0.0
print()
print(df1.loc[0:5]) # rows

N         Gaurav
A              0
Gender         0
Name: 0, dtype: object

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0


In [19]:
print(df1.loc[:     , "A"])
print()
print(df1.loc[:4,"A"])
print()
print(df1.loc[:4,"A"::2])

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    NaN
9    2.0
Name: A, dtype: float64

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
Name: A, dtype: float64

     A
0  0.0
1  1.0
2  2.0
3  2.0
4  1.0


In [20]:
print(df1.iloc[:,0])

print()
print(df1.iloc[3:8,1:])

0       Gaurav
1    Abhishek1
2      Krishna
3    Abhishek2
4     Harshita
5         Joey
6       Shweta
7         na-1
8         na-2
9          NaN
Name: N, dtype: object

     A  Gender
3  2.0     0.0
4  1.0     1.0
5  2.0     1.0
6  3.0     1.0
7  1.0     NaN


In [21]:
# aggregate operations (with and without axis : row-1, col-0)
# min, max, count, ...
print(df1.min())
print()
print(df1.max())
print()
print(df1.max(axis = 1))
print()
print(df1.count(axis = 1))

A         0.0
Gender    0.0
dtype: float64

A         3.0
Gender    1.0
dtype: float64

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    0.0
9    2.0
dtype: float64

0    3
1    3
2    3
3    3
4    3
5    3
6    3
7    2
8    2
9    2
dtype: int64


In [22]:
print(df1.A.max())

3.0


In [23]:
# arithmetic operations
print(df1*10)
# print(df1+10)

print(df1.A == 9)

                                                   N     A  Gender
0  GauravGauravGauravGauravGauravGauravGauravGaur...   0.0     0.0
1  Abhishek1Abhishek1Abhishek1Abhishek1Abhishek1A...  10.0     0.0
2  KrishnaKrishnaKrishnaKrishnaKrishnaKrishnaKris...  20.0     0.0
3  Abhishek2Abhishek2Abhishek2Abhishek2Abhishek2A...  20.0     0.0
4  HarshitaHarshitaHarshitaHarshitaHarshitaHarshi...  10.0    10.0
5           JoeyJoeyJoeyJoeyJoeyJoeyJoeyJoeyJoeyJoey  20.0    10.0
6  ShwetaShwetaShwetaShwetaShwetaShwetaShwetaShwe...  30.0    10.0
7           na-1na-1na-1na-1na-1na-1na-1na-1na-1na-1  10.0     NaN
8           na-2na-2na-2na-2na-2na-2na-2na-2na-2na-2   NaN     0.0
9                                                NaN  20.0     0.0
0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: A, dtype: bool


In [24]:
# Na Values: isna, fillna, dropna
print(df1.isna())
print()
print(df1.isna().sum())

       N      A  Gender
0  False  False   False
1  False  False   False
2  False  False   False
3  False  False   False
4  False  False   False
5  False  False   False
6  False  False   False
7  False  False    True
8  False   True   False
9   True  False   False

N         1
A         1
Gender    1
dtype: int64


In [25]:
# any, all
print(df1.isna().any())
print()
print(df1.isna().any( axis=1))

N         True
A         True
Gender    True
dtype: bool

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
9     True
dtype: bool


In [26]:
print(np.where(df1.N == "Harshita"))
print()
print(df1.get("A"))

(array([4], dtype=int64),)

0    0.0
1    1.0
2    2.0
3    2.0
4    1.0
5    2.0
6    3.0
7    1.0
8    NaN
9    2.0
Name: A, dtype: float64


In [27]:
# ordering data, sort_values
t = df1.sort_values(by=['A'])
print(t)

print()
t = df1.sort_values(by=['A', 'N'], ascending=False)
print(t)

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
4   Harshita  1.0     1.0
7       na-1  1.0     NaN
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
5       Joey  2.0     1.0
9        NaN  2.0     0.0
6     Shweta  3.0     1.0
8       na-2  NaN     0.0

           N    A  Gender
6     Shweta  3.0     1.0
2    Krishna  2.0     0.0
5       Joey  2.0     1.0
3  Abhishek2  2.0     0.0
9        NaN  2.0     0.0
7       na-1  1.0     NaN
4   Harshita  1.0     1.0
1  Abhishek1  1.0     0.0
0     Gaurav  0.0     0.0
8       na-2  NaN     0.0


In [34]:
print(df1)

           N    A  Gender
0     Gaurav  0.0     0.0
1  Abhishek1  1.0     0.0
2    Krishna  2.0     0.0
3  Abhishek2  2.0     0.0
4   Harshita  1.0     1.0
5       Joey  2.0     1.0
6     Shweta  3.0     1.0
7       na-1  1.0     NaN
8       na-2  NaN     0.0
9        NaN  2.0     0.0


In [37]:
df1.dropna(axis=1)
# drop column which has less than 7 non-Na values => drop na uses this way => df1.dropna(thresh=7, axis=1)
# drop column which has more than 3 Na

0
1
2
3
4
5
6
7
8
9


In [38]:
df1.dropna()

Unnamed: 0,N,A,Gender
1,Abhishek1,1.0,0.0
2,Krishna,2.0,0.0
3,Abhishek2,2.0,0.0
4,Harshita,1.0,1.0
5,Joey,2.0,1.0
6,Shweta,3.0,1.0


In [None]:
df1.dropna(thresh=)

In [28]:
# row / column wise operation: apply
tmp = pd.DataFrame(np.random.randint(10,20,20).reshape(5,4))

In [29]:
# str submodule (replace etc)

In [30]:
# load and save data
df1.to_csv('new_data.csv')

In [31]:
# groups: groupby

In [32]:
# groups and aggregates

In [33]:
a = [1,2,3,4]

a[0]b

print("wowo")
a[10] # exception
print("owow")

wowo


IndexError: list index out of range