In [1]:
import numpy as np
import pandas as pd

# Series

In [2]:
# Five sample values (10, 20, ..., 50) and one single-letter label per value.
myList = list(range(10, 60, 10))
myLabels = list('abcde')

In [3]:
# No index is passed, so the Series gets the default integer index 0..4.
s1 = pd.Series(myList)
s1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
s1[0]

10

In [5]:
# Same values, but each one is labelled with the matching string from myLabels.
s2 = pd.Series(data=myList, index=myLabels)
s2

a    10
b    20
c    30
d    40
e    50
dtype: int64

In [6]:
# s2 is indexed by string labels, so a positional lookup must go through .iloc;
# a bare s2[0] relies on the deprecated integer-as-position fallback.
s2.iloc[0]

10

In [7]:
s2['a']

10

# DataFrame

In [8]:
# Seed the generator so that Restart & Run All always rebuilds the same table.
# NOTE: the outputs recorded in this notebook came from an unseeded run, so the
# displayed numbers will change once when the notebook is re-executed.
rng = np.random.default_rng(42)

# 4 x 5 table of random integers in [1, 100] (upper bound 101 is exclusive).
df = pd.DataFrame(
    data=rng.integers(1, 101, size=(4, 5)),
    index=['A', 'B', 'C', 'D'],
    columns=['V', 'W', 'X', 'Y', 'Z'],
)

df

Unnamed: 0,V,W,X,Y,Z
A,95,34,48,58,16
B,51,22,55,75,42
C,13,54,87,19,64
D,29,21,69,34,45


In [9]:
df['W']

A    34
B    22
C    54
D    21
Name: W, dtype: int64

In [10]:
# One .loc lookup (row label, column label) instead of chained indexing.
df.loc['C', 'W']

54

In [11]:
df[['W', 'Z']]

Unnamed: 0,W,Z
A,34,16
B,22,42
C,54,64
D,21,45


In [12]:
# .loc selects by index label: this returns the row labelled 'B' as a Series.
df.loc['B']

V    51
W    22
X    55
Y    75
Z    42
Name: B, dtype: int64

In [13]:
# .iloc selects by integer position: position 0 is the first row.
df.iloc[0]

V    95
W    34
X    48
Y    58
Z    16
Name: A, dtype: int64

In [14]:
df['New'] = [10, 10, 10, 10]

In [15]:
df

Unnamed: 0,V,W,X,Y,Z,New
A,95,34,48,58,16,10
B,51,22,55,75,42,10
C,13,54,87,19,64,10
D,29,21,69,34,45,10


In [16]:
df['New'] = [100, 100, 100, 100]

In [17]:
df

Unnamed: 0,V,W,X,Y,Z,New
A,95,34,48,58,16,100
B,51,22,55,75,42,100
C,13,54,87,19,64,100
D,29,21,69,34,45,100


In [18]:
# Without inplace=True, drop returns a new frame and leaves df itself unchanged.
df.drop('New', axis=1)

Unnamed: 0,V,W,X,Y,Z
A,95,34,48,58,16
B,51,22,55,75,42
C,13,54,87,19,64
D,29,21,69,34,45


In [19]:
df

Unnamed: 0,V,W,X,Y,Z,New
A,95,34,48,58,16,100
B,51,22,55,75,42,100
C,13,54,87,19,64,100
D,29,21,69,34,45,100


In [20]:
# Rebind the name instead of mutating in place: df ends up without the 'New'
# column exactly as before, but without relying on inplace=True.
df = df.drop(columns='New')

In [21]:
df

Unnamed: 0,V,W,X,Y,Z
A,95,34,48,58,16
B,51,22,55,75,42
C,13,54,87,19,64
D,29,21,69,34,45


In [22]:
df.shape

(4, 5)

# Conditional Selection

In [23]:
df

Unnamed: 0,V,W,X,Y,Z
A,95,34,48,58,16
B,51,22,55,75,42
C,13,54,87,19,64
D,29,21,69,34,45


In [30]:
df['Z'] % 2 == 0

A     True
B     True
C     True
D    False
Name: Z, dtype: bool

In [33]:
# The boolean Series acts as a row mask: only rows whose Z value is even are kept.
df[df['Z'] % 2 == 0]

Unnamed: 0,V,W,X,Y,Z
A,95,34,48,58,16
B,51,22,55,75,42
C,13,54,87,19,64


# Missing Values

In [34]:
# Column name -> values; np.nan marks the missing entries in 'A' and 'B'.
d = dict(
    A=[1, 2, np.nan],
    B=[5, np.nan, np.nan],
    C=[1, 2, 3],
)

In [35]:
df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [36]:
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [37]:
df.isnull().sum()

A    1
B    2
C    0
dtype: int64

In [38]:
# With default arguments, dropna() removes every row that contains at least one NaN.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [40]:
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,0.0,2
2,0.0,0.0,3


In [41]:
# df.mean() gives one mean per column; each NaN is replaced by its own column's mean.
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,5.0,2
2,1.5,5.0,3


In [42]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [46]:
# thresh=2 keeps only the rows that have at least two non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


# Grouping

In [48]:
# One (company, employee, sales) tuple per row; column names are given separately.
df = pd.DataFrame(
    [
        ('GOOGLE', 'Alex', 450),
        ('FB', 'Michael', 500),
        ('MSFT', 'Smith', 120),
        ('FB', 'Pam', 1000),
        ('GOOGLE', 'Eric', 50),
        ('GOOGLE', 'George', 300),
    ],
    columns=['Company', 'Employee', 'Sales'],
)

df

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Alex,450
1,FB,Michael,500
2,MSFT,Smith,120
3,FB,Pam,1000
4,GOOGLE,Eric,50
5,GOOGLE,George,300


In [49]:
df['Sales'].mean()

403.3333333333333

In [50]:
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7faf2d46afa0>

In [51]:
# Restrict the aggregation to numeric columns: the string column 'Employee'
# cannot be averaged and makes a bare .mean() raise TypeError on pandas >= 2.0.
df.groupby('Company').mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,750.0
GOOGLE,266.666667
MSFT,120.0


In [52]:
df.groupby('Company').min()

Unnamed: 0_level_0,Employee,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Michael,500
GOOGLE,Alex,50
MSFT,Smith,120


In [53]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,750.0,353.553391,500.0,625.0,750.0,875.0,1000.0
GOOGLE,3.0,266.666667,202.072594,50.0,175.0,300.0,375.0,450.0
MSFT,1.0,120.0,,120.0,120.0,120.0,120.0,120.0


# Unique Values

In [54]:
df

Unnamed: 0,Company,Employee,Sales
0,GOOGLE,Alex,450
1,FB,Michael,500
2,MSFT,Smith,120
3,FB,Pam,1000
4,GOOGLE,Eric,50
5,GOOGLE,George,300


In [55]:
df['Company'].unique()

array(['GOOGLE', 'FB', 'MSFT'], dtype=object)

In [56]:
df['Company'].nunique()

3

In [57]:
df['Company'].value_counts()

GOOGLE    3
FB        2
MSFT      1
Name: Company, dtype: int64

# Custom Functions

In [58]:
def updateSales(sale, threshold=500, bonus=100):
    """Return ``sale`` raised by ``bonus`` when it is below ``threshold``.

    Parameters
    ----------
    sale : number
        A single sales figure (the function works on one value at a time).
    threshold : number, default 500
        Sales strictly below this value receive the bonus.
    bonus : number, default 100
        Amount added to a qualifying sale.

    Returns
    -------
    number
        ``sale + bonus`` if ``sale < threshold``, otherwise ``sale`` unchanged.
    """
    # Strict comparison: a sale exactly at the threshold is left untouched.
    return sale + bonus if sale < threshold else sale

In [59]:
df['Sales'].apply(updateSales)

0     550
1     500
2     220
3    1000
4     150
5     400
Name: Sales, dtype: int64

In [61]:
# Element-wise lambda shown for illustration; for plain arithmetic like this,
# the vectorized form df['Sales'] + 100 gives the same values.
df['Sales'].apply(lambda s : s + 100)

0     550
1     600
2     220
3    1100
4     150
5     400
Name: Sales, dtype: int64