In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Five sample integers; the same list is reused for the labelled Series in the next cell.
arr = [1, 2, 4, 5, 8]
# No index is passed, so the Series gets the default integer labels 0..4.
pd.Series(data=arr)

0    1
1    2
2    4
3    5
4    8
dtype: int64

In [3]:
# Same values as before, now labelled with country names instead of 0..4.
pd.Series(
    data=arr,
    index=['USA', 'IRAN', 'ITALY', 'UAE', 'CHINA'],
)

USA      1
IRAN     2
ITALY    4
UAE      5
CHINA    8
dtype: int64

In [4]:
# Demo frame: 5 rows labelled a-e, 4 country columns, standard-normal values.
# A seeded generator replaces the unseeded np.random.randn call so that
# "Restart & Run All" regenerates the same numbers on every run
# (re-execute the notebook to refresh the stored outputs).
df = pd.DataFrame(
    np.random.default_rng(0).standard_normal((5, 4)),
    index='a b c d e'.split(),
    columns=['USA', 'IRAN', 'UAE', 'GERMANI'],
)

In [5]:
# Derived column: element-wise sum of the USA and IRAN columns.
df['NEW_COUNTRY'] = df['USA'].add(df['IRAN'])

In [6]:
# Remove the temporary column again. Rebinding `df` to the returned frame
# avoids inplace=True, which has no performance benefit and prevents chaining.
df = df.drop(columns='NEW_COUNTRY')

In [7]:
# Show the frame: NEW_COUNTRY has been dropped, so only the four country columns remain.
df

Unnamed: 0,USA,IRAN,UAE,GERMANI
a,-0.095479,1.768812,1.421843,1.097421
b,1.050449,0.690077,-0.253472,-0.244112
c,-0.769696,-1.133113,1.081495,1.058086
d,2.405741,0.773237,1.440197,-0.314453
e,1.167581,-1.358269,-0.771919,-0.835356


In [8]:
# Label-based row lookup: the row whose index label is 'd', across all columns.
df.loc['d', :]

USA        2.405741
IRAN       0.773237
UAE        1.440197
GERMANI   -0.314453
Name: d, dtype: float64

In [9]:
# Position-based row lookup: position 3 is the same row as label 'd'.
df.iloc[3, :]

USA        2.405741
IRAN       0.773237
UAE        1.440197
GERMANI   -0.314453
Name: d, dtype: float64

In [10]:
# Keep the frame's shape; cells that are not strictly positive become NaN.
df.where(df > 0)

Unnamed: 0,USA,IRAN,UAE,GERMANI
a,,1.768812,1.421843,1.097421
b,1.050449,0.690077,,
c,,,1.081495,1.058086
d,2.405741,0.773237,1.440197,
e,1.167581,,,


In [11]:
# Rows where USA is positive AND UAE is below 1 (both conditions must hold).
df.loc[df['USA'].gt(0) & df['UAE'].lt(1)]

Unnamed: 0,USA,IRAN,UAE,GERMANI
b,1.050449,0.690077,-0.253472,-0.244112
e,1.167581,-1.358269,-0.771919,-0.835356


In [12]:
# Rows where USA is positive OR UAE is below 1 (either condition is enough).
df.loc[df['USA'].gt(0) | df['UAE'].lt(1)]

Unnamed: 0,USA,IRAN,UAE,GERMANI
b,1.050449,0.690077,-0.253472,-0.244112
d,2.405741,0.773237,1.440197,-0.314453
e,1.167581,-1.358269,-0.771919,-0.835356


In [13]:
# Move the a-e labels into a regular 'index' column and number the rows 0..4.
# The result is only displayed here, not assigned back to `df`.
df.reset_index(drop=False)

Unnamed: 0,index,USA,IRAN,UAE,GERMANI
0,a,-0.095479,1.768812,1.421843,1.097421
1,b,1.050449,0.690077,-0.253472,-0.244112
2,c,-0.769696,-1.133113,1.081495,1.058086
3,d,2.405741,0.773237,1.440197,-0.314453
4,e,1.167581,-1.358269,-0.771919,-0.835356


# Multi-level (hierarchical) index

In [14]:
# Outer level: three rows for each of the two groups.
outside = ['G1'] * 3 + ['G2'] * 3
# Inner level: numbers 1-3, repeated once per group.
inside = [1, 2, 3] * 2
# Pair the two levels position by position into (outer, inner) tuples.
hier_index = [(group, number) for group, number in zip(outside, inside)]

In [15]:
# At this point hier_index is still a plain list of (outer, inner) tuples.
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [16]:
# Convert the list of tuples into a pandas MultiIndex.
# Note: this rebinds hier_index from a list to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(tuples=hier_index)

In [17]:
# hier_index now holds the MultiIndex returned by pd.MultiIndex.from_tuples.
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [18]:
# Two-level demo frame: rows are indexed by hier_index, columns are A and B.
# A seeded generator replaces the unseeded np.random.randn call so reruns
# produce the same numbers (re-execute the notebook to refresh stored outputs).
# NOTE: this rebinds `df`; the country frame from the earlier cells is replaced.
df = pd.DataFrame(
    np.random.default_rng(1).standard_normal((6, 2)),
    index=hier_index,
    columns='A B'.split(),
)

In [19]:
# Display the two-level frame: outer labels G1/G2, inner labels 1-3, columns A and B.
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.417316,0.148861
G1,2,-0.849438,0.34564
G1,3,1.201594,0.154953
G2,1,-0.419977,0.570574
G2,2,0.536014,0.920767
G2,3,-0.327914,1.111206


In [20]:
# Name the two index levels so later lookups can refer to them by name
# (e.g. level='Numbers' in the xs call further down).
df.index.names = ['Groups' , 'Numbers']

In [21]:
# Two-step lookup: first take the 'G1' block of the outer level,
# then the row labelled 1 inside that block.
g1_rows = df.loc['G1']
g1_rows.loc[1]

A   -0.417316
B    0.148861
Name: 1, dtype: float64

In [22]:
# Cross-section: the rows whose 'Numbers' level equals 1, one row per group.
df.xs(key=1, level='Numbers')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.417316,0.148861
G2,-0.419977,0.570574


In [23]:
# Each inner list of arr2 becomes one COLUMN (A-D) of df2, not a row;
# np.nan marks the missing entries in columns A and B.
arr2 = [
    [1, 2, np.nan],
    [3, np.nan, np.nan],
    [3, 4, 5],
    [5, 6, 7],
]
df2 = pd.DataFrame(dict(zip('ABCD', arr2)))

In [24]:
# Replace the missing value in column A with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on
# df2['A']: that chained in-place call operates on an intermediate Series,
# raises a FutureWarning in pandas 2.x and leaves df2 unchanged under
# Copy-on-Write.
df2['A'] = df2['A'].fillna(df2['A'].mean())

In [25]:
# Inspect df2 after the fill: the NaN in column A is now the column mean (1.5);
# column B was not touched and still contains NaN.
df2

Unnamed: 0,A,B,C,D
0,1.0,3.0,3,5
1,2.0,,4,6
2,1.5,,5,7


# Group_by

In [26]:
# Toy sales table: two people per company, one sales figure per person.
data = dict(
    Company='GOOG GOOG MSFT MSFT FB FB'.split(),
    Person='Hossein Sami Fati Sama Ahmad Mali'.split(),
    Sales=[200, 120, 340, 124, 243, 360],
)
dff = pd.DataFrame(data=data)

In [27]:
# Display the six-row sales table (two people per company).
dff

Unnamed: 0,Company,Person,Sales
0,GOOG,Hossein,200
1,GOOG,Sami,120
2,MSFT,Fati,340
3,MSFT,Sama,124
4,FB,Ahmad,243
5,FB,Mali,360


In [28]:
# Standard deviation of Sales for FB.
# Selecting [['Sales']] before aggregating keeps the string column Person out
# of std(); pandas >= 2.0 raises on non-numeric columns instead of silently
# dropping them. The result is still a Series named 'FB' with a 'Sales' entry.
dff.groupby('Company')[['Sales']].std().loc['FB']

Sales    82.731493
Name: FB, dtype: float64

In [29]:
# Per-company maximum of every column, computed column by column.
# Caution: Person (alphabetical max) and Sales (numeric max) are independent,
# so a result row is not a real record - e.g. GOOG shows Sami with 200,
# although Sami's own sales figure is 120.
dff.groupby(by='Company').max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,Mali,360
GOOG,Sami,200
MSFT,Sama,340


In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) of Sales per company.
dff.groupby(by='Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,301.5,82.731493,243.0,272.25,301.5,330.75,360.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


# Operations

In [34]:
# Small frame for the operations demos: an integer column, an integer column
# with repeated values, and a column of three-letter strings.
op_df = pd.DataFrame(
    dict(
        col1=[1, 2, 3, 4, 5],
        col2=[444, 555, 666, 444, 555],
        col3='abc def ghi jkl mno'.split(),
    )
)

In [38]:
# Number of distinct values in col2 (missing values are not counted, the default).
op_df['col2'].nunique(dropna=True)

3

In [47]:
# How often each distinct col2 value occurs, most frequent first (the default order).
op_df['col2'].value_counts(sort=True, ascending=False)

444    2
555    2
666    1
Name: col2, dtype: int64

In [54]:
def times2(val):
    """Return ``val`` multiplied by 2."""
    doubled = val * 2
    return doubled
# Call times2 on every element of col1; the result is a new Series.
op_df['col1'].apply(func=times2)

0     2
1     4
2     6
3     8
4    10
Name: col1, dtype: int64

In [56]:
# Length of each string in col3, using the built-in len as the applied function.
op_df['col3'].apply(func=len)

0    3
1    3
2    3
3    3
4    3
Name: col3, dtype: int64

In [58]:
# Same doubling as times2, written as an inline anonymous function.
op_df['col1'].apply(func=lambda value: value * 2)

0     2
1     4
2     6
3     8
4    10
Name: col1, dtype: int64

In [60]:
# Rows ordered by col2, smallest first; the original row labels are kept.
op_df.sort_values('col2', ascending=True)

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,jkl
1,2,555,def
4,5,555,mno
2,3,666,ghi


In [61]:
# Boolean frame marking missing cells; op_df has none, so every entry is False.
# isna() is the same method as isnull() under its other name.
op_df.isna()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
