In [2]:
import pandas as pd
import numpy as np

# Series
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index

### From ndarray

In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
#if no index is passed then index will be created having values [0, ..., len(data) - 1].
s

a   -1.514287
b   -0.246546
c    0.310865
d    0.395407
e   -0.259977
dtype: float64

### From dict

In [5]:
d = {'b': 1, 'a': 0, 'c': 2}
pd.Series(d)    

b    1
a    0
c    2
dtype: int64

### From Scaler value

In [8]:
s = pd.Series(5, index = ['a', 'b', 'c'])
s

a    5
b    5
c    5
dtype: int64

## Series is like nd-array

In [4]:
s = pd.Series(np.random.randn(7), index=['a', 'b', 'c', 'd', 'e', 'f','g'])
s

a   -0.557856
b   -0.927423
c   -2.357303
d   -0.664084
e   -1.192442
f   -0.493418
g    0.585303
dtype: float64

In [5]:
s[0]

-0.5578556419837404

In [6]:
s[:3]

a   -0.557856
b   -0.927423
c   -2.357303
dtype: float64

In [7]:
s[s > s.median()]

a   -0.557856
f   -0.493418
g    0.585303
dtype: float64

In [8]:
s[[4, 3, 1]]

e   -1.192442
d   -0.664084
b   -0.927423
dtype: float64

### Name Attribute

In [11]:
s = pd.Series(np.random.randn(5), name='something')
s

0   -0.638622
1   -0.912505
2    0.458721
3   -1.050359
4    0.734116
Name: something, dtype: float64

# Dataframe
DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.

### From dict of Series or dicts

In [12]:
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [13]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


### From dict of ndarrays / lists

In [14]:
d = {'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.]}

pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [15]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


### From structured or record array

In [18]:
data = []
data[:] = [(1, 2., 'Hello'), (2, 3., "World")]

df = pd.DataFrame(data, index = ['First' , 'Second'], columns = ['C' , 'A', 'B'])
df

Unnamed: 0,C,A,B
First,1,2.0,Hello
Second,2,3.0,World


### From a list of dicts

In [19]:
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


### DataFrame.from_dict

In [1]:
k = [('A', [1, 2, 3]), ('B', [4, 5, 6])]
d = dict(k)
d

{'A': [1, 2, 3], 'B': [4, 5, 6]}

In [8]:
df = pd.DataFrame.from_dict(d)
df

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [5]:
#if we want A and B to be rows
df1 = pd.DataFrame.from_dict(d, orient='index')
df1

Unnamed: 0,0,1,2
A,1,2,3
B,4,5,6


### Working with dataframes
Columns can be deleted or popped like with a dict:

In [9]:
del df['A']
three = df.pop('B')
df

0
1
2


In [10]:
three

0    4
1    5
2    6
Name: B, dtype: int64

In [13]:
df['foo'] = 'bar'
df["one"] = [1,2,4]
df

Unnamed: 0,foo,one
0,bar,1
1,bar,2
2,bar,4


In [23]:
df['one_trunc'] = df['one'][:3]
df

Unnamed: 0,foo,one,one_trunc
0,bar,1,1
1,bar,2,2
2,bar,4,4


### Assign method
DataFrame has an assign() method that allows you to easily create new columns that are potentially derived from existing columns.

In [31]:
# assign() method
df = (df.assign(kunal = df['one'] + df['one_trunc']).head())
df

Unnamed: 0,foo,one,one_trunc,kunal
0,bar,1,1,2
1,bar,2,2,4
2,bar,4,4,8


In the example above, we inserted a precomputed value. We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to.

In [32]:
df.assign(kunal_new = lambda x: (x['one'] + x['one_trunc'] + x['kunal']))

Unnamed: 0,foo,one,one_trunc,kunal,kunal_new
0,bar,1,1,2,4
1,bar,2,2,4,8
2,bar,4,4,8,16


In [33]:
dfa = pd.DataFrame({"A": [1, 2, 3],
    "B": [4, 5, 6]})

In [34]:
dfa.assign(C=lambda x: x['A'] + x['B'],
      D=lambda x: x['A'] + x['C'])

Unnamed: 0,A,B,C,D
0,1,4,5,6
1,2,5,7,9
2,3,6,9,12


### Operation 	Syntax 	Result

     Select column      	             df[col] 	Series
    Select row by label             	 df.loc[label] 	Series
    Select row by integer location 	     df.iloc[loc] 	Series
    Slice rows 	                         df[5:10] 	DataFrame
    Select rows by boolean vector 	     df[bool_vec] 	DataFrame

### Data alignment and arithmetic

In [47]:
df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.454091,-0.749402,-1.135324,-3.249297
1,0.547888,-0.093672,0.725329,0.402295
2,0.157358,-1.439405,0.318236,-0.038935
3,-0.587232,0.636113,-1.07103,0.841065
4,-1.270114,1.816919,-1.578404,0.230332
5,0.95668,-1.020633,-0.768718,-0.393888
6,0.250419,-0.058515,-1.179882,-0.594232
7,1.72296,1.706741,0.189615,-0.070573
8,-0.959256,-0.864741,1.752606,-0.394829
9,-0.556996,-0.301439,0.212587,0.386711


In [48]:
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df2

Unnamed: 0,A,B,C
0,-0.27998,-0.661967,-0.965778
1,0.56241,-1.057074,1.071389
2,0.947786,-1.091873,2.019499
3,-0.472676,-1.513996,0.541383
4,-2.647192,0.181378,0.723889
5,-0.533396,0.440961,-0.072686
6,-0.661267,0.526261,1.812568


In [43]:
df + df2#gives union of df and df1

Unnamed: 0,A,B,C,D
0,-1.87687,0.345252,-0.326075,
1,-1.554339,0.938443,1.041414,
2,1.073169,-0.645688,-1.10101,
3,-1.502825,1.296484,0.092364,
4,-0.491759,1.542057,0.024399,
5,0.930698,1.35137,1.638027,
6,2.783687,1.505817,-1.542669,
7,,,,
8,,,,
9,,,,


In [50]:
index = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
df

Unnamed: 0,A,B,C
2000-01-01,0.974738,-1.13961,-0.516355
2000-01-02,1.897879,-0.322625,1.041746
2000-01-03,-0.739693,0.896434,0.405306
2000-01-04,1.340073,0.366603,-1.386724
2000-01-05,1.112165,-0.276963,-1.632468
2000-01-06,0.433761,-0.235996,-0.542296
2000-01-07,0.91544,-0.534765,1.295107
2000-01-08,0.063778,0.029103,-0.164441


In [51]:
df.sub(df['A'], axis=0)

Unnamed: 0,A,B,C
2000-01-01,0.0,-2.114348,-1.491093
2000-01-02,0.0,-2.220505,-0.856133
2000-01-03,0.0,1.636128,1.144999
2000-01-04,0.0,-0.97347,-2.726796
2000-01-05,0.0,-1.389128,-2.744634
2000-01-06,0.0,-0.669758,-0.976057
2000-01-07,0.0,-1.450205,0.379667
2000-01-08,0.0,-0.034675,-0.228219


## Boolean Opeartions

In [8]:
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)
print(df1)
print(df2)

       a      b
0   True  False
1  False   True
2   True   True
       a      b
0  False   True
1   True   True
2   True  False


In [9]:
#for transposing use .T attribute
df1 = df1.T
df1

Unnamed: 0,0,1,2
a,True,False,True
b,False,True,True


In [10]:
#The dot method on DataFrame implements matrix multiplication
df.T.dot(df)

NameError: name 'df' is not defined

In [11]:
#Similarly, the dot method on Series implements dot product:
s1 = pd.Series(np.arange(5, 10))
s1.dot(s1)

255