# Pandas

#### Dataframes and Series

- A DataFrame is a two-dimensional, labeled table (rows and columns).
- A Series is a one-dimensional, labeled array; each column of a DataFrame is a Series.

In [1]:
# importing the libraries
import numpy as np
import pandas as pd

In [2]:
# Build a Series from a plain Python list. The missing value (np.nan)
# forces the whole Series to a floating-point dtype.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [3]:
# Display the Series: default integer index 0-5, one NaN entry, dtype float64.
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Build the row index for the DataFrame created below:
# six consecutive calendar days starting on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')

In [5]:
# Display the DatetimeIndex: six daily timestamps (freq='D').
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [6]:
# Create a 6x4 DataFrame of standard-normal random numbers, indexed by `dates`,
# with columns A-D.
# The column labels are written as a literal instead of list('ABCD'): a later
# cell rebinds the name `list`, after which re-running list('ABCD') here would
# raise "TypeError: 'list' object is not callable".
# NOTE: no random seed is set in the cells above, so the values change on every run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

In [7]:
# Display the whole frame (6 rows x 4 columns, small enough to show in full).
df

Unnamed: 0,A,B,C,D
2013-01-01,0.262757,-0.722359,-0.388285,0.848521
2013-01-02,0.291914,-0.706505,-1.198581,1.066132
2013-01-03,0.253489,0.644618,-0.698318,-0.803663
2013-01-04,-0.048804,0.366563,1.039526,-1.148384
2013-01-05,0.891338,-1.31993,1.000555,-0.113348
2013-01-06,-0.665571,-0.529516,-0.017647,-0.946171


In [8]:
# Get the first row by integer position. The result is a Series whose index
# is the column labels (A-D) and whose name is the row's index label.
df.iloc[0]

A    0.262757
B   -0.722359
C   -0.388285
D    0.848521
Name: 2013-01-01 00:00:00, dtype: float64

In [9]:
# Same first row, but head(1) keeps it as a one-row DataFrame instead of a Series.
df.head(1)

Unnamed: 0,A,B,C,D
2013-01-01,0.262757,-0.722359,-0.388285,0.848521


In [10]:
# Select a single column by label; the result is a Series indexed by the dates.
# Bracket access is used because it also works for column names that are not
# valid Python identifiers or that clash with DataFrame attributes.
df['A']

2013-01-01    0.262757
2013-01-02    0.291914
2013-01-03    0.253489
2013-01-04   -0.048804
2013-01-05    0.891338
2013-01-06   -0.665571
Freq: D, Name: A, dtype: float64

In [11]:
# Sample inputs used by the following cells to build Series from a list and a dict
# (`array` holds the same values as a NumPy array).
# NOTE(review): `list` and `dict` rebind Python's built-in types for the rest of the
# kernel session. Any cell run (or re-run) afterwards that calls list(...) or dict(...)
# will fail with "TypeError: ... object is not callable". The names are left unchanged
# here because the following cells refer to them; rename them together with those cells
# (e.g. to `values` / `mapping`).
labels = ['W', 'X', 'Y', 'Z']
list = [10, 20, 30, 40]
array = np.array([10, 20, 30, 40])
dict = {'w': 10, 'x': 30, 'y': 20, 'z': 40}

In [12]:
# Series from the Python list only: with no index given, pandas uses the default
# integer index 0..3.
pd.Series(data=list)

0    10
1    20
2    30
3    40
dtype: int64

In [13]:
# Same values, but labelled with the entries of `labels` instead of integers.
pd.Series(data=list, index=labels)

W    10
X    20
Y    30
Z    40
dtype: int64

In [14]:
# Create a Series from a dictionary: the keys become the index labels
# and the values become the data.
pd.Series(dict)

w    10
x    30
y    20
z    40
dtype: int64

In [15]:
# A dictionary whose values have different Python types
# (int, list, tuple, dict), used to show that a Series can hold arbitrary objects.
a = {
    'w': 10,
    'x': [30, 45, 89],
    'y': ('ab', 'cd', 'ef'),
    'z': {'a': 56, 'b': 89},
}

In [16]:
# Each dictionary value is stored as-is in one element, so the Series
# has the generic `object` dtype rather than a numeric one.
pd.Series(a)

w                    10
x          [30, 45, 89]
y          (ab, cd, ef)
z    {'a': 56, 'b': 89}
dtype: object

### Using an index

#### Indexing a Series

In [17]:
# First example Series: integer values labelled by sport name.
sports1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['Cricket', 'Football', 'Basketball', 'Golf'],
)

In [18]:
# Second example Series: shares 'Cricket', 'Basketball' and 'Golf' with sports1,
# but has 'Hockey' where sports1 has 'Football'.
sports2 = pd.Series(
    data=[1, 2, 5, 4],
    index=['Cricket', 'Hockey', 'Basketball', 'Golf'],
)

In [19]:
# Display sports1: the sport names form the index, the values are int64.
sports1

Cricket       1
Football      2
Basketball    3
Golf          4
dtype: int64

In [20]:
# Display sports2 for comparison with sports1 ('Hockey' instead of 'Football').
sports2

Cricket       1
Hockey        2
Basketball    5
Golf          4
dtype: int64

In [21]:
# Look up a single element by its index label; this returns the scalar value.
sports1['Cricket']

1

##### Arithmetic between Series aligns on index labels; a label present in only one Series gives NaN

In [22]:
# Values are added label by label, not by position. 'Football' and 'Hockey'
# each exist in only one operand, so their sums are NaN, and the result
# becomes float64 to hold those NaNs.
sports1 + sports2

Basketball    8.0
Cricket       2.0
Football      NaN
Golf          8.0
Hockey        NaN
dtype: float64

In [23]:
# Seed NumPy's global random generator so the scores below are identical on every run.
np.random.seed(100)

# 10 rows (labelled A-J) x 5 score columns of standard-normal random numbers.
dataframe = pd.DataFrame(
    data=np.random.randn(10, 5),
    index=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    columns=['Score1', 'Score2', 'Score3', 'Score4', 'Score5'],
)

In [24]:
# Display the seeded 10x5 score table.
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,-1.749765,0.34268,1.153036,-0.252436,0.981321
B,0.514219,0.22118,-1.070043,-0.189496,0.255001
C,-0.458027,0.435163,-0.583595,0.816847,0.672721
D,-0.104411,-0.53128,1.029733,-0.438136,-1.118318
E,1.618982,1.541605,-0.251879,-0.842436,0.184519
F,0.937082,0.731,1.361556,-0.326238,0.055676
G,0.2224,-1.443217,-0.756352,0.816454,0.750445
H,-0.455947,1.189622,-1.690617,-1.356399,-1.232435
I,-0.544439,-0.668172,0.007315,-0.612939,1.299748
J,-1.733096,-0.98331,0.357508,-1.613579,1.470714
