# کار با کتابخانه Pandas

پانداس یکی از مهم‌ترین کتابخانه‌های پایتون است که دو ساختار مهم برای نگهداری اشیاء داده‌ای فراهم می‌کند.
این دو ساختار عبارتند از: سری‌ها (Series) و چارچوب‌های داده (DataFrame).
    

## سری ها

سری‌ها همانند آرایه‌های یک‌بعدی هستند که با یک آرایهٔ یک‌بعدی دیگر از برچسب‌ها به نام `index` متناظر شده‌اند.

In [6]:
from pandas import Series
obj = Series([4, 7, -5, 3])
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [7]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [11]:
 obj2['d'] 

4

In [12]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [13]:
 obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [17]:
import numpy as np
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [18]:
'e' in obj2

False

In [25]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [26]:
states = ['California', 'Ohio', 'Oregon', 'Texas']

In [27]:
obj4 = Series(sdata, index=states)

In [22]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [24]:
import pandas as pd
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

## DataFrame

In [29]:
from pandas import DataFrame
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)

In [30]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [31]:
frame = DataFrame(data, columns=['year', 'state', 'pop'])
frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [32]:
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [33]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [34]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [40]:
 frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [42]:
 frame2.iloc[0:2]

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,


In [43]:
frame2['debt'] = np.arange(5.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [45]:
frame2['eastern'] = (frame2.state == 'Ohio')
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,0.0,True
two,2001,Ohio,1.7,1.0,True
three,2002,Ohio,3.6,2.0,True
four,2001,Nevada,2.4,3.0,False
five,2002,Nevada,2.9,4.0,False


In [46]:
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [47]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [48]:
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [54]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [61]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

### Indexing, selection, and filtering

In [70]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [71]:
# `obj` is indexed by string labels, so integer keys in plain `[]` are only
# treated as positions through a deprecated fallback. `.iloc` states the
# positional intent explicitly and selects the same elements (b and d).
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [72]:
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [73]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [74]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [75]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [76]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [77]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [79]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [80]:
data.iloc[1, 1:3]

two      5
three    6
Name: Colorado, dtype: int32

In [81]:
# `.ix` mixed label and positional indexing and has been removed from pandas.
# Rows are selected by label with `.loc`; the positional column picks
# [3, 0, 1] are translated to labels through `data.columns`.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]


Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


## Summarizing and Computing Descriptive Statistics

In [82]:
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
                index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])

In [83]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [84]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [85]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [86]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [88]:
df.idxmax()

one    b
two    d
dtype: object

In [91]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## Reading and Writing Data in Text Format

In [1]:
import pandas as pd
bank = pd.read_csv('Datasets/customer_bank_data.csv',na_values=['$null$'])
bank[0:6]

Unnamed: 0,CUSTOMERID,AGE,EDUCATION,YEARSEMPLOYED,INCOME,CARDDEBT,OTHERDEBT,DEFAULTED,ADDRESS
0,1,41,UNDER GRADUATE,6,19,0.124,1.073,0.0,NBA001
1,2,47,SCHOOL,26,100,4.582,8.218,0.0,NBA021
2,3,33,UNDER GRADUATE,10,57,6.111,5.802,1.0,NBA013
3,4,29,UNDER GRADUATE,4,19,0.681,0.516,0.0,NBA009
4,5,47,SCHOOL,31,253,9.308,8.908,0.0,NBA008
5,6,40,SCHOOL,23,81,0.998,7.831,,NBA016


In [2]:
pd.isnull(bank)[0:6]

Unnamed: 0,CUSTOMERID,AGE,EDUCATION,YEARSEMPLOYED,INCOME,CARDDEBT,OTHERDEBT,DEFAULTED,ADDRESS
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,True,False


In [3]:
bank[0:6].to_csv('Datasets/bank.csv', index=False)