# Pandas Library

## کتابخانه پانداس

In [1]:
import pandas as pd
import numpy as np

### 1. سری ها

In [5]:
# Build a Series from a plain Python list. No index is supplied, so pandas
# labels the entries with the default integer index 0..n-1.
x0 = [5, 3, 8, 7]
s1 = pd.Series(data=x0)
s1

0    5
1    3
2    8
3    7
dtype: int64

In [6]:
# .values exposes the Series data as a NumPy array (the index labels are not included).
s1.values

array([5, 3, 8, 7], dtype=int64)

In [7]:
# Same values as before, but with explicit string labels instead of the
# default 0..n-1 index.
s2 = pd.Series(x0, index=list('abcd'))
s2

a    5
b    3
c    8
d    7
dtype: int64

In [8]:
# Label-based lookup: fetch the element whose index label is 'b'.
s2['b']

3

In [9]:
# Positional lookup: fetch the second element regardless of its label.
# Use .iloc explicitly: plain s2[1] on a string-labelled Series relies on a
# positional fallback that is deprecated (FutureWarning since pandas 2.1).
s2.iloc[1]

3

In [10]:
# Passing a list of labels selects several elements and returns a Series.
s2[['a','d']]

a    5
d    7
dtype: int64

In [11]:
# NumPy functions apply element-wise to a Series and keep its index labels.
np.sqrt(s2)

a    2.236068
b    1.732051
c    2.828427
d    2.645751
dtype: float64

In [12]:
# s2 itself is unchanged: np.sqrt returned a new Series instead of modifying s2.
s2

a    5
b    3
c    8
d    7
dtype: int64

In [13]:
# `in` on a Series tests membership among the index labels.
'a' in s2

True

In [14]:
# 'f' is not one of the index labels, so this is False.
'f' in s2

False

In [15]:
# 8 is one of the values of s2 but not an index label, so this is False.
# To test membership among the values, use `8 in s2.values`.
8 in s2

False

In [2]:
# Build a Series from a dictionary: the keys become the index labels and
# the values become the data.
data = dict(Bushehr=25000, Borazjan=12000, Kangan=1500)
s3 = pd.Series(data)
s3

Bushehr     25000
Borazjan    12000
Kangan       1500
dtype: int64

In [9]:
# With an explicit index, the dictionary entries are looked up by key and
# returned in the order given by `city`.
city = ['Kangan','Bushehr','Borazjan']
s4 = pd.Series(data,index=city)
s4

Kangan       1500.0
Bushehr     25000.0
Borazjan    12000.0
dtype: float64

In [8]:
# 'Khormoj' has no entry in `data`, so its value in the result is NaN (missing).
city = ['Kangan','Bushehr','Borazjan','Khormoj']
s5 = pd.Series(data,index=city)
s5

Kangan       1500.0
Bushehr     25000.0
Borazjan    12000.0
Khormoj         NaN
dtype: float64

In [10]:
# Boolean mask: True where the value is missing (NaN).
s5.isnull()

Kangan      False
Bushehr     False
Borazjan    False
Khormoj      True
dtype: bool

In [11]:
# Function form of the same check; it produces the same mask as s5.isnull().
pd.isnull(s5)

Kangan      False
Bushehr     False
Borazjan    False
Khormoj      True
dtype: bool

In [12]:
# A scalar is repeated for every label of the given index (here 0..6).
pd.Series(5,index=range(7))

0    5
1    5
2    5
3    5
4    5
5    5
6    5
dtype: int64

## DataFrame

In [13]:
# Column-oriented input for a DataFrame: each key is a column name and each
# list holds that column's values.
data = {
    'Name': ['Ahmad', 'Zahra', 'Ali'],
    'Amar': [20, 18, 19],
    'Computer': [18, 19, 17],
}
data

{'Name': ['Ahmad', 'Zahra', 'Ali'],
 'Amar': [20, 18, 19],
 'Computer': [18, 19, 17]}

In [14]:
# A dict of equal-length lists becomes a DataFrame: keys turn into columns
# and the rows get the default 0..n-1 index.
f1 = pd.DataFrame(data)
f1

Unnamed: 0,Name,Amar,Computer
0,Ahmad,20,18
1,Zahra,18,19
2,Ali,19,17


In [18]:
# `columns=` selects and orders the columns. 'Riazi' is not a key of `data`,
# so that column is filled with missing values; 'Computer' is not listed and
# is therefore left out.
f2 = pd.DataFrame(data,columns=['Name','Amar','Riazi'])
f2

Unnamed: 0,Name,Amar,Riazi
0,Ahmad,20,
1,Zahra,18,
2,Ali,19,


In [19]:
# Dictionary-style access returns a single column as a Series.
f1['Computer']

0    18
1    19
2    17
Name: Computer, dtype: int64

In [20]:
# Attribute-style access returns the same column as f1['Computer'].
f1.Computer

0    18
1    19
2    17
Name: Computer, dtype: int64

In [3]:
# Build a frame with explicit row labels supplied through `index=`.
# Note: this rebinds `f2`, replacing the frame created earlier under the same name.
D = {
    'Vorudi': [97, 97, 97, 98, 98, 98],
    'Vajeh': [18, 17, 16, 15, 16, 17],
    'B1': [18, 19, 17, 19, 18, 17],
}
f2 = pd.DataFrame(
    D,
    index=[9712, 9710, 9718, 9817, 9816, 9815],
)
f2

Unnamed: 0,Vorudi,Vajeh,B1
9712,97,18,18
9710,97,17,19
9718,97,16,17
9817,98,15,19
9816,98,16,18
9815,98,17,17


In [25]:
# Attribute-style column access; the returned Series keeps the frame's row labels.
f2.B1

9712    18
9710    19
9718    17
9817    19
9816    18
9815    17
Name: B1, dtype: int64

In [27]:
# .loc selects a row by its index label; the row comes back as a Series
# whose index is the column names.
f2.loc[9718]

Vorudi    97
Vajeh     16
B1        17
Name: 9718, dtype: int64

In [30]:
# Slicing with [] selects rows by position (here the first two rows),
# unlike f2['B1'], which selects a column.
f2[0:2]

Unnamed: 0,Vorudi,Vajeh,B1
9712,97,18,18
9710,97,17,19


In [33]:
# A single column name inside [] selects that column as a Series.
f2['B1']

9712    18
9710    19
9718    17
9817    19
9816    18
9815    17
Name: B1, dtype: int64

In [32]:
# .iloc selects purely by position: first two rows and first two columns.
f2.iloc[0:2,0:2]

Unnamed: 0,Vorudi,Vajeh
9712,97,18
9710,97,17


## جلسه دوم

In [12]:
# Sample data for the second session: one row per (state, year) pair with
# its `pop` value.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [13]:
# Assigning to a new column name adds that column to the frame in place.
# The values are sized from the frame itself, so the assignment keeps working
# if the number of rows changes (a hard-coded length raises ValueError on a
# mismatch).
frame['x'] = np.arange(len(frame))
frame

Unnamed: 0,state,year,pop,x
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4


In [14]:
# Delete a column: `del` removes 'x' from the frame in place.
del frame['x']
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [15]:
# .values returns the data as a 2-D NumPy array; because the columns hold
# different types (strings, ints, floats) the array dtype is object.
frame.values

array([['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9]], dtype=object)

## Summarizing and Computing Descriptive Statistics

In [19]:
# Small frame with missing values (NaN) used to demonstrate how the
# descriptive statistics below treat them. Built column by column.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [18]:
# Column-wise sums; missing values (NaN) are skipped.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [20]:
# axis=1 sums across the columns of each row; row 'c', which is entirely NaN,
# sums to 0.0.
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [21]:
# skipna=False stops NaN from being ignored: any row containing a NaN gets
# a NaN mean.
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [22]:
# For each column, the index label of the row holding the maximum value.
df.idxmax()

one    b
two    d
dtype: object

In [23]:
# Summary statistics per column (count, mean, std, min, quartiles, max);
# `count` only counts the non-missing values.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


## GroupBy

In [1]:
import pandas as pd
import numpy as np

# Sample team results used throughout the GroupBy section.
# NOTE(review): the sixth Team entry is lower-case 'kings'. It looks like a
# typo for 'Kings', but it is kept as-is because grouping by Team treats it
# as a separate group in the outputs shown later.
data = {
    'Team': [
        'Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings',
        'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders',
    ],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [
        2014, 2015, 2014, 2015, 2014, 2015,
        2016, 2017, 2016, 2014, 2015, 2017,
    ],
    'Points': [
        876, 789, 863, 673, 741, 812,
        756, 788, 694, 701, 804, 690,
    ],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
1,Riders,2,2015,789
2,Devils,2,2014,863
3,Devils,3,2015,673
4,Kings,3,2014,741
5,kings,4,2015,812
6,Kings,1,2016,756
7,Kings,1,2017,788
8,Riders,2,2016,694
9,Royals,4,2014,701


In [7]:
# groupby returns a DataFrameGroupBy object (its repr is shown below), not a
# result table; results come from iterating over it or aggregating it.
grouped = df.groupby('Year')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000277C46EA800>

In [8]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs; print the
# key on one line followed by the rows belonging to that group.
for name, group in grouped:
    print(name, group, sep='\n')

2014
     Team  Rank  Year  Points
0  Riders     1  2014     876
2  Devils     2  2014     863
4   Kings     3  2014     741
9  Royals     4  2014     701
2015
      Team  Rank  Year  Points
1   Riders     2  2015     789
3   Devils     3  2015     673
5    kings     4  2015     812
10  Royals     1  2015     804
2016
     Team  Rank  Year  Points
6   Kings     1  2016     756
8  Riders     2  2016     694
2017
      Team  Rank  Year  Points
7    Kings     1  2017     788
11  Riders     2  2017     690


### Select a Group

In [9]:
# Fetch the rows of a single group by its key (here Year == 2014).
grouped.get_group(2014)

Unnamed: 0,Team,Rank,Year,Points
0,Riders,1,2014,876
2,Devils,2,2014,863
4,Kings,3,2014,741
9,Royals,4,2014,701


### Aggregations

In [11]:
# Mean of Points within each Year group.
# The aggregation is passed by name: handing NumPy callables such as np.mean
# to agg() is deprecated (FutureWarning since pandas 2.1), and 'mean' gives
# the same result.
grouped['Points'].agg('mean')


Year
2014    795.25
2015    769.50
2016    725.00
2017    739.00
Name: Points, dtype: float64

In [32]:
# Group by team and count the rows in each group.
# 'Kings' and 'kings' differ in case, so they form two separate groups.
# The built-in 'size' aggregation is requested by name instead of routing a
# NumPy function through agg(); the counts are the same.
grouped2 = df.groupby('Team')
grouped2['Points'].agg('size')

Team
Devils    2
Kings     3
Riders    4
Royals    2
kings     1
Name: Points, dtype: int64

In [33]:
# Several aggregations at once: one output column per function name.
# String names replace np.sum / np.mean / np.std, whose use in agg() is
# deprecated (FutureWarning since pandas 2.1). The resulting column names and
# values are unchanged. 'std' is the sample standard deviation, so a group
# with a single row (such as 'kings') yields NaN.
grouped2['Points'].agg(['sum', 'mean', 'std'])

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,1536,768.0,134.350288
Kings,2285,761.666667,24.006943
Riders,3049,762.25,88.567771
Royals,1505,752.5,72.831998
kings,812,812.0,


## Pandas - IO Tools

### Reading files

In [2]:
# Read a CSV file into a DataFrame; the path is relative to the notebook's
# working directory. Rows get the default 0..n-1 index.
# Note: this rebinds `df`, which previously held the GroupBy sample data.
df=pd.read_csv("Datasets/temp.csv")
df

Unnamed: 0,S.No,Name,Age,City,Salary
0,1,Tom,28,Toronto,20000
1,2,Lee,32,HongKong,3000
2,3,Steven,43,Bay Area,8300
3,4,Ram,38,Hyderabad,3900


In [3]:
# index_col makes the 'S.No' column the row index instead of a regular column.
df=pd.read_csv("Datasets/temp.csv",index_col=['S.No'])
df

Unnamed: 0_level_0,Name,Age,City,Salary
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Tom,28,Toronto,20000
2,Lee,32,HongKong,3000
3,Steven,43,Bay Area,8300
4,Ram,38,Hyderabad,3900


In [13]:
# Read an Excel workbook into a DataFrame (rebinds `df` again).
df=pd.read_excel("Datasets/temp.xlsx")
df

Unnamed: 0,S.No,Name,Age,City,Salary
0,1,Tom,28,Toronto,20000
1,2,Lee,32,HongKong,3000
2,3,Steven,43,Bay Area,8300
3,4,Ram,38,Hyderabad,3900


In [9]:
# Read an Excel workbook whose column headers and text values are in Persian.
df2=pd.read_excel("Datasets/temp_per.xlsx")
df2

Unnamed: 0,نام,نام خانوادگی,ورودی,معدل
0,علی,احمدی,911,15.64
1,حسین,روستا,921,14.43
2,فاطمه,رمضانی,931,16.0


In [10]:
# read_csv also handles other delimiters: sep='\t' reads a tab-separated
# text file. This rebinds `df2` with the contents of the .txt file.
df2=pd.read_csv("Datasets/temp_per.txt",sep='\t')
df2

Unnamed: 0,نام,نام خانوادگی,ورودی,معدل
0,علی,احمدی,911,15.64
1,حسین,روستا,921,14.43
2,فاطمه,رمضانی,931,16.0


### Writing files

In [14]:
# Show the frame that is about to be written out.
df

Unnamed: 0,S.No,Name,Age,City,Salary
0,1,Tom,28,Toronto,20000
1,2,Lee,32,HongKong,3000
2,3,Steven,43,Bay Area,8300
3,4,Ram,38,Hyderabad,3900


In [15]:
# Write the frame to a CSV file; index=False leaves the row index out of the file.
df.to_csv('Datasets/temp1.csv',index=False)

In [16]:
# Write the Persian-language frame as tab-separated text, without the row index.
df2.to_csv('Datasets/temp1_per.txt',index=False,sep='\t')

In [None]:
# Write the same frame to an Excel workbook, without the row index.
df2.to_excel('Datasets/temp1_per.xlsx',index=False)