# what is pandas?

pandas is an open-source Python library that provides data structures and data manipulation tools designed to make data cleaning and analysis fast and easy.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas import Series,DataFrame

# introduction to pandas data structure

# Series

A Series is a one-dimensional array-like object containing a sequence of values (of
similar types to NumPy types) and an associated array of data labels, called its index.

In [3]:
# Build a Series from a plain list; with no index given, pandas labels the
# values with the default integer index 0..5.
S1 = pd.Series(data=[3, 6, 8, 9, 7, 9])
S1

0    3
1    6
2    8
3    9
4    7
5    9
dtype: int64

In [4]:
# .values exposes the Series' data as a NumPy array (without the index).
S1.values

array([3, 6, 8, 9, 7, 9], dtype=int64)

In [5]:
S1.index # like range(6)

RangeIndex(start=0, stop=6, step=1)

In [6]:
# Provide explicit labels so that every value can be looked up by name.
s2 = pd.Series(data=[4, 5, 6, 7, 8], index=list("abcde"))
s2

a    4
b    5
c    6
d    7
e    8
dtype: int64

In [7]:
s2.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [8]:
s2.values

array([4, 5, 6, 7, 8], dtype=int64)

In [9]:
s2["d"]

7

In [10]:
# Values are mutable: overwrite the entry stored under label "a".
s2.loc["a"] = 10
s2

a    10
b     5
c     6
d     7
e     8
dtype: int64

Using NumPy functions or NumPy-like operations, such as filtering with a boolean
array, scalar multiplication, or applying math functions, will preserve the index-value link.

In [11]:
s2[s2>5]

a    10
c     6
d     7
e     8
dtype: int64

In [12]:
s2 * 2

a    20
b    10
c    12
d    14
e    16
dtype: int64

In [13]:
# NumPy ufuncs work element-wise on a Series, and the result keeps the same index labels.
np.exp(s2)

a    22026.465795
b      148.413159
c      403.428793
d     1096.633158
e     2980.957987
dtype: float64

In [14]:
"a" in s2

True

In [15]:
"h" in s2

False

In [16]:
# `in` tests the index labels, not the stored values: 10 is a value (under label "a")
# but not a label, so this evaluates to False.
10 in s2

False

In [17]:
# A dict can be passed straight to Series: its keys become the index labels
# and its values become the data.
sdata = dict(dg=501, NA=507, RA=503)
data1 = pd.Series(data=sdata)
data1

dg    501
NA    507
RA    503
dtype: int64

In [18]:
# Passing an explicit index selects and orders the dict entries by label.
# "de" is not a key of sdata, so its value is NaN ("Not a Number", the marker
# pandas uses for missing data); "NA" is not in the index, so it is left out.
data = ["de", "dg", "RA"]
data2 = pd.Series(data=sdata, index=data)
data2

de      NaN
dg    501.0
RA    503.0
dtype: float64

In [19]:
# Missing data is detected with isnull / notnull, which exist both as Series
# methods and as top-level pandas functions.
data2.isnull()  # same result as pd.isnull(data2)

de     True
dg    False
RA    False
dtype: bool

In [20]:
# Inverse of isnull: True wherever a value is present.
data2.notnull()  # same result as pd.notnull(data2)

de    False
dg     True
RA     True
dtype: bool

In [21]:
# Arithmetic between two Series aligns on index labels. A label present in only one
# operand ("NA", "de") or whose value is NaN produces NaN in the result.
data3 = data1 + data2
data3

NA       NaN
RA    1006.0
de       NaN
dg    1002.0
dtype: float64

In [22]:
# A Series can carry a name; it is shown in the repr as "Name: subject".
data3.name = "subject"

In [23]:
# The index can be named as well; the name is printed above the labels in the repr.
data3.index.name = "course title"

In [24]:
data3

course title
NA       NaN
RA    1006.0
de       NaN
dg    1002.0
Name: subject, dtype: float64

# DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection
of columns, each of which can be a different value type (numeric, string,
boolean, etc.). The DataFrame has both a row and column index.

In [25]:
# Each inner list becomes one row; row and column labels default to 0, 1, 2.
df = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)

If we pass a list of lists, pandas treats each inner list as one row and automatically generates integer row and column indexes for the DataFrame.

In [26]:
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [53]:
# Column-oriented sample data: every key is a column name and every list holds
# that column's values (all lists have the same length).
data4 = dict(
    state=["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3],
)

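Note:
1. When a DataFrame is built from a dict, each key becomes a column name and each value becomes that column's data (a Series).
2. The `columns` argument selects the DataFrame's columns and arranges them in the given order.
3. The `index` argument sets the row labels explicitly instead of using the default 0, 1, 2, ...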

In [54]:
# Build the DataFrame from the dict of columns and label the six rows explicitly.
frame = pd.DataFrame(
    data=data4,
    index=[10, 20, 30, 40, 50, 60],
)

In [55]:
frame

Unnamed: 0,state,year,pop
10,Ohio,2000,1.5
20,Ohio,2001,1.7
30,Ohio,2002,3.6
40,Nevada,2001,2.4
50,Nevada,2002,2.9
60,Nevada,2003,3.0


In [56]:
# A column's values can be modified by assignment; a scalar is broadcast to every row.
# Note that this overwrites the original "pop" figures in place.
frame["pop"] = 5.6
frame

Unnamed: 0,state,year,pop
10,Ohio,2000,5.6
20,Ohio,2001,5.6
30,Ohio,2002,5.6
40,Nevada,2001,5.6
50,Nevada,2002,5.6
60,Nevada,2003,5.6


In [30]:
# head() returns only the first five rows.
# NOTE(review): the saved output below looks stale: it shows index 0-4 and the original
# "pop" values, whereas `frame` was built with index 10-60 and "pop" was overwritten with 5.6.
# Re-run the notebook top to bottom (Restart & Run All) to refresh it.
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [31]:
# `columns` fixes the column order. "debt" is not a key of data4, so that
# column is created and filled with missing values.
frame1 = pd.DataFrame(
    data=data4,
    columns=["year", "state", "pop", "debt"],
    index=["one", "two", "three", "four", "five", "six"],
)
frame1

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.0,


In [32]:
frame1.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [33]:
# A single column is returned as a Series that shares the DataFrame's index.
frame1["year"]  # attribute-style access also works: frame1.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [34]:
# A row can be retrieved by its index label with the special loc attribute
# (integer-position lookup uses iloc instead).
frame1.loc["three"]

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [35]:
# A column can be modified by assignment; a scalar is broadcast to every row.
frame1["debt"] = 19
frame1


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,19
two,2001,Ohio,1.7,19
three,2002,Ohio,3.6,19
four,2001,Nevada,2.4,19
five,2002,Nevada,2.9,19
six,2003,Nevada,3.0,19


In [36]:
# "debt" already exists, so this replaces its values; the array's length (6) matches
# the number of rows. Assigning to a column name that does not exist would create a new column.
frame1["debt"] = np.arange(6.)

In [37]:
frame1

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.0,5.0


In [57]:
# .values returns the DataFrame's data as a two-dimensional NumPy array. Because the
# columns hold different types (ints, strings, floats), the array's dtype is object.
frame1.values

array([[2000, 'Ohio', 1.5, 0.0],
       [2001, 'Ohio', 1.7, 1.0],
       [2002, 'Ohio', 3.6, 2.0],
       [2001, 'Nevada', 2.4, 3.0],
       [2002, 'Nevada', 2.9, 4.0],
       [2003, 'Nevada', 3.0, 5.0]], dtype=object)

In [38]:
# Exam scores per student, stored column-wise (one list per column).
data5 = dict(
    student=["liba", "arman", "kinat", "mahnoor", "hifza"],
    midterm=[89, 90, 79, 70, 80],
    final=[90, 78, 67, 88, 86],
)

In [39]:
# Turn the score dict into a table with the default 0..4 row index.
frame2 = pd.DataFrame(data=data5)

In [40]:
frame2

Unnamed: 0,student,midterm,final
0,liba,89,90
1,arman,90,78
2,kinat,79,67
3,mahnoor,70,88
4,hifza,80,86


In [41]:
# Per-student score change: midterm minus final (positive = better on the midterm).
diff = frame2["midterm"] - frame2["final"]
diff

0    -1
1    12
2    12
3   -18
4    -6
dtype: int64

In [42]:
# Re-create the table with an extra "diff" column; it does not exist in frame2,
# so it starts out filled with missing values.
frame3 = pd.DataFrame(
    data=frame2,
    columns=["student", "midterm", "final", "diff"],
)
frame3

Unnamed: 0,student,midterm,final,diff
0,liba,89,90,
1,arman,90,78,
2,kinat,79,67,
3,mahnoor,70,88,
4,hifza,80,86,


In [43]:
diff.mean()

-0.2

In [44]:
# Fill the "diff" column with midterm minus final. Bracket notation is used
# because `diff` is also the name of a DataFrame method.
frame3["diff"] = frame2["midterm"] - frame2["final"]
frame3

Unnamed: 0,student,midterm,final,diff
0,liba,89,90,-1
1,arman,90,78,12
2,kinat,79,67,12
3,mahnoor,70,88,-18
4,hifza,80,86,-6


In [45]:
# mode() returns a Series rather than a scalar (there can be more than one mode);
# here 12 is the only value that appears twice.
diff.mode()

0    12
dtype: int64

In [46]:
diff.median()

-1.0