Pandas:
--------------
- Pandas is a third-party, open-source Python library used for data analysis (it is not part of the standard library and must be installed). It is used heavily for data manipulation, visualization, and preparing data for building machine learning models.
- Pandas implements a number of powerful data operations familiar to users of both database frameworks and spreadsheet programs.
- The two main data structures in pandas are the Series and the DataFrame.
- The DataFrame is the default way to store data, so being able to manipulate DataFrames quickly is essential for data analysis.

In [1]:
import pandas as pd
! pip install pandas

In [2]:
# Show which pandas version this kernel is running.
pd.__version__

'0.23.4'

Pandas Series
---------------------
- A Series is similar to a 1D NumPy array with index labels; its values normally share one dtype (mixed values are stored with dtype `object`).
- A Series can be created from:
    - List
    - Tuple
    - Dictionary
    - Numpy
    - Date_Range

In [3]:
# Build a Series from a plain Python list; pandas supplies the default
# integer index 0..n-1.
k = [12, 45, 67, 89, 23, 40]
s1 = pd.Series(data=k)
s1

0    12
1    45
2    67
3    89
4    23
5    40
dtype: int64

In [4]:
# Build a Series from a tuple; it is accepted just like a list.
t = (3, 4, 7, 8, 123)
s2 = pd.Series(data=t)
s2

0      3
1      4
2      7
3      8
4    123
dtype: int64

In [5]:
# Build a Series from a dictionary: the keys become the index labels.
# The values mix integers and a string, so the Series dtype is `object`.
di = {"a": 12, "45": "b", "c": 35}
s3 = pd.Series(data=di)
s3

a     12
45     b
c     35
dtype: object

In [6]:
# Overwrite the index labels in place; the values keep their order.
s3.index = [67,25,89]
s3

67    12
25     b
89    35
dtype: object

In [7]:
# Fetch a value by index label: 89 is a label here, not a position.
s3[89]

35

In [9]:
# NumPy is needed from here on for `np.arange`.
import numpy as np
# Relabel s1 with the integers 30..35 (np.arange excludes the stop value 36).
s1.index = np.arange(30,36)

In [10]:
# Display s1 again to check the new 30..35 index labels.
s1

30    12
31    45
32    67
33    89
34    23
35    40
dtype: int64

In [11]:
# Supply explicit string labels instead of the default integer index;
# one label is needed for each of the five values in `t`.
s4 = pd.Series(t, index=list("abcde"))
s4

a      3
b      4
c      7
d      8
e    123
dtype: int64

In [12]:
# A single scalar value is repeated for every label of the supplied index.
s5 = pd.Series(
    "APSSDC",
    index=[1, 2, 3, 4, 5, 6, 7, 89, 78],
)
s5

1     APSSDC
2     APSSDC
3     APSSDC
4     APSSDC
5     APSSDC
6     APSSDC
7     APSSDC
89    APSSDC
78    APSSDC
dtype: object

In [13]:
# date_range with a start and an end: one entry per calendar day,
# both endpoints included.
dates = pd.date_range("2021-05-10", "2021-05-12")
dates

DatetimeIndex(['2021-05-10', '2021-05-11', '2021-05-12'], dtype='datetime64[ns]', freq='D')

In [14]:
# Keep the left endpoint and drop the right one (2021-05-12 is excluded).
# The keyword for this was renamed: `closed=` was deprecated in pandas 1.4 and
# removed in 2.0 in favour of `inclusive=`. Try the current name first and fall
# back to the old one so the cell runs on both old and new pandas versions.
try:
    dates = pd.date_range(start="2021-05-10", end="2021-05-12", inclusive="left")
except TypeError:  # older pandas without the `inclusive` keyword
    dates = pd.date_range(start="2021-05-10", end="2021-05-12", closed="left")
dates

DatetimeIndex(['2021-05-10', '2021-05-11'], dtype='datetime64[ns]', freq='D')

In [15]:
# Instead of an end date, ask for a fixed number of daily periods from the start.
dates = pd.date_range("2021-05-10", periods=3)
dates

DatetimeIndex(['2021-05-10', '2021-05-11', '2021-05-12'], dtype='datetime64[ns]', freq='D')

In [16]:
# Print the docstring of pd.date_range (signature and parameter descriptions).
help(pd.date_range)

Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D' (calendar daily)
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight befo

In [17]:
# Values 11..19 labelled 1..9, so every label is one more than its position.
s = pd.Series(data=np.arange(11, 20), index=np.arange(1, 10))
s

1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

In [19]:
# Integer slices on a Series are positional (start:stop:step):
# positions 1, 3, 5 are returned, which carry the labels 2, 4, 6.
s[1:7:2]

2    12
4    14
6    16
dtype: int32

In [22]:
# A list of integers is matched against the index labels, not positions.
s[[1,4,6,9]] # fancy indexing

1    11
4    14
6    16
9    19
dtype: int32

In [23]:
# Creating DataFrames: a flat list becomes a one-column frame whose
# column is named 0 by default.
li = [12, 13, 14, 15]
df1 = pd.DataFrame(data=li)
df1

Unnamed: 0,0
0,12
1,13
2,14
3,15


In [24]:
# A list of lists is read row by row: each inner list becomes one row.
li1 = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
df2 = pd.DataFrame(data=li1)
df2

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [25]:
# Replace the default 0..2 row labels with custom ones (in place).
df2.index = ["a","h","d"]
df2

Unnamed: 0,0,1,2
a,1,2,3
h,4,5,6
d,7,8,9


In [26]:
# Rename the columns in place; the new labels here are integers.
df2.columns = [123,456,789]
df2

Unnamed: 0,123,456,789
a,1,2,3
h,4,5,6
d,7,8,9


In [28]:
# Stack the frame on top of itself; the row labels a/h/d are repeated.
pd.concat([df2,df2], axis = 0) # axis=0 -> along rows, axis=1 -> along columns

Unnamed: 0,123,456,789
a,1,2,3
h,4,5,6
d,7,8,9
a,1,2,3
h,4,5,6
d,7,8,9


In [29]:
# Append the rows of df2 to itself. `DataFrame.append` was deprecated in
# pandas 1.4 and removed in 2.0; `pd.concat` gives the same row-wise result
# and is available in every pandas version.
pd.concat([df2, df2])

Unnamed: 0,123,456,789
a,1,2,3
h,4,5,6
d,7,8,9
a,1,2,3
h,4,5,6
d,7,8,9


In [30]:
# With no `on=` argument, merge joins on the columns both frames have in
# common (here all of them). The result gets a fresh 0..n-1 index, so the
# a/h/d row labels do not appear in it.
pd.merge(df2,df2)

Unnamed: 0,123,456,789
0,1,2,3
1,4,5,6
2,7,8,9


In [31]:
# Frame built from a dict of Series: the dict keys become column names and
# the Series index (1..4) becomes the row labels.
df1 = pd.DataFrame(
    {
        "a": pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4]),
        "b": pd.Series([2000, 2001, 2002, 2003], index=[1, 2, 3, 4]),
    }
)
df1

Unnamed: 0,a,b
1,1,2000
2,2,2001
3,3,2002
4,4,2003


In [32]:
# Second frame with the same "a" column and row labels as df1, plus a
# string column "c".
df2 = pd.DataFrame(
    {
        "a": pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4]),
        "c": pd.Series(["ec", "hr", "sl", "tr"], index=[1, 2, 3, 4]),
    }
)
df2

Unnamed: 0,a,c
1,1,ec
2,2,hr
3,3,sl
4,4,tr


In [33]:
# df1 and df2 share only column "a", so it is used as the join key;
# "b" and "c" end up side by side in the result.
pd.merge(df1,df2)

Unnamed: 0,a,b,c
0,1,2000,ec
1,2,2001,hr
2,3,2002,sl
3,4,2003,tr


In [34]:
# Load the birds dataset straight from GitHub (needs network access),
# then preview the first five rows.
bird = pd.read_csv(
    "https://raw.githubusercontent.com/ikkurthipoojitha/Data-Analysis-using-Python/main/Datasets/birds.csv"
)
bird.head()

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw,type
0,0,80.78,6.68,72.01,4.88,41.81,3.7,5.5,4.03,38.7,3.84,SW
1,1,88.91,6.63,80.53,5.59,47.04,4.3,80.22,4.51,41.5,4.01,SW
2,2,79.97,6.37,69.26,5.28,43.07,3.9,75.35,4.04,38.31,3.34,SW
3,3,77.65,5.7,65.76,4.77,40.04,3.52,69.17,3.4,35.78,3.41,SW
4,4,62.8,4.84,52.09,3.73,33.95,2.72,56.27,2.96,31.88,3.13,SW


In [35]:
# Offline alternative (assumes a local copy of the file): bird = pd.read_csv("birds.csv")
# Show the last five rows of the frame.
bird.tail()

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw,type
415,415,17.96,1.63,19.25,1.33,18.36,1.54,31.25,1.33,21.99,1.15,SO
416,416,19.21,1.64,20.76,1.49,19.24,1.45,33.21,1.28,23.6,1.15,SO
417,417,18.79,1.63,19.83,1.53,20.96,1.43,34.45,1.41,22.86,1.21,SO
418,418,20.38,1.78,22.53,1.5,21.35,1.48,36.09,1.53,25.98,1.24,SO
419,419,17.89,1.44,19.26,1.1,17.62,1.34,29.81,1.24,21.69,1.05,SO


In [36]:
# (number of rows, number of columns)
bird.shape

(420, 12)

In [37]:
# Three randomly chosen rows; no random_state is passed, so a rerun
# is expected to pick different rows.
bird.sample(3)

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw,type
103,103,68.5,3.77,71.15,3.34,24.86,1.95,51.44,2.1,30.13,2.37,SW
380,380,24.84,2.32,28.88,2.01,25.65,2.01,40.2,1.83,26.66,1.65,SO
404,404,20.36,1.87,22.19,1.6,,1.77,37.47,1.64,25.54,1.34,SO


In [38]:
# Summary statistics for the numeric columns (the text column `type` is left out).
# A count below the 420 rows of the frame means that column has missing values.
bird.describe()

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw
count,420.0,419.0,419.0,417.0,418.0,418.0,419.0,418.0,419.0,419.0,419.0
mean,209.5,64.650501,4.370573,69.115372,3.597249,36.872416,3.220883,64.662823,3.182339,39.229976,2.930024
std,121.387808,53.834549,2.854617,58.784775,2.186747,19.979082,2.023581,37.838145,2.080827,23.184313,2.185673
min,0.0,9.85,1.14,14.09,1.0,11.83,0.93,5.5,0.87,7.77,0.66
25%,104.75,25.17,2.19,28.05,1.87,21.2975,1.715,36.4175,1.565,23.035,1.425
50%,209.5,44.18,3.5,43.71,2.945,31.13,2.52,52.12,2.49,31.74,2.23
75%,314.25,90.31,5.81,97.52,4.77,47.12,4.135,82.87,4.255,50.25,3.5
max,419.0,420.0,17.84,422.0,12.0,117.07,11.64,240.0,11.03,175.0,14.09


In [40]:
# Row labels of the frame: the default RangeIndex 0..419.
bird.index

RangeIndex(start=0, stop=420, step=1)

In [41]:
# Column labels of the frame.
bird.columns

Index(['id', 'huml', 'humw', 'ulnal', 'ulnaw', 'feml', 'femw', 'tibl', 'tibw',
       'tarl', 'tarw', 'type'],
      dtype='object')

In [44]:
# A list inside [] selects several columns at once and returns a DataFrame.
bird[["huml","humw","tarw"]].head()

Unnamed: 0,huml,humw,tarw
0,80.78,6.68,3.84
1,88.91,6.63,4.01
2,79.97,6.37,3.34
3,77.65,5.7,3.41
4,62.8,4.84,3.13


In [45]:
# Row slice start:stop:step -> rows 5, 9, 13, 17 (the stop value 20 is excluded).
bird[5:20:4]

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw,type
5,5,61.92,4.78,50.46,3.47,49.52,4.41,56.95,2.73,29.07,2.83,SW
9,9,145.0,10.42,144.0,7.05,70.96,7.44,120.0,7.31,78.67,6.34,SW
13,13,148.91,6.78,121.35,6.5,44.29,6.33,155.0,6.68,73.71,4.24,SW
17,17,124.53,6.01,100.37,5.82,34.54,5.43,145.0,7.12,65.92,3.28,SW


In [48]:
# Slice out row 10 first, then pick a column: the result is still a
# (one-element) Series, not a scalar.
bird[10:11]['huml']

10    165.0
Name: huml, dtype: float64

In [49]:
# Pick the column first, then the entry labelled 10: this returns the scalar value.
bird['huml'][10]

165.0

In [50]:
# Select one row by integer position (position 10, counting from 0).
bird.iloc[10]

id          10
huml       165
humw     11.45
ulnal      156
ulnaw     8.68
feml     80.25
femw      7.85
tibl       143
tibw      8.25
tarl     86.61
tarw      6.63
type        SW
Name: 10, dtype: object

In [51]:
# Select one row by index label. With the default RangeIndex, label 10 is
# also position 10, so the result matches bird.iloc[10].
bird.loc[10]

id          10
huml       165
humw     11.45
ulnal      156
ulnaw     8.68
feml     80.25
femw      7.85
tibl       143
tibw      8.25
tarl     86.61
tarw      6.63
type        SW
Name: 10, dtype: object

In [52]:
# .loc[row_label, column_label] returns a single cell value.
bird.loc[120,'huml']

118.52

In [53]:
# Label-based slices include BOTH endpoints: rows 10 through 20.
bird.loc[10:20,"huml"]

10    165.00
11    186.00
12    172.00
13    148.91
14    149.19
15    140.59
16    135.23
17    124.53
18    127.03
19    106.02
20    113.84
Name: huml, dtype: float64

In [54]:
# Combine a label-based row range with a list of column names.
bird.loc[100:110,["huml","humw"]]

Unnamed: 0,huml,humw
100,67.06,3.54
101,67.59,3.73
102,100.27,5.75
103,68.5,3.77
104,68.15,3.84
105,107.76,4.71
106,109.7,4.65
107,100.69,4.8
108,100.08,5.07
109,70.04,4.74


In [56]:
# Number of non-missing values in each column.
bird.count()

id       420
huml     419
humw     419
ulnal    417
ulnaw    418
feml     418
femw     419
tibl     418
tibw     419
tarl     419
tarw     419
type     420
dtype: int64

In [57]:
# Smallest value in the `tarw` column.
bird["tarw"].min()

0.66

In [58]:
# Largest value in the `tarw` column.
bird["tarw"].max()

14.09

In [59]:
# Element-wise comparison: yields a boolean Series (a mask) with one
# True/False per row.
bird["huml"] > 90

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8       True
9       True
10      True
11      True
12      True
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
390    False
391    False
392    False
393    False
394    False
395    False
396    False
397    False
398    False
399    False
400    False
401    False
402    False
403    False
404    False
405    False
406    False
407    False
408    False
409    False
410    False
411    False
412    False
413    False
414    False
415    False
416    False
417    False
418    False
419    False
Name: huml, Length: 420, dtype: bool

In [62]:
# Keep the rows where both conditions hold. `&` is the element-wise AND;
# each comparison needs its own parentheses because `&` binds more tightly
# than `>` and `<`.
bird[(bird["huml"]>90) & (bird["huml"]<150)]

Unnamed: 0,id,huml,humw,ulnal,ulnaw,feml,femw,tibl,tibw,tarl,tarw,type
8,8,118.20,7.82,116.64,6.13,59.33,5.45,110.00,5.58,61.62,4.37,SW
9,9,145.00,10.42,144.00,7.05,70.96,7.44,120.00,7.31,78.67,6.34,SW
13,13,148.91,6.78,121.35,6.50,44.29,6.33,155.00,6.68,73.71,4.24,SW
14,14,149.19,6.98,121.48,6.21,43.33,5.39,155.00,6.61,75.65,3.36,SW
15,15,140.59,6.59,115.97,5.81,39.75,5.61,166.00,7.83,74.86,3.52,SW
16,16,135.23,6.22,108.78,5.83,37.99,5.13,157.00,7.18,72.44,3.53,SW
17,17,124.53,6.01,100.37,5.82,34.54,5.43,145.00,7.12,65.92,3.28,SW
18,18,127.03,6.18,102.53,5.64,36.52,4.89,150.00,6.78,67.27,3.06,SW
19,19,106.02,4.47,95.79,4.33,41.96,4.80,129.59,6.94,70.29,3.35,SW
20,20,113.84,4.80,100.71,4.39,43.95,5.08,133.27,7.41,70.91,2.69,SW


In [64]:
# Filter to the rows whose `type` equals "P"; the first entry of the
# shape is the number of such rows.
bird[bird["type"] == "P"].shape

(38, 12)