# Pandas Library
**Pandas** is a fast, powerful, flexible and easy to use open source data analysis and manipulation library,
built on top of the Python programming language.

## Series
A **Series** is a data type that is very similar to a NumPy array; in fact, it is built on top of the NumPy array object. What makes a Pandas Series different is that it can have *AXIS LABELS*, meaning it can be indexed by a label instead of only by a numeric position.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# With no index supplied, pandas labels the values with default integers starting at 0.
pd.Series([10,20,30])

0    10
1    20
2    30
dtype: int64

In [3]:
# Sample labels and values reused by the Series-construction examples below.
labels = list('abc')
my_list = [10, 20, 30]
# Same pairs as a dict: each label maps to the value at the same position.
dic = dict(zip(labels, my_list))

In [4]:
# Keyword form: `data` supplies the values and `index` supplies the label for each value.
pd.Series(data=my_list,index=labels)

a    10
b    20
c    30
dtype: int64

In [5]:
# Positional form of the same call: the values come first, the index labels second.
pd.Series(my_list, labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# Built from a dict: the keys become the index labels and the values become the data.
pd.Series(dic)

a    10
b    20
c    30
dtype: int64

## Using an Index
The key to using a Series is understanding its index. Pandas makes use of these index names by allowing for fast lookups of information.

In [7]:
# Exam scores keyed by student name; both Series share the same labels in the same order.
exam1 = pd.Series({'Mary': 90, 'Roy': 85, 'Pedro': 92, 'Anna': 75})
exam2 = pd.Series({'Mary': 65, 'Roy': 90, 'Pedro': 90, 'Anna': 100})

In [8]:
exam1

Mary     90
Roy      85
Pedro    92
Anna     75
dtype: int64

In [9]:
exam2

Mary      65
Roy       90
Pedro     90
Anna     100
dtype: int64

In [10]:
exam1+exam2

Mary     155
Roy      175
Pedro    182
Anna     175
dtype: int64

In [11]:
# Per-student mean of the two exams: scores are paired by index label, summed, then halved.
average = exam1.add(exam2).div(2)
average

Mary     77.5
Roy      87.5
Pedro    91.0
Anna     87.5
dtype: float64

In [12]:
average['Anna']

87.5

## DataFrame
A **Pandas DataFrame** is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). A DataFrame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns. A Pandas DataFrame consists of three principal components: the **_data_**, **_rows_**, and **_columns_**.
In the real world, a **Pandas DataFrame** will be created by loading the datasets from existing files.

You can create a DataFrame from lists or dictionaries.

In [13]:
# pandas is already imported as `pd` in the first code cell, so it is not re-imported here.
# A flat list becomes a single-column DataFrame with default integer row and column labels.
lst = ['Mary', 'Roy', 'Pedro', 'Anna']
df = pd.DataFrame(lst)

In [14]:
df

Unnamed: 0,0
0,Mary
1,Roy
2,Pedro
3,Anna


In [15]:
# Column-oriented data: each key is a column name and each list holds that column's values.
grades = dict(
    Name=['Mary', 'Roy', 'Pedro', 'Anna'],
    Grade=[90, 80, 90, 100],
)

In [16]:
# Each dict key becomes a column; the rows get default integer labels.
grades_df = pd.DataFrame(data=grades)
grades_df

Unnamed: 0,Name,Grade
0,Mary,90
1,Roy,80
2,Pedro,90
3,Anna,100


In [17]:
# Same layout as before, now with one column per exam.
grades = dict(
    Name=['Mary', 'Roy', 'Pedro', 'Anna'],
    exam1=[90, 80, 90, 100],
    exam2=[80, 93, 75, 85],
)

In [18]:
# One row per student, one column per dict key.
all_grades = pd.DataFrame(data=grades)
all_grades

Unnamed: 0,Name,exam1,exam2
0,Mary,90,80
1,Roy,80,93
2,Pedro,90,75
3,Anna,100,85


## Reading from a csv file into a DataFrame

In [19]:
# Load the batting statistics and use the player's name as the row label.
# Names are not unique in this file, so a single label can match several rows.
bb_data = pd.read_csv(filepath_or_buffer="Baseball.csv", index_col="name")

In [20]:
# Every row labelled "C Lee", with all columns kept.
lee = bb_data.loc["C Lee", :]
lee

Unnamed: 0_level_0,team,position,game,at_bat,run,hit,double,triple,home_run,rbi,total_base,walk,strike_out,stolen_base,caught_stealing,obp,slg,bat_avg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
C Lee,HOU,OF,157,605,67,149,29,1,24,89,252,37,59,3,3,0.291,0.417,0.246
C Lee,SEA,P,1,3,0,0,0,0,0,0,0,0,2,0,0,0.0,0.0,0.0


In [21]:
# Several rows share this name, so .loc returns a DataFrame holding all of them.
P_One = bb_data.loc["A Gonzalez"]
# Show only the columns of interest for those rows.
P_One[['team', 'home_run', 'obp']]

Unnamed: 0_level_0,team,home_run,obp
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A Gonzalez,SD,31,0.393
A Gonzalez,TOR,17,0.296
A Gonzalez,ATL,6,0.291
A Gonzalez,WSH,0,0.277


## Performing Descriptive Statistics with Pandas

In [22]:
# pandas is already imported as `pd` in the first code cell, so it is not re-imported here.
# Load the cars data set, using each car's name as the row label.
cars = pd.read_csv("cars.csv", index_col="name")

In [23]:
cars.describe(include='all')

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,20.090625,6.1875,230.721875,146.6875,3.596563,3.21725,17.84875,0.4375,0.40625,3.6875,2.8125
std,6.026948,1.785922,123.938694,68.562868,0.534679,0.978457,1.786943,0.504016,0.498991,0.737804,1.6152
min,10.4,4.0,71.1,52.0,2.76,1.513,14.5,0.0,0.0,3.0,1.0
25%,15.425,4.0,120.825,96.5,3.08,2.58125,16.8925,0.0,0.0,3.0,2.0
50%,19.2,6.0,196.3,123.0,3.695,3.325,17.71,0.0,0.0,4.0,2.0
75%,22.8,8.0,326.0,180.0,3.92,3.61,18.9,1.0,1.0,4.0,4.0
max,33.9,8.0,472.0,335.0,4.93,5.424,22.9,1.0,1.0,5.0,8.0


In [24]:
cars['mpg'].describe()

count    32.000000
mean     20.090625
std       6.026948
min      10.400000
25%      15.425000
50%      19.200000
75%      22.800000
max      33.900000
Name: mpg, dtype: float64

## Individual Stats

In [25]:
cars['mpg'].mean()

20.090624999999996

In [26]:
cars['mpg'].std()

6.026948052089105

## Running a correlation matrix

In [27]:
cars.corr()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
mpg,1.0,-0.852162,-0.847551,-0.776168,0.681172,-0.867659,0.418684,0.664039,0.599832,0.480285,-0.550925
cyl,-0.852162,1.0,0.902033,0.832447,-0.699938,0.782496,-0.591242,-0.810812,-0.522607,-0.492687,0.526988
disp,-0.847551,0.902033,1.0,0.790949,-0.710214,0.88798,-0.433698,-0.710416,-0.591227,-0.555569,0.394977
hp,-0.776168,0.832447,0.790949,1.0,-0.448759,0.658748,-0.708223,-0.723097,-0.243204,-0.125704,0.749812
drat,0.681172,-0.699938,-0.710214,-0.448759,1.0,-0.712441,0.091205,0.440278,0.712711,0.69961,-0.09079
wt,-0.867659,0.782496,0.88798,0.658748,-0.712441,1.0,-0.174716,-0.554916,-0.692495,-0.583287,0.427606
qsec,0.418684,-0.591242,-0.433698,-0.708223,0.091205,-0.174716,1.0,0.744535,-0.229861,-0.212682,-0.656249
vs,0.664039,-0.810812,-0.710416,-0.723097,0.440278,-0.554916,0.744535,1.0,0.168345,0.206023,-0.569607
am,0.599832,-0.522607,-0.591227,-0.243204,0.712711,-0.692495,-0.229861,0.168345,1.0,0.794059,0.057534
gear,0.480285,-0.492687,-0.555569,-0.125704,0.69961,-0.583287,-0.212682,0.206023,0.794059,1.0,0.274073


## DataFrame Column looping

In [28]:
# Iterating over a DataFrame yields its column labels; print each column as a Series.
for col in cars.columns:
    print(cars.loc[:, col])

name
Mazda RX4              21.0
Mazda RX4 Wag          21.0
Datsun 710             22.8
Hornet 4 Drive         21.4
Hornet Sportabout      18.7
Valiant                18.1
Duster 360             14.3
Merc 240D              24.4
Merc 230               22.8
Merc 280               19.2
Merc 280C              17.8
Merc 450SE             16.4
Merc 450SL             17.3
Merc 450SLC            15.2
Cadillac Fleetwood     10.4
Lincoln Continental    10.4
Chrysler Imperial      14.7
Fiat 128               32.4
Honda Civic            30.4
Toyota Corolla         33.9
Toyota Corona          21.5
Dodge Challenger       15.5
AMC Javelin            15.2
Camaro Z28             13.3
Pontiac Firebird       19.2
Fiat X1-9              27.3
Porsche 914-2          26.0
Lotus Europa           30.4
Ford Pantera L         15.8
Ferrari Dino           19.7
Maserati Bora          15.0
Volvo 142E             21.4
Name: mpg, dtype: float64
name
Mazda RX4              6
Mazda RX4 Wag          6
Datsun 710        

## Normalizing DataFrame data

In [29]:
# Standardise every column to a z-score: subtract the column mean, divide by the column
# standard deviation. The column-wise statistics are aligned to `cars` by column label,
# so a single vectorised expression replaces the per-column Python loop.
# The result is a new DataFrame; `cars` itself is left unchanged.
# NOTE: pandas' .std() uses the sample standard deviation (ddof=1) by default.
cars_std = (cars - cars.mean()) / cars.std()

In [30]:
# Preview the first ten standardised rows.
cars_std.head(10)

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Mazda RX4,0.150885,-0.104988,-0.57062,-0.535093,0.567514,-0.6104,-0.777165,-0.868028,1.189901,0.423554,0.735203
Mazda RX4 Wag,0.150885,-0.104988,-0.57062,-0.535093,0.567514,-0.349785,-0.463781,-0.868028,1.189901,0.423554,0.735203
Datsun 710,0.449543,-1.224858,-0.990182,-0.78304,0.474,-0.917005,0.426007,1.116036,1.189901,0.423554,-1.122152
Hornet 4 Drive,0.217253,-0.104988,0.220094,-0.535093,-0.966118,-0.0023,0.890487,1.116036,-0.814143,-0.931819,-1.122152
Hornet Sportabout,-0.230735,1.014882,1.043081,0.412942,-0.835198,0.227654,-0.463781,-0.868028,-0.814143,-0.931819,-0.503034
Valiant,-0.330287,-0.104988,-0.046167,-0.608019,-1.564608,0.248095,1.326987,1.116036,-0.814143,-0.931819,-1.122152
Duster 360,-0.960789,1.014882,1.043081,1.433903,-0.722981,0.360516,-1.124126,-0.868028,-0.814143,-0.931819,0.735203
Merc 240D,0.715018,-1.224858,-0.677931,-1.23518,0.174754,-0.02785,1.203871,1.116036,-0.814143,0.423554,-0.503034
Merc 230,0.449543,-1.224858,-0.725535,-0.75387,0.604919,-0.068731,2.826755,1.116036,-0.814143,0.423554,-0.503034
Merc 280,-0.147774,-0.104988,-0.509299,-0.345486,0.604919,0.227654,0.252526,1.116036,-0.814143,0.423554,0.735203


In [31]:
cars_std.corr()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
mpg,1.0,-0.852162,-0.847551,-0.776168,0.681172,-0.867659,0.418684,0.664039,0.599832,0.480285,-0.550925
cyl,-0.852162,1.0,0.902033,0.832447,-0.699938,0.782496,-0.591242,-0.810812,-0.522607,-0.492687,0.526988
disp,-0.847551,0.902033,1.0,0.790949,-0.710214,0.88798,-0.433698,-0.710416,-0.591227,-0.555569,0.394977
hp,-0.776168,0.832447,0.790949,1.0,-0.448759,0.658748,-0.708223,-0.723097,-0.243204,-0.125704,0.749812
drat,0.681172,-0.699938,-0.710214,-0.448759,1.0,-0.712441,0.091205,0.440278,0.712711,0.69961,-0.09079
wt,-0.867659,0.782496,0.88798,0.658748,-0.712441,1.0,-0.174716,-0.554916,-0.692495,-0.583287,0.427606
qsec,0.418684,-0.591242,-0.433698,-0.708223,0.091205,-0.174716,1.0,0.744535,-0.229861,-0.212682,-0.656249
vs,0.664039,-0.810812,-0.710416,-0.723097,0.440278,-0.554916,0.744535,1.0,0.168345,0.206023,-0.569607
am,0.599832,-0.522607,-0.591227,-0.243204,0.712711,-0.692495,-0.229861,0.168345,1.0,0.794059,0.057534
gear,0.480285,-0.492687,-0.555569,-0.125704,0.69961,-0.583287,-0.212682,0.206023,0.794059,1.0,0.274073
