## Data Manipulation with Pandas
Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame. DataFrames are essentially multidimensional arrays with attached row and column labels, and often with heterogeneous types and/or missing data.

In [4]:
import numpy as np
import pandas as pd
# Show which pandas version this notebook was run with.
pd.__version__   

'2.2.1'

### Pandas objects

1. Pandas `Series`
1. Pandas `DataFrame`

In [3]:
# Build a Series from a plain Python list. No index is given, so the
# entries are labelled 0, 1, 2, 3 (see the output below).
data = pd.Series(
    [0.25, 0.50, 0.75, 0.10],
)
data

0    0.25
1    0.50
2    0.75
3    0.10
dtype: float64

In [7]:
# Build the Series from a NumPy array: four evenly spaced values from 0.25 to 1.
data = pd.Series(np.linspace(0.25, 1, 4))
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [10]:
# Values and index attributes: `.values` holds the data, `.index` holds the labels

print(
    f'Values: {data.values}',
    f'Index: {data.index}',
    sep='\n',
)

Values: [0.25 0.5  0.75 1.  ]
Index: RangeIndex(start=0, stop=4, step=1)


In [12]:
# Accessing by index: show the whole Series, then the single entry at index 1

print(data, data[1], sep='\n')

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
0.5


In [13]:
# Defining the index explicitly: one string label per value

data = pd.Series(np.linspace(0.25, 1, 4), index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [17]:
# A Series can be thought of as a specialized dictionary:
# the dict keys become the index labels and the dict values become the data.

population_dic = {
    'California': 38_332_521,
    'Texas': 26_448_193,
    'New York': 19_651_127,
    'Florida': 19_552_860,
    'Illinois': 12_882_135,
}
population = pd.Series(data=population_dic)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [18]:
# Dictionary-style lookup: fetch a single value by its index label
population['California']

38332521

In [19]:
population['California':'Illinois'] # Slicing by label; note the end label 'Illinois' is included in the result

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [22]:
# Constructing a Series without a dictionary: a scalar is repeated for every index label

pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

### Pandas DataFrame Object

In [23]:
# Area values keyed by state name (same keys as the population dictionary)
area_dict = {
    'California': 423_967,
    'Texas': 695_662,
    'New York': 141_297,
    'Florida': 170_312,
    'Illinois': 149_995,
}
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [24]:
# Combine the two Series into one DataFrame: each Series becomes a column
# and the state names they share become the row index.
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [28]:
# Index and columns attributes: the row labels and the column labels

print(
    f'Index attribute: {states.index} \n',
    f'Columns attribute: {states.columns}',
    sep='\n',
)

Index attribute: Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object') 

Columns attribute: Index(['population', 'area'], dtype='object')


In [29]:
# Accessing by column: indexing with a column name returns that column as a Series
states['area']


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [30]:
# A single-column DataFrame can also be built from one Series
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135
