<a href="https://colab.research.google.com/github/jugalpanchal/py-pilgrim/blob/main/pd_dt_route.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import numpy as np
import pandas as pd

### 1D Array

In [2]:
# 1D array: three random floats drawn uniformly from [0, 1).
np_array = np.random.rand(3)  # element dtype is float64

# Print the container class first, then the values, one per output line.
print(type(np_array), np_array, sep="\n")

<class 'numpy.ndarray'>
[0.89666508 0.3929426  0.01574883]


### Series

In [3]:
# Wrap the 1D array in a pandas Series. No index is supplied, so pandas
# attaches the default integer labels 0..n-1 to the values.
first_series = pd.Series(data=np_array)

# Class of the object, then the labelled values.
print(type(first_series), first_series, sep="\n")

<class 'pandas.core.series.Series'>
0    0.896665
1    0.392943
2    0.015749
dtype: float64


In [4]:
# Pandas Series with custom string labels instead of the default 0..n-1 index.
second_series = pd.Series(np_array, index=["First", "Second", "Third"])
print(type(second_series))

print(second_series)
# Positional access goes through .iloc. A bare `second_series[1]` on a
# string-labelled Series relies on the deprecated "integer key treated as a
# position" fallback (FutureWarning in pandas 2.x, slated to become a label
# lookup), so it is not safe to teach or depend on.
print('\n', second_series.iloc[1]) # access by position
print('\n', second_series["Second"]) # access by custom label
print('\n', second_series.index)

<class 'pandas.core.series.Series'>
First     0.896665
Second    0.392943
Third     0.015749
dtype: float64

 0.3929426032295781

 0.3929426032295781

 Index(['First', 'Second', 'Third'], dtype='object')


### 2D Array

In [5]:
# 2D array: 3 rows x 2 columns of random floats drawn uniformly from [0, 1).
np_2d_array = np.random.rand(3, 2)

# Class of the object, then the full matrix.
print(type(np_2d_array), np_2d_array, sep="\n")
# NumPy accepts a (row, column) pair directly: third row, second column.
print(np_2d_array[2, 1])

<class 'numpy.ndarray'>
[[0.7627858  0.4968698 ]
 [0.0304669  0.1558961 ]
 [0.67755936 0.60951667]]
0.6095166741349233


In [6]:
# A DataFrame puts row and column labels on top of the raw 2D values.
first_df = pd.DataFrame(data=np_2d_array)
print(type(first_df), first_df, sep="\n")  # the frame prints together with its labels
# NOTE: NumPy-style first_df[2, 1] does not work on a DataFrame; it fails
# with KeyError: (2, 1).

# Show the default column labels, then replace them with readable names.
print(first_df.columns)
first_df.columns = ["First", "Second"]

print(first_df)
second_column = first_df["Second"]  # selecting a single column yields a Series
print(second_column)
# NOTE: first_df[1, "second"] fails the same way - KeyError: (1, 'second').
print(second_column.values[1])  # .values exposes the data as a NumPy array

<class 'pandas.core.frame.DataFrame'>
          0         1
0  0.762786  0.496870
1  0.030467  0.155896
2  0.677559  0.609517
RangeIndex(start=0, stop=2, step=1)
      First    Second
0  0.762786  0.496870
1  0.030467  0.155896
2  0.677559  0.609517
0    0.496870
1    0.155896
2    0.609517
Name: Second, dtype: float64
0.15589610281825028


### Indexing: iat, at, iloc, loc

In [7]:
# Scalar access: iat takes integer positions, at takes labels.
print('2nd row and 2nd col: ', first_df.iat[1, 1])
print('2nd row and 2nd col: ', first_df.at[1, "Second"])

# iloc with a single position returns that whole row as a Series whose index
# is the column names (hence the "Columns:" caption).
print('\nColumns:')
print(first_df.iloc[1])

print('\n')
print('2nd row and 2nd col: ', first_df.iloc[1, 1])
# One .loc[row_label, col_label] lookup instead of chaining iloc[1]["Second"]:
# the chained form builds an intermediate row Series first and is the pattern
# that causes trouble as soon as it is used for assignment.
print('2nd row and 2nd col: ', first_df.loc[1, "Second"])

# loc[rows, cols]  - label based: each selector is a label, a list of labels, a label slice or a boolean mask.
#                    Row labels are the frame's index; it is printed first but is not counted as a positional column.
# iloc[rows, cols] - position based (0-based integers): an int, a list of ints, an int slice or a boolean array.
#                    Column names are NOT accepted by iloc.
#
# Selector forms:
#   single: 2
#   slice:  [start:stop] - stop is excluded by iloc but included by loc.
#   all:    [:] - every row/column.
#   list:   [3, 6, 8, 103]

2nd row and 2nd col:  0.15589610281825028
2nd row and 2nd col:  0.15589610281825028

Columns:
First     0.030467
Second    0.155896
Name: 1, dtype: float64


2nd row and 2nd col:  0.15589610281825028
2nd row and 2nd col:  0.15589610281825028


### Create a dataframe

In [26]:
# Each tuple is one row: (coffee name, price, quantity).
records = [
    ("Espresso", 5, 4),
    ("Brew", 4, 6),
    ("Flat White", 3, 7),
]

# Without column names the frame gets integer column labels 0, 1, 2.
df2 = pd.DataFrame.from_records(records)
print(df2, '\n', sep='\n')

# Same rows, this time with explicit column names.
df3 = pd.DataFrame.from_records(data=records, columns=["Coffee", "Price", "Qty"])
print(df3)

            0  1  2
0    Espresso  5  4
1        Brew  4  6
2  Flat White  3  7


       Coffee  Price  Qty
0    Espresso      5    4
1        Brew      4    6
2  Flat White      3    7


### Filter Columns and Rows

In [51]:
# A single label selects one column and returns a Series.
print(df3["Coffee"], '\n', sep='\n')

# A list of labels selects several columns and returns a DataFrame.
print(df3[["Coffee", "Qty"]], '\n', sep='\n')

# A boolean mask keeps only the rows where it is True. This is the cell's
# last expression, so the notebook renders the resulting frame.
is_brew = df3["Coffee"] == "Brew"
df3.loc[is_brew]

0      Espresso
1          Brew
2    Flat White
Name: Coffee, dtype: object


       Coffee  Qty
0    Espresso    4
1        Brew    6
2  Flat White    7




Unnamed: 0,Coffee,Price,Qty,Total
1,Brew,4,6,24


### New Column

In [27]:
# Row-wise total = price x quantity. Non-numeric values would have to be
# converted to numbers before multiplying.
total = df3["Price"].mul(df3["Qty"])
# assign() returns a new frame carrying the extra column; rebind df3 to it.
df3 = df3.assign(**{"Total": total})
print(df3)

       Coffee  Price  Qty  Total
0    Espresso      5    4     20
1        Brew      4    6     24
2  Flat White      3    7     21


### Math Methods

In [33]:
# Select the Total column once and reuse it for both statistics.
totals = df3['Total']

print('Max Value:            ', totals.max())

# idxmax() gives the row label of the maximum, which can then be used to fetch the entire row.
print('Max Value Label/Index:', totals.idxmax())


Max Value:             24
Max Value Label/Index: 1


### Replace anomalies with NaN



In [10]:
# Raw prices arrive as strings; some are malformed ("5A", "5D") and one is the
# literal text "NaN".
records = [("Espresso", "5"),
           ("Cappuccino", "5A"),
           ("Americano", "NaN"),
           ("Brew", "5D"),
           ("Flat White", "10")]
df4 = pd.DataFrame.from_records(records, columns=["Coffee", "Price"])
print(df4)
print('\n')

# errors='coerce' turns every value that cannot be parsed as a number into NaN.
# Replace the whole column (df4['Price'] = ...) rather than writing through
# df4.loc[:, 'Price'] = ...: the .loc form sets values into the existing
# column, which can keep its original non-numeric dtype even though the
# stored values are floats.
df4['Price'] = pd.to_numeric(df4['Price'], errors='coerce')
print(df4)

       Coffee Price
0    Espresso     5
1  Cappuccino    5A
2   Americano   NaN
3        Brew    5D
4  Flat White    10


       Coffee  Price
0    Espresso    5.0
1  Cappuccino    NaN
2   Americano    NaN
3        Brew    NaN
4  Flat White   10.0
