# Disclaimer: 
This is not a comprehensive notebook and it wasn't designed to be a comprehensive notebook.

This just serves as a quick outline/guide to using pandas. 

In [None]:
# You may also watch this YouTube video which I personally used to study pandas

# https://youtu.be/vmEHCJofslg

# Setting Up

In [None]:
# Install pandas if you don't have it yet
!pip install pandas 

In [None]:
# pandas library, documentation

# https://pandas.pydata.org/docs/user_guide/index.html#user-guide

# access this to read and understand the different functions in the pandas library

In [3]:
# Importing the library
import pandas as pd

In [None]:
# Load a CSV file and open it as a DataFrame
df = pd.read_csv("filename.csv") # specify the delimiter with sep=... if needed
# Other formats may need a different reader, e.g. pd.read_excel() for xlsx files;
# delimited txt files can usually be read with pd.read_csv() and a suitable sep

df.head() # shows the first 5 rows -- pass a number, .head(n), to show the first n rows

# df.tail() shows the last 5 rows

In [None]:
# Take note that a DataFrame is like the "table" of your data
# A pandas series is like a "column" of data ("series" / "sequence" of data)

# Accessing Data

In [None]:
# Read headers (this lists all the column headers; the result is a pandas Index, not a Series)
df.columns

In [None]:
# Access a single column -- the result is a pandas Series
df["column1"] # Similar to looking up a key in a dict


In [None]:
# Access multiple columns by passing a list of column headers instead of a single header
# (note the double brackets; the result is a DataFrame)
df[["column1", "column2"]]

In [None]:
# Access rows by position (iloc = integer location)
df.iloc[0:2] # works like list slicing: rows at positions 0 and 1 (the end position is excluded)

In [None]:
# You can mix both to specify column + row --> this selects the column first, and then slices its rows by index label
# Note: unlike list slicing, a label slice includes the end label ("row_end" is part of the result)
# This chained form is fine for reading; to assign values, use df.loc["row_start":"row_end", "column1"] = ... instead
df["column1"]["row_start":"row_end"] # again, very similar to slicing

In [None]:
# You can access a specific data point using .iloc[row_num, col_num]
df.iloc[5,4] # this gives the 6th row, 5th column (positions 5 and 4)
# remember that indices start at 0

In [None]:
# You can access specific subsets of data by label using .loc[]
# (square brackets -- .loc is an indexer, not a function call)
df.loc["row_index", "col_name"] # row label first, then column name


# Filtering Data

In [None]:
# The .loc indexer also accepts a Boolean array (a list of True/False values, one per row)
# NOTE: this reassigns df to its first 3 rows, so every later cell sees only those rows
df = df.iloc[0:3]

df.loc[[True, False, True]] # This keeps the 1st and 3rd rows (positions 0 and 2) since they're the ones that are True
# For this, you can also just do:        df[[True, False, True]]
# i.e. the same Boolean list, without .loc


In [18]:
df

Unnamed: 0,OrderNum,Product
0,1000,carrot
1,1000,eggplant
2,1001,banana


In [20]:
# You can use comparison operators (==, >=, <=, etc.) as a shortcut to build a Boolean array

df.loc[df["col1"]=="some_string"]

# .loc is not required here -- plain brackets with the same condition select the same rows
# (only the last expression in a cell is displayed, so this cell shows the result of the line below)

df[df["col1"]=="some_string"]

Unnamed: 0,OrderNum,Product
0,1000,carrot
1,1000,eggplant
2,1001,banana


In [None]:
# Quick reminder
# Plain assignment (new_df = old_df) does not duplicate the data: both names refer to the same DataFrame,
# so changes made through one name show up under the other.
# Use .copy() when you want an independent DataFrame to work on.

new_df = old_df.copy()

# Some Built-in Functions and Vectorization

In [None]:
# Vectorization is when columns are seen as a "single unit"
# For us, what this means is that we don't need to iterate through each row within a column
# Rather, it allows us to execute our functions (calculations, data manipulations, etc.) in a more efficient manner

In [None]:
# This is BAD and SLOW: it visits the prices one at a time in a Python loop
peso_prices = []

for price in df['Price']:
    peso_prices.append(price * 55)  # 55 is presumably the exchange rate to pesos -- adjust as needed

# Re-assign the Price column to update it
df['Price'] = peso_prices

In [None]:
# This is vectorized: the whole column is multiplied in a single operation, with no explicit loop
# (run either this cell or the loop version above, not both -- each one multiplies Price by 55 again)
df['Price'] = df['Price'] * 55

In [None]:
# This is also bad: another element-by-element Python loop, this time calling .upper() on each product name
capitalized_product_names = []

for product_name in df['Product']:
    capitalized_product_names.append(product_name.upper())

# Re-assign the Product column with the upper-cased names
df['Product'] = capitalized_product_names

In [None]:
# This is better: the .str accessor applies the string method to the whole column at once
df['Product'] = df['Product'].str.upper()

In [None]:
# Usually, there are functions or a combination of functions which can already 
# execute and manipulate the data in the way you want it to
# The challenge here is being able to know which one to use and how to use it properly

# Custom Functions - .apply()

In [None]:
# If there are no specific built-in functions yet, you can use the .apply() 
# paired together with Functions or Lambda Functions


In [None]:
# The functions being referred to here are the regular named functions we already know (defined with def)

def Func(x):
    """Template for a custom function: take one value and return a result.

    As written, the argument is handed back unchanged; replace the
    placeholder line with your own logic.
    """
    result = x  # placeholder: some code here
    return result

def squared(x):
    """Return ``x`` raised to the power of 2."""
    return pow(x, 2)

In [None]:
# Lambda functions are one-off "shortcuts" for defining a small function inline,
# similar to how comprehensions are a compact way to build lists, dicts, and sets
# (or tuples, via tuple(...) around a generator expression)
# .apply() calls the lambda once per value; for plain squaring, df["column1"] ** 2 is the vectorized equivalent

df["column1"].apply(lambda x: x**2)

![image.png](attachment:image.png)

# Final Comments

In [None]:
# For me personally, pandas is one of the most useful libraries for data analysis in Python
# The real challenge is the syntax, which takes a bit of getting used to
# But, I am encouraging you to utilize all the resources available to you
# including but not limited to – the Notion link, the documentation website, 
# StackOverflow, ChatGPT, the YouTube video (Keith Galli), etc.

In [None]:
# https://pandas.pydata.org/docs/user_guide/index.html#user-guide
# https://youtu.be/vmEHCJofslg

# Dumping the pandas documentation and Keith Galli's YT video again hehe


In [None]:
# My personal pandas learning journey was mostly through watching Keith Galli's videos and applying what I learned
# Afterwards, StackOverflow and the documentation website were my best friends in understanding everything
# At the end of the day, I still go back to the documentation frequently when using pandas
# AND THAT'S FINE! Coding is hard, but that's why documentation and collaborative work exist!
# So just keep trying! You'll soon get this. I'm sure! ^^

In [None]:
# Good luck and God bless! ^^