#########################################
# Pandas – Creating DataFrames
#########################################


In [None]:
# bring the functionality of pandas in
import pandas as pd

In [None]:
# Start with an empty DataFrame: no rows, no columns.
my_df = pd.DataFrame(data=None)
print(my_df)

In [None]:
# Build the DataFrame from a dict: each key becomes a column name and
# each list supplies that column's values.
my_df = pd.DataFrame(
    data={
        "Name": ["Tom", "Dick", "Harry"],
    }
)
my_df

In [None]:
# Add a second field by supplying another key/list pair; all lists must
# have the same length.
my_df = pd.DataFrame(
    data={
        "Name": ["Tom", "Dick", "Harry"],
        "ID": [101, 102, 103],
    }
)
print(my_df)

In [None]:
# Build a DataFrame from a list of rows (one inner list per row).
# No column names are given here, so pandas labels the columns 0, 1, ...
my_list = [
    ["Tom", 101],
    ["Dick", 102],
    ["Harry", 103],
]
my_df = pd.DataFrame(data=my_list)
my_df

In [None]:
# Same row-wise data as before, but this time name the columns explicitly
# instead of accepting the default integer labels.
my_df = pd.DataFrame(
    data=my_list,
    columns=["Name", "ID"],
)
my_df


###############################################
# Pandas – Exploring and Understanding Data
###############################################

In [None]:
# upload a file to the colab environment
# NOTE: `google.colab` is only importable inside a Google Colab runtime;
# this cell fails with an ImportError anywhere else.
# The return value of files.upload() is kept in `uploaded`
# (presumably a mapping of uploaded file names to their contents - verify
# against the google.colab docs before relying on it).
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the "transactions" sheet of the Excel workbook into a DataFrame.
# The workbook must already be present in the working directory.
df = pd.read_excel(
    "grocery_database.xlsx",
    sheet_name="transactions",
)

### Basic Exploration of the Data

In [None]:
# how many rows and columns
# `shape` is an attribute (no parentheses) holding a (rows, columns) tuple
df.shape

In [None]:
# a subset of rows
# A notebook cell only renders the value of its LAST expression, so a bare
# `df.head()` followed by `df.tail()` would silently discard the head.
# Printing each result explicitly makes both previews visible.
print(df.head())  # by default: first 5 rows
print(df.tail())  # by default: last 5 rows

In [None]:
# a subset of rows - first N rows, last N rows
# As with any notebook cell, only the last bare expression would be
# rendered, so print both results to avoid losing the head() output.
print(df.head(20))  # first 20 rows
print(df.tail(15))  # last 15 rows

In [None]:
# Draw one row at random (n=1 is what sample() uses when called bare).
df.sample(n=1)

In [None]:
# Draw N rows at random - here N = 10.
df.sample(n=10)

In [None]:
# Draw a random fraction of the rows instead of a fixed count:
# frac=0.1 returns 10% of the data.
df.sample(frac=0.1)

### Further Exploration of the Data

In [None]:
# summary statistics
# (count, mean, std, min, quartiles and max for the numeric fields)
df.describe()

In [None]:
# Outlier detection - a quick, very basic overview.
# 1. Look at the N rows with the highest values of a given field:
#    here, the 5 largest sales_cost values.
df.nlargest(n=5, columns="sales_cost")

In [None]:
# 2. Look at the N rows with the lowest values of a given field:
#    here, the 5 smallest sales_cost values.
df.nsmallest(n=5, columns="sales_cost")

In [None]:
# Count the distinct values in each field (axis=0 works column by column;
# missing values are not counted by default).
df.nunique(axis=0)

In [None]:
# Count missing (null) values per field: isnull() flags each missing cell
# as True, and summing the booleans gives the number of nulls per column.
df.isnull().sum()

#########################################
# Pandas – Accessing Columns
#########################################


In [None]:
# access (select) a specific field
# Single brackets with one column name give a 1-D Series; double brackets
# (a list of column names) give a 2-D DataFrame, even for a single column.

series = df["customer_id"] # returns a Pandas Series
data_frame = df[["customer_id"]] # returns a Pandas DataFrame


#########################################
# Pandas – Adding and Dropping Columns
#########################################


In [None]:
# create additional field and set a value
# assigning a scalar broadcasts it to every row of the new column
df["store_id"] = 1

In [None]:
# Derive a new field from an existing one.
# Here profit is taken to be 20% of each row's sales cost.
df["profit"] = 0.2 * df["sales_cost"]

In [None]:
# Two-way if/else logic with numpy's "where":
# rows whose sales_cost exceeds 20 are labelled "Large", all others "Small".
import numpy as np

df["sales_type"] = np.where(
    df["sales_cost"].gt(20),
    "Large",
    "Small",
)
df[["sales_cost", "sales_type"]]

In [None]:
# Multi-way if/elif/else logic with numpy's "select".
# Conditions are checked in order and the first match wins, so they must be
# listed from the most restrictive threshold to the least restrictive.
condition_rules = [
    df["sales_cost"].gt(50),  # X-Large
    df["sales_cost"].gt(20),  # Large
    df["sales_cost"].gt(10),  # Medium
]

# Labels paired position-by-position with the conditions above.
outcomes = [
    "X-Large",
    "Large",
    "Medium",
]

# Rows that satisfy none of the conditions fall back to the default label.
df["sales_type"] = np.select(
    condition_rules,
    outcomes,
    default="Small",
)

In [None]:
# Remove a column. drop() returns a new DataFrame and leaves `df` untouched.
# `columns=[...]` is equivalent to passing the labels with axis=1;
# use axis=0 (or `index=[...]`) to remove rows instead.
new_df = df.drop(columns=["sales_cost"])