# Pandas


In [None]:
# Importing the pandas package

import pandas as pd

## DataFrames and Series

- A DataFrame is similar to an Excel sheet: a table with labeled rows and columns.
- Series are similar to a single column in an Excel sheet.


In [None]:
# Sample sales data: one list of quantities per fruit, and the customer names
# that are later used as row labels

customers = ["June", "Robert", "Lily", "David"]

sales_data = {
    "apple": [3, 2, 0, 1],
    "orange": [0, 3, 7, 2],
    "banana": [1, 0, 2, 4],
}

In [None]:
# Create a DataFrame from the dictionary 'sales_data'
# (each dictionary key becomes a column; no row labels are passed here)

sales = pd.DataFrame(sales_data)
display(sales)

In [None]:
# Create a DataFrame whose rows are labeled with the customer names.
# The row index is built first, already carrying its name "Customers".

customer_index = pd.Index(customers, name="Customers")
sales = pd.DataFrame(sales_data, index=customer_index)
display(sales)

In [None]:
# Create a Series by selecting a single column of the DataFrame by name

sales_apple = sales["apple"]
display(sales_apple)

## Importing and Exporting Files


In [None]:
# If using Google Colab, uncomment these lines
# from google.colab import drive
# drive.mount('/content/drive')


In [None]:
# If using Google Colab, uncomment these lines
# import os
# os.chdir('/content/drive/MyDrive/Colab Notebooks') # Change to your working directory
# !ls  # List files in the current directory to verify path change

In [None]:
# Read a DataFrame from a CSV file, using its "Customers" column as the row index

sales_csv = pd.read_csv("sales_data.csv", index_col="Customers")
display(sales_csv)

In [None]:
# Read a DataFrame from an Excel file, using its "Customers" column as the row index

sales_excel = pd.read_excel("sales_data.xlsx", index_col="Customers")
display(sales_excel)

In [None]:
# Export the 'sales' DataFrame to a CSV file and to an Excel file
# (written to the current working directory, since the paths are relative)

sales.to_csv("sales_data_exported.csv")
sales.to_excel("sales_data_exported.xlsx")

## Slicing DataFrames and Series

Difference between loc and iloc

- .loc is used for label-based indexing, which means you specify the names (labels) of the rows and columns that you want to select. When slicing with labels, the end label is included.
- .iloc is used for positional indexing, which means you have to specify rows and columns by their integer position values (0-based index).


In [None]:
# Selecting from a DataFrame by row/column labels using .loc

# Method 1: Single row (selected by its label)
display(sales.loc["June"])

# Method 2: Slicing rows
display(sales.loc["June":"Lily"])  # Notice that the end label ("Lily") is included.

# Method 3: Slicing rows and selecting a list of columns
display(sales.loc["June":"Lily", ["apple"]])

# Method 4: Slicing both rows and columns
display(sales.loc["June":"Lily", "apple":"orange"])

# Method 5: Selecting specific rows and columns with lists of labels
display(sales.loc[["June", "David"], ["orange", "banana"]])

In [None]:
# Selecting from a DataFrame by row/column positions using .iloc

# Method 1: Single row
display(sales.iloc[0])  # First row

# Method 2: Slicing rows
display(sales.iloc[0:3])  # First three rows (positions 0, 1 and 2)

# Method 3: Slicing rows and selecting a list of column positions
display(sales.iloc[0:3, [0]])  # First three rows, first column

# Method 4: Slicing both rows and columns
display(sales.iloc[0:3, 0:2])  # First three rows, first two columns

# Method 5: Selecting specific rows and columns
display(sales.iloc[[0, 3], [1, 2]])  # First and fourth rows, second and third columns

In [None]:
# If you only need certain columns (and all rows), pass a list of column names directly
display(sales[["apple", "banana"]])

In [None]:
# Slicing the Series by index labels using .loc

sales_apple.loc["June":"Lily"]

In [None]:
# Slicing the Series by index positions using .iloc

sales_apple.iloc[0:3]

## IMDB Movie Dataset


In [None]:
# Load the movie dataset from a local CSV file, using the "Title" column as the row index
movies = pd.read_csv("IMDB-Movie-Data.csv", index_col="Title")
display(movies)

# Alternatively, load the same file from this URL instead of a local copy.
# url = 'https://raw.githubusercontent.com/im-data-68/lectures/refs/heads/main/src/T02_basic_python_part_2/exercises/IMDB-Movie-Data.csv'
# movies = pd.read_csv(url, index_col="Title")

In [None]:
# If using Google Colab, you can use this library for nice data viewing. Uncomment the following lines
# from google.colab import data_table
# data_table.enable_dataframe_formatter()
# movies

In [None]:
# Display the first 5 rows of the DataFrame (5 is the default when head() gets no argument)

movies.head()

In [None]:
# Display the last 5 rows of the DataFrame (5 is the default when tail() gets no argument)

movies.tail()

In [None]:
# Print a summary of the DataFrame: columns, non-null counts and data types

movies.info()

In [None]:
# Display the shape of the DataFrame as (number of rows, number of columns)

movies.shape

In [None]:
# Display the column names of the DataFrame

movies.columns

In [None]:
# Renaming selected columns
# rename() returns a new DataFrame; assigning the result back to 'movies'
# (instead of using inplace=True) keeps the step explicit and chainable.

movies = movies.rename(
    columns={
        "Runtime (Minutes)": "Runtime_Minutes",
        "Revenue (Millions)": "Revenue_Millions",
    }
)

movies.columns

In [None]:
# You can set all column names at once.
# The names are assigned by position, so the list must contain exactly one
# name per existing column, in the same order as the current columns.
new_column_names = [
    "rank",
    "genre",
    "description",
    "director",
    "actors",
    "year",
    "runtime_minutes",
    "rating",
    "votes",
    "revenue_millions",
    "metascore",
]

movies.columns = new_column_names

In [None]:
# Instead of typing every new name by hand, derive the new names from the
# existing ones with a loop (here: convert each column name to lowercase)

c = movies.columns
cnew = []  # collects the lowercase version of each column name
for name in c:
    cnew.append(name.lower())

movies.columns = cnew
print(movies.columns)

In [None]:
# The same lowercasing written as a list comprehension over the column names
movies.columns = [column_name.lower() for column_name in movies.columns]
print(movies.columns)

In [None]:
# Display basic statistical details (percentiles, mean, std, etc.) of a DataFrame
movies.describe()


In [None]:
# Display basic statistical details of a specific column. A single column is a Series.
movies["genre"].describe()

In [None]:
# Summarize the values of a specific column.
# A notebook cell only renders its last expression automatically, so the
# intermediate results are passed to display() to make every result visible.
genre_counts = movies["genre"].value_counts()  # computed once, reused below

display(movies["genre"].nunique())  # number of distinct values in the column
display(genre_counts)  # how many times each value occurs
genre_counts.head(10)  # first ten entries of the counts

In [None]:
# Inspect specific columns by passing a list of column names
movies[["genre", "rating"]]

In [None]:
# Inspect a specific row by its index label using .loc
movies.loc["Prometheus"]


In [None]:
# Inspect specific rows by position using .iloc (positions 1, 2 and 3; the end position 4 is excluded)
movies.iloc[1:4]

In [None]:
# Filtering DataFrame using a condition
# A notebook cell only renders its last expression automatically, so the
# intermediate results are passed to display() to make every step visible.

# Step 1: the comparison produces a boolean Series with one value per row
filt = movies["director"] == "Ridley Scott"
display(filt.head())

# Step 2: indexing with the boolean Series keeps only the rows where it is True
display(movies[filt])

# The same filter written in a single line
movies[movies["director"] == "Ridley Scott"]

In [None]:
# Filtering DataFrame using multiple conditions
# Each condition is a boolean Series; '&' combines them element-wise (logical AND)
filt1 = movies["director"] == "Ridley Scott"
filt2 = movies["rating"] >= 8.6
movies[filt1 & filt2]

In [None]:
# Using .isin() to keep rows whose value matches any entry of a list
filt = movies["director"].isin(["Christopher Nolan", "Ridley Scott"])
movies[filt]

In [None]:
# Filtering movies released between 2005 and 2010 (inclusive) with a rating above 8.0
# and revenue below the 25th percentile of all revenues

filt1 = movies["year"] >= 2005
filt2 = movies["year"] <= 2010
filt3 = movies["rating"] > 8.0
filt4 = movies["revenue_millions"] < movies["revenue_millions"].quantile(0.25)

movies[filt1 & filt2 & filt3 & filt4]

In [None]:
# Create a new column name "rating_category" based on existing "rating" column
def rating_function(x):
    """Return "good" when the rating x is at least 8.0, otherwise "bad"."""
    if x >= 8.0:
        return "good"
    # Anything that does not satisfy the comparison above is labeled "bad"
    return "bad"


# Call rating_function on every value of the "rating" column and store the results
movies["rating_category"] = movies["rating"].apply(rating_function)

In [None]:
# Shorter way: the same good/bad rule written inline as a lambda function

movies["rating_category"] = movies["rating"].apply(
    lambda x: "good" if x >= 8.0 else "bad"
)