In [2]:
import pandas as pd
import numpy as np

# Creating a DataFrame
# A DataFrame is a 2-dimensional labeled data structure with columns of potentially different types.
# It is similar to a spreadsheet, a SQL table, or a dictionary of series objects.
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
df = pd.DataFrame(data)

# Displaying the DataFrame
# The 'print' function outputs the DataFrame so you can see its structure and contents.
print("DataFrame:")
print(df)

# Accessing data
# You can access DataFrame data using loc, iloc, and direct indexing of columns.
# 'df['Age']' directly accesses the 'Age' column of the DataFrame.
print("\nAccess specific column (Age):")
print(df['Age'])

# 'df[['Name', 'City']]' accesses multiple columns and returns a new DataFrame containing only those columns.
print("\nAccess multiple columns (Name and City):")
print(df[['Name', 'City']])

# 'loc' is label-based, which means that you have to specify the name of the rows and columns that you need to filter out.
# For example, 'df.loc[2]' returns the row as a Series with label index 2.
print("\nAccess a row by index label using loc (index 2):")
print(df.loc[2])

# 'iloc' is integer index-based. You use integer indices to access rows or columns.
# For example, 'df.iloc[2]' accesses the row at integer index 2 (third row).
print("\nAccess a row by integer location using iloc (index 2):")
print(df.iloc[2])

# Adding a new column
# A new column 'Salary' is added to the DataFrame. This operation assigns the list of salaries to a new column in the DataFrame.
df['Salary'] = [50000, 54000, 50000, 62000, 58000]
print("\nDataFrame with new column 'Salary':")
print(df)

# Basic data manipulation
# 'df['Age'].mean()' computes the mean (average) of the 'Age' column.
mean_age = df['Age'].mean()
print("\nMean Age:", mean_age)

# Filtering data
# This operation filters the DataFrame to include only rows where 'Age' is greater than 30.
# It uses a boolean condition that checks each row and includes it in the result if the condition is true.
print("\nRows where Age is greater than 30:")
print(df[df['Age'] > 30])

# Summarizing data
# 'df.describe()' provides a summary of statistics pertaining to DataFrame columns.
# This includes count, mean, standard deviation, min, quartiles, and max values for numeric columns.
print("\nSummary statistics for numerical columns:")
print(df.describe())

# Handling missing data
# Introduces a missing value (NaN) in the 'Salary' column for demonstration.
df.loc[2, 'Salary'] = np.nan

# The DataFrame is displayed with a missing value in the 'Salary' column.
print("\nDataFrame with a missing 'Salary' value:")
print(df)

# Filling missing data
# 'fillna' replaces all NaN or missing values in the 'Salary' column with the mean of the remaining non-null salaries.
# 'inplace=True' modifies the DataFrame in-place (no need to assign the result to a new DataFrame).
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
print("\nDataFrame after filling missing 'Salary' with mean:")
print(df)


DataFrame:
      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston
4      Eva   45      Phoenix

Access specific column (Age):
0    25
1    30
2    35
3    40
4    45
Name: Age, dtype: int64

Access multiple columns (Name and City):
      Name         City
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago
3    David      Houston
4      Eva      Phoenix

Access a row by index label using loc (index 2):
Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

Access a row by integer location using iloc (index 2):
Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

DataFrame with new column 'Salary':
      Name  Age         City  Salary
0    Alice   25     New York   50000
1      Bob   30  Los Angeles   54000
2  Charlie   35      Chicago   50000
3    David   40      Houston   62000
4      Eva   45      Phoenix   58000

Mean Age: 35.0

Rows where

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)
