In [1]:
import pandas as pd

# Build a three-row example frame; the default integer index (0, 1, 2) is
# what the row labels below refer to.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)
df = pd.DataFrame(data)

# .at[row_label, column_label] reads exactly one cell.
# Row label 1 is Bob, so this yields his original age, 30.
age_of_bob = df.at[1, 'Age']

# The same accessor on the left-hand side overwrites that one cell in place:
# Bob's age goes from 30 to 32.
df.at[1, 'Age'] = 32

# Last expression of the cell: show the frame after the update
df


Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,32,Los Angeles
2,Charlie,35,Chicago


##### Usage of `sep`

In [None]:
# `sep` names the field delimiter. Each call below rebinds `df`, so only the
# last file read is left in `df` once the cell has run.

# Semicolon-delimited file
df = pd.read_csv(
    'example_semicolon.csv',
    sep=';',
)

# Tab-delimited file (TSV)
df = pd.read_csv(
    'example_tab.tsv',
    sep='\t',
)

# File delimited by a single space
df = pd.read_csv(
    'example_space.csv',
    sep=' ',
)

##### Customize headers

In [None]:
import pandas as pd

# Default behaviour: with no `header` argument, the first line of the file
# supplies the column names.
# Assumed contents of example.csv:
#   Name,Age,City
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
df = pd.read_csv(
    'example.csv',
)

# Expected display of `df` for the file above:
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago


# File without a header line: header=None makes read_csv treat the first line
# as data and label the columns with integers (0, 1, 2, ...).
# Assumed contents of example_no_header.csv:
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
df_no_header = pd.read_csv(
    'example_no_header.csv',
    header=None,
)

# Expected display of `df_no_header` (integer column labels):
#          0   1            2
# 0    Alice  25     New York
# 1      Bob  30  Los Angeles
# 2  Charlie  35      Chicago


# Header on a later line: `header=<n>` picks the 0-based line that holds the
# column names. Here the names sit on the second line, hence header=1; the
# line above it does not appear in the result.
# Assumed contents of example_custom_header.csv:
#   Data,Data,Data
#   Name,Age,City
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
df_custom_header = pd.read_csv(
    'example_custom_header.csv',
    header=1,
)

# Expected display of `df_custom_header` (second line used as header):
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago


# Supplying your own column names: `names=[...]` sets the labels, and
# header=None states that the file itself has no header line to consume.
# Assumed contents of example_no_header.csv:
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
df_custom_names = pd.read_csv(
    'example_no_header.csv',
    header=None,
    names=['Name', 'Age', 'City'],
)

# Expected display of `df_custom_names` (labels come from `names`):
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob   30  Los Angeles
# 2  Charlie   35      Chicago


# Multi-row headers: passing a list such as header=[0, 1] combines those lines
# into multi-level (MultiIndex) column labels.
# Assumed contents of example_multi_header.csv:
#   Metric,Metric,Metric
#   Name,Age,City
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
df_multi_header = pd.read_csv(
    'example_multi_header.csv',
    header=[0, 1],
)

# Expected display of `df_multi_header` (two header levels; layout is approximate):
#     Metric
#       Name Age         City
# 0    Alice  25     New York
# 1      Bob  30  Los Angeles
# 2  Charlie  35      Chicago


##### Skip rows and usecols

In [None]:
import pandas as pd

# Example 1: Skip leading lines, then keep a subset of columns
# Assumed contents of example_skip_and_select.csv:
#   # Metadata row 1
#   # Metadata row 2
#   Name,Age,City,Country,Phone
#   Alice,25,New York,USA,1234567890
#   Bob,30,Los Angeles,USA,2345678901
#   Charlie,35,Chicago,USA,3456789012
# skiprows=2 drops the first two lines so the third line becomes the header;
# usecols then keeps only the named columns.
df = pd.read_csv(
    'example_skip_and_select.csv',
    skiprows=2,
    usecols=['Name', 'City'],
)

# Expected display of `df`:
#       Name         City
# 0    Alice     New York
# 1      Bob  Los Angeles
# 2  Charlie      Chicago


# Example 2: Skipping Comment Lines and Selecting Columns
# CSV file content:
# # Comment: This is a comment
# # Comment: Another comment
# Name,Age,City,Country,Phone
# Alice,25,New York,USA,1234567890
# Bob,30,Los Angeles,USA,2345678901
# Charlie,35,Chicago,USA,3456789012

# Load the CSV, ignoring the comment lines and selecting specific columns.
# A callable given to `skiprows` is called with each row's integer index, not
# with the text of the line, so it cannot test what a line starts with
# (calling .startswith() on an int raises AttributeError). Lines that begin
# with a marker character are dropped with `comment=` instead.
# Assumes the comment lines start with '#', as in the sample above.
df = pd.read_csv(
    'example_skip_comments_and_select.csv',
    comment='#',
    usecols=['Name', 'City'],
)

# Display the DataFrame
# df
# Output:
#       Name         City
# 0    Alice     New York
# 1      Bob  Los Angeles
# 2  Charlie      Chicago


# Example 3: Skipping Rows by Index and Selecting Columns
# CSV file content (0-based file line number shown on the left):
#   line 0: Skip,Keep,Skip
#   line 1: 1,Name,Age,City
#   line 2: 2,Alice,25,New York
#   line 3: 3,Bob,30,Los Angeles
#   line 4: 4,Charlie,35,Chicago

# Load the CSV, skipping specific rows and selecting specific columns.
# A list passed to `skiprows` holds 0-based line numbers of the file, counted
# before the header is chosen. [0, 2] therefore drops the "Skip,Keep,Skip"
# line and Alice's row; line 1 becomes the header and Bob and Charlie remain.
df = pd.read_csv(
    'example_skip_by_index_and_select.csv',
    skiprows=[0, 2],
    usecols=['Name', 'City'],
)

# Display the DataFrame
# df
# Output:
#       Name         City
# 0      Bob  Los Angeles
# 1  Charlie      Chicago


##### Date conversion and merged cells

In [None]:
import pandas as pd

# Example 1: Dates recognised while loading an Excel sheet
# Assumed sheet contents of example_dates.xlsx:
#   Name,Date of Birth
#   Alice,01/15/1990
#   Bob,12/22/1985
#   Charlie,07/30/1978

# Plain read_excel call with no date-related arguments.
# NOTE(review): the datetime64 result below presumes the 'Date of Birth' cells
# are stored as real Excel dates rather than as text - confirm against the file.
df = pd.read_excel(
    'example_dates.xlsx',
)

# Expected result of print(df) followed by print(df.dtypes):
#       Name Date of Birth
# 0    Alice    1990-01-15
# 1      Bob    1985-12-22
# 2  Charlie    1978-07-30
# Data types:
# Name                     object
# Date of Birth    datetime64[ns]
# dtype: object


# Example 2: Converting a column to datetime explicitly
# Assumed sheet contents of example_dates.xlsx:
#   Name,Date of Birth
#   Alice,01/15/1990
#   Bob,12/22/1985
#   Charlie,07/30/1978

# Load the sheet with parse_dates=False.
# NOTE(review): False is the documented default of `parse_dates`, so this call
# should read the file the same way as a call without the argument; cells
# stored as real Excel dates may already arrive as datetimes - confirm.
df = pd.read_excel(
    'example_dates.xlsx',
    parse_dates=False,
)

# Convert 'Date of Birth' with an explicit month/day/year format
birth_dates = df['Date of Birth']
df['Date of Birth'] = pd.to_datetime(birth_dates, format='%m/%d/%Y')

# Expected result of print(df) followed by print(df.dtypes):
#       Name Date of Birth
# 0    Alice    1990-01-15
# 1      Bob    1985-12-22
# 2  Charlie    1978-07-30
# Data types:
# Name                     object
# Date of Birth    datetime64[ns]
# dtype: object


# Example 3: Handling Merged Cells
# Example Excel file content ('North' is one cell merged over the Alice and Bob rows):
# Region,Name,Sales
# North, Alice, 200
#       Bob,   150
# South, Charlie, 300

# Load Excel file with merged cells
df = pd.read_excel('example_merged_cells.xlsx', header=0)

# Only the first cell of a merged range carries the value; the remaining cells
# of the range are read as NaN. Forward-fill 'Region' so every row gets the
# region of the merged block it belongs to (Bob -> North).
df['Region'] = df['Region'].ffill()

# Display the DataFrame
# print(df)
# Output:
#   Region     Name  Sales
# 0  North    Alice    200
# 1  North      Bob    150
# 2  South  Charlie    300


##### Using na_values and keep_default_na

In [None]:
import pandas as pd

# Example 1: One extra missing-value marker
# Assumed contents of example_na_values.csv:
#   Name,Age,City
#   Alice,25,New York
#   Bob,N/A,Los Angeles
#   Charlie,35,Chicago
# `na_values` lists strings to read as NaN in addition to the defaults.
df = pd.read_csv(
    'example_na_values.csv',
    na_values='N/A',
)

# Expected result of print(df); Age is float because it contains a NaN:
#       Name   Age         City
# 0    Alice  25.0     New York
# 1      Bob   NaN  Los Angeles
# 2  Charlie  35.0      Chicago


# Example 2: Several missing-value markers at once
# Assumed contents of example_multiple_na_values.csv:
#   Name,Age,City
#   Alice,25,New York
#   Bob,NA,Los Angeles
#   Charlie,--,Chicago
# A list lets more than one string count as missing; both 'NA' and '--'
# are read as NaN here.
df = pd.read_csv(
    'example_multiple_na_values.csv',
    na_values=['NA', '--'],
)

# Expected result of print(df):
#       Name   Age         City
# 0    Alice  25.0     New York
# 1      Bob   NaN  Los Angeles
# 2  Charlie   NaN      Chicago


# Example 3: Per-column missing-value markers
# Assumed contents of example_dict_na_values.csv:
#   Name,Age,City
#   Alice,25,New York
#   Bob,N/A,Los Angeles
#   Charlie,,Chicago
# A dict maps a column name to the marker(s) that apply to that column only.
df = pd.read_csv(
    'example_dict_na_values.csv',
    na_values={'Age': 'N/A', 'City': ''},
)

# Expected result of print(df):
#       Name   Age         City
# 0    Alice  25.0     New York
# 1      Bob   NaN  Los Angeles
# 2  Charlie   NaN      Chicago


# Example 4: Custom marker on top of the default NA markers
# Assumed contents of example_default_na_values.csv:
#   Name,Age,City
#   Alice,25,New York
#   Bob,N/A,Los Angeles
#   Charlie,,Chicago
# keep_default_na=True (the default) keeps the built-in markers active
# alongside `na_values`, so the empty Age field is read as NaN too.
df = pd.read_csv(
    'example_default_na_values.csv',
    na_values='N/A',
    keep_default_na=True,  # this is the default
)

# Expected result of print(df):
#       Name   Age         City
# 0    Alice  25.0     New York
# 1      Bob   NaN  Los Angeles
# 2  Charlie   NaN      Chicago


# Example 5: Not Keeping Default NA Values
# CSV file content:
# Name,Age,City
# Alice,25,New York
# Bob,N/A,Los Angeles
# Charlie,,Chicago

# With keep_default_na=False only the strings listed in `na_values` are read
# as missing. 'N/A' still becomes NaN, but the empty Age field for Charlie is
# no longer a missing-value marker: it is kept as an empty string, so the Age
# column can no longer be numeric and is read as object (text) instead.
df = pd.read_csv(
    'example_default_na_values.csv',
    na_values='N/A',
    keep_default_na=False,
)

# Display the DataFrame
# print(df)
# Output (Age is object dtype; Charlie's Age is an empty string, not NaN):
#       Name  Age         City
# 0    Alice   25     New York
# 1      Bob  NaN  Los Angeles
# 2  Charlie           Chicago


##### Chunksize usage

In [None]:
import pandas as pd

# Example 1: Reading a file piece by piece
# Assumed contents of large_data.csv (a large dataset):
#   Name,Age,City
#   Alice,25,New York
#   Bob,30,Los Angeles
#   Charlie,35,Chicago
#   ...
chunk_size = 2  # rows handed back per iteration

# Every chunk is kept here so the pieces can be stitched together afterwards
chunks = []

# With `chunksize` set, read_csv yields one DataFrame per iteration rather
# than a single frame for the whole file.
for chunk in pd.read_csv('large_data.csv', chunksize=chunk_size):
    print(chunk)       # per-chunk processing goes here (this demo just prints)
    chunks += [chunk]  # keep the chunk for the final concatenation

# Rebuild one frame from the collected chunks, renumbering rows from 0
df = pd.concat(objs=chunks, ignore_index=True)
# print(df) would show a DataFrame with all rows from the CSV file


# Example 2: Aggregating across chunks without holding the whole file
# Goal: the mean of the 'Age' column of large_data.csv
chunk_size = 2

# Running totals, updated once per chunk
total_age = 0
total_count = 0

for chunk in pd.read_csv('large_data.csv', chunksize=chunk_size):
    ages = chunk['Age']
    total_age += ages.sum()      # sum of the ages in this chunk
    total_count += ages.count()  # number of non-missing ages in this chunk

# Overall mean = total of all ages / number of ages counted
average_age = total_age / total_count
print(f'Average Age: {average_age}')
# Prints: Average Age: <computed_value>


# Example 3: Writing processed chunks to an Excel workbook
# Keep only the rows with Age > 30 and store each chunk's result on its own sheet
chunk_size = 2

# The ExcelWriter context manager saves 'processed_data.xlsx' when the block exits
with pd.ExcelWriter('processed_data.xlsx') as writer:
    # enumerate() supplies the running chunk number used in the sheet name
    for i, chunk in enumerate(pd.read_csv('large_data.csv', chunksize=chunk_size)):
        # Boolean mask selecting the rows of this chunk with Age above 30
        older_than_30 = chunk['Age'] > 30
        processed_chunk = chunk.loc[older_than_30]

        # One sheet per chunk: Chunk_0, Chunk_1, ...; the row index is not written
        processed_chunk.to_excel(writer, sheet_name=f'Chunk_{i}', index=False)

print('Data processing complete.')
# Result: filtered rows saved to 'processed_data.xlsx'


# Date conversion

### **Date Conversion: CSV vs. Excel**

#### **1. CSV Files**

- **Format**: CSV files are plain text files without inherent formatting or metadata. Dates are stored as text strings.
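- **Conversion Process**: When reading a CSV file, dates are read as plain text by default. Convert these text strings to datetime objects explicitly with `pd.to_datetime()`, or ask Pandas to do it while loading through the `parse_dates` argument of `pd.read_csv()` (e.g. `pd.read_csv('dates.csv', parse_dates=['Date'])`).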
- **Example**: 
  - CSV Data: `"2024-09-01"`
  - Requires explicit conversion: `pd.to_datetime(df['Date'])`

#### **2. Excel Files**

- **Format**: Excel files can store dates with formatting and metadata. Dates are often stored in a recognized date format, which Excel can interpret directly.
- **Conversion Process**: Pandas can automatically recognize and convert date columns to datetime objects when reading Excel files. However, you may still need to convert columns explicitly to ensure they are in the desired format.
- **Example**:
  - Excel Data: Formatted as a date cell (`2024-09-01`)
  - Often recognized automatically: `pd.read_excel()` typically reads date columns as datetime objects, but `pd.to_datetime()` can be used for explicit conversion.

In summary, **CSV** requires manual conversion of text-based dates, while **Excel** can often automatically interpret and convert date formats.

In [None]:
import pandas as pd

# Load dates.csv; no date-related arguments are passed to read_csv here
df = pd.read_csv(
    'dates.csv',
)

# Show the frame as loaded
print("Before conversion:", df, sep="\n")

# Replace the 'Date' column with its datetime conversion
date_column = df['Date']
df['Date'] = pd.to_datetime(date_column)

# Show the frame again, now with 'Date' converted
print("\nAfter conversion:", df, sep="\n")
