# BigData-Session-1 Solution: Fundamentals of Pandas

This notebook provides complete solutions for the BigData-Session-1 exercises.
It demonstrates fundamental Pandas operations for data manipulation.

## 1. Setup and Library Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

# Report the library versions in use so results can be tied to an environment.
print(
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

## 2. Working with Series

In [None]:
# SOLUTION: Create a Series object
# No index is passed, so pandas assigns the default integer index 0..9.
CPU_USAGE = pd.Series(
    [0.16, 0.07, 0.23, 0.24, 0.14, 4.99, 0.23, 0.47, 0.46, 0.17]
)

# Show the Series itself, then its length and element dtype.
print("CPU_USAGE Series:", CPU_USAGE, sep="\n")
print(
    f"\nLength: {len(CPU_USAGE)}",
    f"Data type: {CPU_USAGE.dtype}",
    sep="\n",
)

In [None]:
# SOLUTION: Access elements in Series
# CPU_USAGE carries the default 0..n-1 index, so each integer key below is
# resolved against that index.
print("Accessing individual elements:")
print(
    f"CPU_USAGE[0] = {CPU_USAGE[0]}",
    f"CPU_USAGE[1] = {CPU_USAGE[1]}",
    f"CPU_USAGE[3] = {CPU_USAGE[3]}",
    f"CPU_USAGE[5] = {CPU_USAGE[5]}",
    sep="\n",
)

In [None]:
# SOLUTION: Series with irregular index
# The explicit index is neither sorted nor contiguous; values and labels are
# paired by position in the two lists (0.16 -> 3, 0.07 -> 1, ...).
MEM_USAGE = pd.Series(
    data=[0.16, 0.07, 0.23, 0.24, 0.14],
    index=[3, 1, 4, 2, 7],
)

print("MEM_USAGE Series with irregular index:", MEM_USAGE, sep="\n")

# With an integer index, [] selects by label rather than by position:
# MEM_USAGE[3] is the value labelled 3 (0.16), not the fourth element (0.24).
print(
    f"\nMEM_USAGE[3] = {MEM_USAGE[3]}",
    f"MEM_USAGE[4] = {MEM_USAGE[4]}",
    f"MEM_USAGE[1] = {MEM_USAGE[1]}",
    sep="\n",
)

In [None]:
# SOLUTION: Series operations
# Individual summary statistics, each formatted to four decimal places.
print("Series Operations:")
print(
    f"Mean: {CPU_USAGE.mean():.4f}",
    f"Median: {CPU_USAGE.median():.4f}",
    f"Std Dev: {CPU_USAGE.std():.4f}",
    f"Min: {CPU_USAGE.min():.4f}",
    f"Max: {CPU_USAGE.max():.4f}",
    sep="\n",
)

# describe() bundles count, mean, std, min, quartiles and max in one Series.
print(f"\nDescriptive Statistics:", CPU_USAGE.describe(), sep="\n")

## 3. Working with DataFrames

In [None]:
# SOLUTION: Create a DataFrame
# Each dict key becomes a column name; the ten-element lists become the rows.
data = {
    'CPU_USAGE': [0.16, 0.07, 0.23, 0.24, 0.14, 4.99, 0.23, 0.47, 0.46, 0.17],
    'MEM_USAGE': [0.45, 0.38, 0.52, 0.48, 0.41, 0.89, 0.50, 0.55, 0.54, 0.42],
    'DISK_IO': [10, 5, 15, 12, 8, 50, 14, 20, 18, 9],
}

df = pd.DataFrame(data=data)

# Show the frame, then its (rows, columns) shape and the column labels.
print("DataFrame:", df, sep="\n")
print(
    f"\nShape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

In [None]:
# SOLUTION: Access DataFrame elements
print("Accessing DataFrame elements:")

# Column selection by name returns a Series.
print(f"\nFirst column (CPU_USAGE):", df['CPU_USAGE'], sep="\n")

# .iloc is position-based: 0 is the first row regardless of its label.
print(f"\nFirst row:", df.iloc[0], sep="\n")

# .loc is label-based: row label 2, column label 'MEM_USAGE'.
print(f"\nElement at row 2, column 'MEM_USAGE': {df.loc[2, 'MEM_USAGE']}")

In [None]:
# SOLUTION: DataFrame operations
print("DataFrame Operations:")

# Per-column summary statistics.
print(f"\nDescriptive Statistics:", df.describe(), sep="\n")

# Dtype of every column.
print(f"\nData types:", df.dtypes, sep="\n")

# info() writes its report to stdout itself, so it is not wrapped in print().
print(f"\nInfo:")
df.info()

## 4. Loading Real Data

In [None]:
# SOLUTION: Load Sherlock dataset
# Only the read is guarded; the summary lives in the `else` branch and so runs
# only when the CSV was read without raising.
try:
    df_sherlock = pd.read_csv('sherlock/sherlock_mystery_2apps.csv')
except FileNotFoundError:
    # This cell does not assign df_sherlock when the file is missing.
    print("Sherlock dataset not found. Please check the file path.")
else:
    print(f"Sherlock dataset loaded successfully!")
    print(f"Shape: {df_sherlock.shape}")
    print(f"\nFirst few rows:", df_sherlock.head(), sep="\n")
    print(f"\nColumn names:", df_sherlock.columns.tolist(), sep="\n")

In [None]:
# SOLUTION: Explore dataset
# Runs only if df_sherlock exists in the namespace; otherwise the cell prints
# nothing.
if 'df_sherlock' in locals():
    print("Dataset Information:")
    print(f"Shape: {df_sherlock.shape}")
    print(f"\nData types:", df_sherlock.dtypes, sep="\n")
    # isnull().sum() counts the missing entries in each column.
    print(f"\nMissing values:", df_sherlock.isnull().sum(), sep="\n")
    print(f"\nBasic statistics:", df_sherlock.describe(), sep="\n")

## 5. Visualization

In [None]:
# SOLUTION: Visualize Series
# Two side-by-side panels: the raw values as a line, and their histogram.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
line_ax, hist_ax = axes

# Left panel: line plot of the Series values.
line_ax.plot(CPU_USAGE, marker='o', linewidth=2, markersize=8)
line_ax.set_title('CPU Usage Over Time', fontsize=12, fontweight='bold')
line_ax.set_xlabel('Index', fontsize=11)
line_ax.set_ylabel('CPU Usage', fontsize=11)
line_ax.grid(True, alpha=0.3)

# Right panel: histogram of the same values in 10 bins.
hist_ax.hist(CPU_USAGE, bins=10, edgecolor='black', alpha=0.7)
hist_ax.set_title('CPU Usage Distribution', fontsize=12, fontweight='bold')
hist_ax.set_xlabel('CPU Usage', fontsize=11)
hist_ax.set_ylabel('Frequency', fontsize=11)
# Horizontal grid lines only, so the bars stay easy to read.
hist_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

In [None]:
# SOLUTION: Visualize DataFrame
# One bar-chart panel per column. The panel count and figure width follow the
# DataFrame instead of being fixed at 3, so the cell keeps working when `df`
# gains or loses columns (5 inches per panel reproduces the 15x4 figure for
# three columns).
n_panels = len(df.columns)

# squeeze=False always returns a 2-D array of Axes, even for a single panel;
# ravel() flattens it to the 1-D sequence the loop iterates over.
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
axes = axes.ravel()

for ax, col in zip(axes, df.columns):
    # Bars are placed at row positions 0..len(df)-1.
    ax.bar(range(len(df)), df[col], alpha=0.7, edgecolor='black')
    ax.set_xlabel('Index', fontsize=11)
    ax.set_ylabel(col, fontsize=11)
    ax.set_title(f'{col} Distribution', fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 6. Key Concepts Summary

### Series:
- 1-D labeled array of values
- Has index, values, and data type
- Can have custom or default index
- Supports mathematical operations

### DataFrame:
- 2-D tabular data with row and column labels
- Multiple columns of different data types
- Can be created from dictionaries, lists, or files
- Supports filtering, grouping, and aggregation

### Common Operations:
- `head()`, `tail()`: View first/last rows
- `describe()`: Get statistical summary
- `info()`: Summarize column data types, non-null counts, and memory usage
- `mean()`, `median()`, `std()`: Statistical functions
- `isnull()`: Check for missing values
- Indexing with `[]` (column or label lookup), `.loc[]` (label-based), and `.iloc[]` (position-based)