<a href="https://colab.research.google.com/github/hejnal/py-study-pandas/blob/main/notebooks/PyStudy_Group_13_Exercise_1_Pandas_ipynb_%5BMAKE_A_COPY%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Examples

## Imports

In [1]:
# Imports
import numpy as np
import pandas as pd

## Dataframe creation

In [None]:
# Seed NumPy's global RNG so the random frame below (and every later cell that
# draws random numbers) produces the same values on each Restart & Run All.
np.random.seed(42)

# Series: mixing numbers, NaN and a string yields the generic `object` dtype.
s = pd.Series([1, 3, 5, np.nan, 6, 8, 'txt'])
print(s)

# DataFrame indexed by six consecutive days, filled with standard-normal noise.
dates = pd.date_range(start='2023-01-01', periods=6)
print(dates)

df = pd.DataFrame(np.random.randn(6, 5), index=dates, columns=list('ABCDE'))
print(df)

# DataFrame from a dict: scalar values are broadcast to the length of the
# array-like values (4 rows here) and every column keeps its own dtype.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp('2023-01-01'),
        "C": pd.Series(1, index=list(range(4)), dtype='float32'),
        "D": np.array([3] * 4, dtype='int32'),
        "E": pd.Categorical(["test", "train", "val", "test"]),
        "F": "foo",
    }
)
print(df2)
print(df2.dtypes)

## Basic DataFrames Operations

Inspect dataframes

In [None]:
# Single-cell lookup: the value of column "B" in the row labelled 3.
print(df2.loc[3, "B"])

# Last expression of the cell: rich display of the first rows of the frame.
df2.head()

Inspect, describe, transpose and sort DataFrames

In [None]:
# Column labels, and the underlying data as a NumPy array.
df2.columns
df2.to_numpy()

# Summary statistics, and the transposed frame.
df2.describe()
df2.T

# Sort by column labels (axis=1).
df2.sort_index(axis=1, ascending=True)

# NOTE: While standard Python / NumPy expressions for selecting and setting are
# intuitive and come in handy for interactive work, for production code the
# optimized pandas data accessors are recommended: DataFrame.at, DataFrame.iat,
# DataFrame.loc and DataFrame.iloc (all used with square brackets).

# Sort rows by the values of column "E". Kept as the last expression of the
# cell so that this sorted frame is what the notebook displays.
df2.sort_values(by='E', ascending=True)

Accessing DataFrame data with the different indexers (`[]`, `loc`, `iloc`, `at`, etc.)

In [None]:
# --- Column selection: bracket and attribute access give the same Series ---
df["A"]
df.A

# --- Row slicing with plain []: by position, then by date labels ---
df[:3]
df["2023-01-01":"2023-01-02"]

# --- Selection by label (.loc / .at) ---
df.loc[dates[0]]                                # one row, addressed by its label
df.loc[:, ["A", "B"]]                           # every row, two columns
df.loc["2023-01-01":"2023-01-02", ["A", "B"]]   # label slice plus a column list
df.loc[dates[0], "A"]                           # a single scalar
df.at[dates[0], "A"]                            # the same scalar via the scalar accessor

# --- Selection by position (.iloc) ---
df.iloc[3]                 # fourth row
df.iloc[3:5, 0:2]          # integer slices behave like NumPy/Python slices
df.iloc[[0, 2, 4], 0]      # explicit row positions, first column
df.iloc[[0, 2, 4], :]      # explicit row positions, all columns
df.iloc[:, 1:3]            # all rows, a slice of columns
df.iloc[1, 1]              # a single scalar by position

# --- Boolean indexing ---
df[df["A"] > 0]            # keep the rows where column A is positive
df[df > 0]                 # element-wise mask over the whole frame

# --- Membership test with isin ---
# Work on a copy of df whose column "E" holds string labels.
df2 = df.assign(E=["one", "two", "three", "four", "three", "six"])
df2["E"].isin(["two", "three"])


Setting Values

In [None]:
# Nine daily values starting on 2023-01-01.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6, 7, 8, 9],
    index=pd.date_range(start="2023-01-01", periods=9),
)

df["F"] = s1                            # new column, matched to df by index label
df.at[dates[0], "A"] = 0                # set one cell by label
df.iat[0, 1] = 0                        # set one cell by position
df.loc[:, "D"] = np.full(len(df), 5)    # overwrite a whole column from an array

# Show the column that was just overwritten.
df.loc[:, "D"]



Missing Data

In [None]:
# Drop every row that contains at least one missing value (returns a new frame).
df.dropna(how="any")

# Boolean mask marking the missing entries.
df.isna()

# Mean of each column (axis=0 is the default, so df.mean() is equivalent).
df.mean(axis=0)

# Mean of each row.
df.mean(axis=1)


## Aggregation

In [None]:
# Reduce every column to a single number: its mean multiplied by 5.6.
df.agg(lambda column: np.mean(column) * 5.6)

In [None]:
# Element-wise transformation: the result has the same shape as df, with every
# value multiplied by 100.
df.transform(lambda column: column * 100)

In [None]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))

# How often each value occurs, ordered by the value itself rather than by count.
s.value_counts().sort_index()

## Joins (Merges)

In [None]:
# Two small frames that share the join column "key"; the key "foo" appears
# twice in each of them.
left = pd.DataFrame({"key": ["foo"] * 2, "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo"] * 2, "rval": [4, 5]})

In [None]:
# Outer join on "key": every left row is paired with every right row that has
# the same key.
left.merge(right, how="outer", on="key")

## Grouping

In [None]:
# Two label columns (A, B) to group by and two random numeric columns (C, D)
# to aggregate.
df = pd.DataFrame(
    dict(
        A=["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
        B=["one", "one", "two", "three", "two", "two", "one", "three"],
        C=np.random.randn(8),
        D=np.random.randn(8),
    )
)

df

In [None]:
# Total of the numeric columns C and D for each distinct label in A.
(
    df
    .groupby("A")[["C", "D"]]
    .sum()
)

In [None]:
# Grouping by two columns gives one result row per (A, B) combination.
(
    df
    .groupby(["A", "B"])
    .sum()
)

## Timeseries

In [None]:
# One thousand timestamps, one second apart, starting on 2012-01-01.
rng = pd.date_range("1/1/2012", periods=1000, freq="s")

# Random values indexed by those timestamps.
ts = pd.Series(data=np.random.randn(rng.size), index=rng)

# Attach the UTC time zone to the (so far time-zone-naive) index.
ts_utc = ts.tz_localize("UTC")

# Downsample into 5-minute bins, summing the values that fall into each bin.
ts_utc.resample("5Min").sum()


## Categorical

In [None]:
# Six records, each with a raw letter grade stored as a plain string.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": ["a", "b", "b", "a", "a", "e"],
    }
)

df

In [None]:
# Human-readable labels; they replace the existing categories one-to-one, in
# the order of the existing categories.
new_categories = ["very good", "good", "very bad"]

# Convert the raw grades to a categorical column and relabel its categories.
df["grade"] = (
    df["raw_grade"]
    .astype("category")
    .cat.rename_categories(new_categories)
)

df

## Plotting

In [None]:
# pyplot is needed by the plotting cells that follow.
import matplotlib.pyplot as plt
# Close every figure that is still open so the plots below start from a clean state.
plt.close("all")

In [None]:
# Random walk: the cumulative sum of 1000 daily random steps starting on
# 2000-01-01.
ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range("1/1/2000", periods=1000),
).cumsum()

# The trailing semicolon keeps the Axes repr out of the cell output.
ts.plot();


In [None]:
# Four independent random walks that share the date index of `ts`.
df = pd.DataFrame(
    np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"]
).cumsum()

# Create the figure and axes explicitly and hand the axes to pandas.
# A bare plt.figure() followed by df.plot() leaves an empty extra figure in the
# output, because DataFrame.plot() opens its own figure when no `ax` is given.
fig, ax = plt.subplots()
df.plot(ax=ax)

# The trailing semicolon keeps the Legend repr out of the cell output.
ax.legend(loc='best');

# Exercises

### Exercise 1

In [None]:
# Create a DataFrame with the following data:
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 28],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}

# [TODO] Display the first 2 rows of the DataFrame.
# [TODO] Display the 'Name' and 'City' columns.

In [None]:
# @title Solution
import pandas as pd

# Build the frame from the exercise's `data` dict.
df = pd.DataFrame(data)

# First two rows.
first_two_rows = df.head(2)
print(first_two_rows)

# Only the 'Name' and 'City' columns (a list of labels selects several columns).
name_and_city = df[['Name', 'City']]
print(name_and_city)

### Exercise 2

In [None]:
# Create a DataFrame with the following data:
data = {
    'Item': ['Apple', 'Banana', 'Orange', 'Apple', 'Banana'],
    'Price': [1.0, 0.5, 0.75, 1.2, 0.6],
}

# [TODO] Calculate the mean price of each item.

In [None]:
# @title Solution
import pandas as pd

# Build the frame from the exercise's `data` dict.
df = pd.DataFrame(data)

# One group per distinct item, then the mean of its prices.
mean_price_per_item = df.groupby('Item').mean()
print(mean_price_per_item)

### Exercise 3

In [None]:
# Create two DataFrames with the following data:
data1 = {
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
}
data2 = {
    'ID': [2, 3, 4],
    'Score': [85, 92, 78],
}

# [TODO] Join the two DataFrames on the ID column (using join)
# [TODO] Join the two DataFrames on the ID column (using merge)

In [None]:
# @title Solution
import pandas as pd

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

# --- Variant 1: 'ID' stays a regular column in df1 ---
# DataFrame.join matches against the *index* of the other frame, so df2 is
# indexed by ID and `on='ID'` tells join to use df1's ID column as the key.
# (A bare df1.join(df2, lsuffix=..., rsuffix=...) would pair the rows by their
# positional 0..n-1 index and attach the wrong score to each name.)
print(df1.join(df2.set_index('ID'), on='ID'))
# merge matches on column values directly; how="left" keeps every row of df1.
print(df1.merge(df2, on="ID", how="left"))

# --- Variant 2: 'ID' is the index of both DataFrames ---
df1 = df1.set_index('ID')
df2 = df2.set_index('ID')

# With ID as the index, a plain join already lines the rows up by ID.
print(df1.join(df2))
# merge accepts an index level name in `on`, so the same call still works.
print(df1.merge(df2, on="ID", how="left"))

### Exercise 4

In [None]:
# Create a DataFrame with the following data:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 28, 22],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
    'Salary': [60000, 75000, 55000, 48000],
}

df = pd.DataFrame(data)

# [TODO] Filter the DataFrame to include only employees who are older than 25 and have a salary greater than 50000.

In [None]:
# @title Solution

# Build each condition as its own boolean mask, then combine them with `&`
# (element-wise AND) to select the matching rows.
is_older_than_25 = df['Age'] > 25
earns_over_50k = df['Salary'] > 50000

filtered_df = df[is_older_than_25 & earns_over_50k]
print(filtered_df)

### Exercise 5

In [None]:
# Create a DataFrame with the following data:
data = {
    'Value1': [10, 20, 30],
    'Value2': [40, 50, 60],
}

df = pd.DataFrame(data)

# [TODO] Use the apply() method to calculate the product of Value1 and Value2 for each row, and store the result in a new column called Product.

In [None]:
# @title Solution

def _row_product(row):
    """Return Value1 multiplied by Value2 for a single row."""
    return row['Value1'] * row['Value2']

# axis=1 makes apply() call the function once per row.
df['Product'] = df.apply(_row_product, axis=1)
print(df)

### Exercise 6

In [None]:
# Create two DataFrames with the following data:

employee_data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
    'Department': ['Sales', 'Marketing', 'Sales', 'Finance', 'IT'],
    'Salary': [60000, 75000, 62000, 80000, 70000],
    'Age': [25, 30, 28, 35, 28],
}

project_data = {
    'ProjectID': [101, 102, 103, 104],
    'ProjectName': ['Alpha', 'Beta', 'Gamma', 'Delta'],
    'Department': ['Sales', 'Marketing', 'Sales', 'Finance'],
    'Budget': [100000, 150000, 120000, 200000],
}

# [TODO] Merge the two DataFrames on the Department column, keeping all rows from the employee_data DataFrame.

# [TODO] Calculate the average Salary for each Department in the merged DataFrame.

# [TODO] Filter the merged DataFrame to include only employees who are older than the average age of employees in their respective departments.

# [TODO] Group the filtered DataFrame by ProjectName and calculate the sum of Salary and Budget for each project.

In [None]:
# @title Solution
import pandas as pd

# Create the DataFrames
employee_data = {'EmployeeID': [1, 2, 3, 4, 5],
                'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emily'],
                'Department': ['Sales', 'Marketing', 'Sales', 'Finance', 'IT'],
                'Salary': [60000, 75000, 62000, 80000, 70000],
                'Age': [25, 30, 28, 35, 28]}
project_data = {'ProjectID': [101, 102, 103, 104],
                'ProjectName': ['Alpha', 'Beta', 'Gamma', 'Delta'],
                'Department': ['Sales', 'Marketing', 'Sales', 'Finance'],
                'Budget': [100000, 150000, 120000, 200000]}
employee_df = pd.DataFrame(employee_data)
project_df = pd.DataFrame(project_data)

# Merge on Department. how='left' keeps every employee row, including employees
# whose department has no project (their project columns become NaN).
merged_df = pd.merge(employee_df, project_df, on='Department', how='left')

# Calculate the average salary for each department
avg_salary = merged_df.groupby('Department')['Salary'].mean()
print(avg_salary)

# Calculate the average age for each department (needed by the filter below)
avg_age = merged_df.groupby('Department')['Age'].mean()

# Filter employees older than the average age in their department.
# Mapping each row's Department to its average age yields a Series aligned with
# merged_df, so the comparison is vectorised instead of a row-by-row apply().
dept_avg_age = merged_df['Department'].map(avg_age)
filtered_df = merged_df[merged_df['Age'] > dept_avg_age]

# Group by ProjectName and calculate the sum of Salary and Budget
grouped_df = filtered_df.groupby('ProjectName').agg({'Salary': 'sum', 'Budget': 'sum'})

# Display the result
print(grouped_df.head())