## Part 1: Tidying and Reshaping Data

### Part 1.1

In [48]:
import numpy as np
import pandas as pd

data = pd.read_csv("https://raw.githubusercontent.com/ktxdev/AIM-5001/main/M11/1.%20Data/M11_Data.csv")

data.head()

Unnamed: 0,Month,Category,Caltex,Gulf,Mobil
0,Open,Engine Oil,140 : 000,199 : 000,141 : 000
1,,GearBox Oil,198 : 000,132 : 000,121 : 000
2,Jan,Engine Oil,170 : 103,194 : 132,109 : 127
3,,GearBox Oil,132 : 106,125 : 105,191 : 100
4,Feb,Engine Oil,112 : 133,138 : 113,171 : 101


In [50]:
data = pd.melt(data, id_vars=['Month', 'Category'], var_name="Suppliers", value_name="Purchased:Consumed")
data.head()

Unnamed: 0,Month,Category,Suppliers,Purchased:Consumed
0,Open,Engine Oil,Caltex,140 : 000
1,,GearBox Oil,Caltex,198 : 000
2,Jan,Engine Oil,Caltex,170 : 103
3,,GearBox Oil,Caltex,132 : 106
4,Feb,Engine Oil,Caltex,112 : 133


In [61]:
data[['Purchased', 'Consumed']] = data['Purchased:Consumed'].str.split(":", expand=True)
data = data.drop(columns="Purchased:Consumed")
data.head()

Unnamed: 0,Month,Category,Suppliers,Purchased,Consumed
0,Open,Engine Oil,Caltex,140,0
1,,GearBox Oil,Caltex,198,0
2,Jan,Engine Oil,Caltex,170,103
3,,GearBox Oil,Caltex,132,106
4,Feb,Engine Oil,Caltex,112,133


In [64]:
data['Month'] = data['Month'].ffill()
data.head()

Unnamed: 0,Month,Category,Suppliers,Purchased,Consumed
0,Open,Engine Oil,Caltex,140,0
1,Open,GearBox Oil,Caltex,198,0
2,Jan,Engine Oil,Caltex,170,103
3,Jan,GearBox Oil,Caltex,132,106
4,Feb,Engine Oil,Caltex,112,133


### Part 1.2

In [77]:
data['Purchased'] = data['Purchased'].astype(int)
data['Consumed'] = data['Consumed'].astype(int)

def oil_remaining(x):
    return x['Purchased'].sum() - x['Consumed'].sum()

data1 = data.groupby(['Month', 'Category']).apply(oil_remaining)

data1

Month  Category   
Apr    Engine Oil      -3
       GearBox Oil    116
Feb    Engine Oil      74
       GearBox Oil    132
Jan    Engine Oil     111
       GearBox Oil    137
Jun    Engine Oil     126
       GearBox Oil     61
Mar    Engine Oil      90
       GearBox Oil    134
May    Engine Oil      99
       GearBox Oil     62
Open   Engine Oil     480
       GearBox Oil    451
dtype: int64

### Part 1.3

In [81]:
data.pivot_table(index=['Month', 'Category', 'Suppliers'], values=['Consumed','Purchased'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Consumed,Purchased
Month,Category,Suppliers,Unnamed: 3_level_1,Unnamed: 4_level_1
Apr,Engine Oil,Caltex,150,149
Apr,Engine Oil,Gulf,118,117
Apr,Engine Oil,Mobil,118,117
Apr,GearBox Oil,Caltex,125,185
Apr,GearBox Oil,Gulf,133,191
Apr,GearBox Oil,Mobil,121,119
Feb,Engine Oil,Caltex,133,112
Feb,Engine Oil,Gulf,113,138
Feb,Engine Oil,Mobil,101,171
Feb,GearBox Oil,Caltex,148,193


## Part 2: Using Your GroupBy and Data Aggregation Skills

### Load Data

In [40]:
import numpy as np
import pandas as pd

# load the data set
auto_df = pd.read_csv("https://raw.githubusercontent.com/ktxdev/AIM-5001/main/M11/1.%20Data/auto-mpg.data", delim_whitespace = True, header = None)

# add meaningful column names
auto_df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'origin', 'car_name']

# replace '?' in horsepower column with 'NaN'
auto_df.horsepower.replace('?', np.nan, inplace = True)

# convert the column to numeric
auto_df["horsepower"] = pd.to_numeric(auto_df["horsepower"])

# replace origin values using a dict
auto_df.origin.replace({1: 'USA', 2: 'Asia', 3: 'Europe'}, inplace = True)
auto_df.head(10)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,USA,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,USA,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,USA,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,USA,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,USA,ford torino
5,15.0,8,429.0,198.0,4341.0,10.0,70,USA,ford galaxie 500
6,14.0,8,454.0,220.0,4354.0,9.0,70,USA,chevrolet impala
7,14.0,8,440.0,215.0,4312.0,8.5,70,USA,plymouth fury iii
8,14.0,8,455.0,225.0,4425.0,10.0,70,USA,pontiac catalina
9,15.0,8,390.0,190.0,3850.0,8.5,70,USA,amc ambassador dpl


### Part 2.1

In [18]:
auto_df.groupby(['origin', 'cylinders']).agg({'cylinders': ['size']})\
    .rename(columns={'size': 'Quantity', 'cylinders': ''})

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity
origin,cylinders,Unnamed: 2_level_1
Asia,4,63
Asia,5,3
Asia,6,4
Europe,3,4
Europe,4,69
Europe,6,6
USA,4,72
USA,6,74
USA,8,103


### Part 2.2

In [41]:
auto_df.groupby(['origin', 'model']).agg({'mpg': 'mean', 'weight': "mean"})

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,weight
origin,model,Unnamed: 2_level_1,Unnamed: 3_level_1
Asia,70,25.2,2309.2
Asia,71,28.75,2024.0
Asia,72,22.0,2573.2
Asia,73,24.0,2335.714286
Asia,74,27.0,2139.333333
Asia,75,24.5,2571.166667
Asia,76,24.25,2611.0
Asia,77,29.25,2138.75
Asia,78,24.95,2691.666667
Asia,79,30.45,2693.75


### Part 2.3

In [42]:
bins = [70, 72, 75, 77, 79, 82]
auto_df['model'] = pd.cut(auto_df['model'], bins=bins, labels=["(70.0, 72.0]", "(72.0, 75.0]", "(75.0, 77.0]", "(77.0, 80.0]", "(80.0, 82.0]"])
auto_df.groupby("model").agg({'weight': ['mean', 'size', 'max', 'median', 'min']})\
        .rename(columns={'mean': 'Average Weight', 'size': 'Count', 'max': 'Max Weight', 'median': 'Median Weight', 'min': 'Min Weight'}).stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,weight
model,Unnamed: 1_level_1,Unnamed: 2_level_1
"(70.0, 72.0]",Average Weight,3116.571429
"(70.0, 72.0]",Count,56.0
"(70.0, 72.0]",Max Weight,5140.0
"(70.0, 72.0]",Median Weight,2947.5
"(70.0, 72.0]",Min Weight,1613.0
"(72.0, 75.0]",Average Weight,3193.494845
"(72.0, 75.0]",Count,97.0
"(72.0, 75.0]",Max Weight,4997.0
"(72.0, 75.0]",Median Weight,3021.0
"(72.0, 75.0]",Min Weight,1649.0
