# Data Transformation

After cleaning, we transform data to make it useful for analysis.

We can:<br>
- create new columns
- apply logic
- map values
- replace values

In [1]:
import pandas as pd

# Load the cleaned sales data from its relative path and preview the first five rows.
sales = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_sales.csv")
sales.head(n=5)

Unnamed: 0,order_id,customer_id,product,category,price,quantity,city,date
0,1001,C101,laptop,Electronics,55000,1,Delhi,2024-01-05
1,1002,C102,phone,Electronics,20000,2,Mumbai,2024-01-06
2,1003,C103,shoes,Fashion,3000,1,Pune,2024-01-07
3,1004,C101,headphones,Electronics,2000,3,Delhi,2024-01-07
4,1005,C104,tshirt,Fashion,800,2,Bangalore,2024-01-08


## Creating New Column

In [2]:
# Row-wise order value: price multiplied by quantity, element by element.
sales["total_amount"] = sales["price"].mul(sales["quantity"])
sales.head()

Unnamed: 0,order_id,customer_id,product,category,price,quantity,city,date,total_amount
0,1001,C101,laptop,Electronics,55000,1,Delhi,2024-01-05,55000
1,1002,C102,phone,Electronics,20000,2,Mumbai,2024-01-06,40000
2,1003,C103,shoes,Fashion,3000,1,Pune,2024-01-07,3000
3,1004,C101,headphones,Electronics,2000,3,Delhi,2024-01-07,6000
4,1005,C104,tshirt,Fashion,800,2,Bangalore,2024-01-08,1600


## Apply Function

In [4]:
def price_category(price):
    """Bucket a single price into "High", "Medium" or "Low".

    Both thresholds are exclusive: a price must be strictly greater than
    30000 to be "High" and strictly greater than 5000 to be "Medium".
    Anything else (including exactly 5000) is "Low".

    Args:
        price: A single numeric price value.

    Returns:
        One of the strings "High", "Medium" or "Low".
    """
    # Check the highest tier first so each later check only sees smaller prices.
    if price > 30000:
        return "High"
    if price > 5000:
        return "Medium"
    return "Low"

# Call price_category once per value in the price column and store the
# returned labels in a new price_level column.
sales["price_level"] = sales["price"].apply(price_category)
sales.head()

Unnamed: 0,order_id,customer_id,product,category,price,quantity,city,date,total_amount,price_level
0,1001,C101,laptop,Electronics,55000,1,Delhi,2024-01-05,55000,High
1,1002,C102,phone,Electronics,20000,2,Mumbai,2024-01-06,40000,Medium
2,1003,C103,shoes,Fashion,3000,1,Pune,2024-01-07,3000,Low
3,1004,C101,headphones,Electronics,2000,3,Delhi,2024-01-07,6000,Low
4,1005,C104,tshirt,Fashion,800,2,Bangalore,2024-01-08,1600,Low


## Map Values

In [5]:
# Lookup table from city name to its one-letter code.
city_codes = dict(
    Delhi="D",
    Mumbai="M",
    Pune="P",
    Bangalore="B",
    Chennai="C",
)

# Translate every city through the lookup table. A city that is not a key
# of city_codes comes out as NaN instead of raising an error.
sales["city_code"] = sales["city"].map(city_codes)
sales.head()

Unnamed: 0,order_id,customer_id,product,category,price,quantity,city,date,total_amount,price_level,city_code
0,1001,C101,laptop,Electronics,55000,1,Delhi,2024-01-05,55000,High,D
1,1002,C102,phone,Electronics,20000,2,Mumbai,2024-01-06,40000,Medium,M
2,1003,C103,shoes,Fashion,3000,1,Pune,2024-01-07,3000,Low,P
3,1004,C101,headphones,Electronics,2000,3,Delhi,2024-01-07,6000,Low,D
4,1005,C104,tshirt,Fashion,800,2,Bangalore,2024-01-08,1600,Low,B


## Replace Values

In [6]:
# Relabel the "Electronics" category as "Tech"; every other category value is kept as-is.
sales["category"] = sales["category"].replace("Electronics", "Tech")
sales.head()

Unnamed: 0,order_id,customer_id,product,category,price,quantity,city,date,total_amount,price_level,city_code
0,1001,C101,laptop,Tech,55000,1,Delhi,2024-01-05,55000,High,D
1,1002,C102,phone,Tech,20000,2,Mumbai,2024-01-06,40000,Medium,M
2,1003,C103,shoes,Fashion,3000,1,Pune,2024-01-07,3000,Low,P
3,1004,C101,headphones,Tech,2000,3,Delhi,2024-01-07,6000,Low,D
4,1005,C104,tshirt,Fashion,800,2,Bangalore,2024-01-08,1600,Low,B


In [7]:
## Convert Column Type

In [8]:
# Cast only the quantity column to integer (the other columns keep their dtypes),
# then list every column's dtype to confirm the result.
sales = sales.astype({"quantity": int})
sales.dtypes

order_id        int64
customer_id       str
product           str
category          str
price           int64
quantity        int64
city              str
date              str
total_amount    int64
price_level       str
city_code         str
dtype: object

## Save Transformed Data

In [9]:
# Persist the transformed table; index=False leaves the row index out of the CSV.
sales.to_csv(path_or_buf="../data/processed/transformed_sales.csv", index=False)

## Conclusion

Transformation prepares data for aggregation and analysis.