# 04_feature_engineering.ipynb

**Goals:**  
1. Load sentiment-tagged Reddit data and sales figures  
2. Create binary flags for each feature mention  
3. Compute per-feature metrics (mention count, average polarity) by version  
4. Assemble one row per Fold version with all predictors + target sales  
5. Save the modeling dataset to `data/processed/model_data.csv`

In [33]:
import pandas as pd
import os

# Make sure the folder for raw inputs is present before anything is written to it
os.makedirs("data/raw", exist_ok=True)

# Known first-month sales per model, assembled column by column.
# The Fold 7 figure is left empty: it is the value to be predicted.
sales_data = {}
sales_data["Model"] = ["Fold 4", "Fold 5", "Fold 6", "Fold 7"]
sales_data["LaunchDate"] = ["2022-07-26", "2023-07-28", "2024-07-31", "2025-07-25"]
sales_data["FirstMonthSales"] = [300000, 375000, 450000, None]

df_sales = pd.DataFrame(data=sales_data)

# Persist under data/raw so later cells can read the table back from disk
df_sales.to_csv("data/raw/fold6_fold7_sales.csv", index=False)

# Display the table
df_sales


Unnamed: 0,Model,LaunchDate,FirstMonthSales
0,Fold 4,2022-07-26,300000.0
1,Fold 5,2023-07-28,375000.0
2,Fold 6,2024-07-31,450000.0
3,Fold 7,2025-07-25,


In [35]:
import os
import pandas as pd

# Project root on the author's machine. Every absolute path below is derived
# from it, so relocating the project means editing this one line.
project_dir = "/Users/shreychaudhary/Documents/Samsung_Fold7_Sales_Prediction"
processed_dir = os.path.join(project_dir, "data", "processed")

# Define paths
sentiment_path = os.path.join(processed_dir, "reddit_sentiment.csv")
# Relative to the working directory: this is where the first cell wrote the file
sales_path     = "data/raw/fold6_fold7_sales.csv"
model_data_out = os.path.join(processed_dir, "model_data.csv")

# Ensure the directory that model_data_out is written into exists
# (data/processed, not just its parent data/)
os.makedirs(processed_dir, exist_ok=True)


In [39]:
# Point at the CSV written by the first cell and read it back into a frame
sales_path = "data/raw/fold6_fold7_sales.csv"
df_sales = pd.read_csv(filepath_or_buffer=sales_path)

In [41]:
# Reddit posts with their sentiment scores
df = pd.read_csv(filepath_or_buffer=sentiment_path)

# Per-model sales table
df_sales = pd.read_csv(filepath_or_buffer=sales_path)

# Report the (rows, columns) shape of each frame
for label, frame in [("Sentiment records:", df), ("Sales records:", df_sales)]:
    print(label, frame.shape)

Sentiment records: (1884, 11)
Sales records: (4, 3)


In [43]:
# Product aspects whose mentions are tracked in the posts.
# Order matters: it fixes the column order of everything derived from this list.
features = [
    "battery",
    "camera",
    "price",
    "display",
    "hinge",
    "performance",
    "durability",
    "software",
    "design",
    "size",
    "weight",
]

In [45]:
# For each feature, add a 1/0 column "mention_<feature>": 1 when the feature
# name occurs in the post's Clean_Text as a whole word (case-insensitive).
# The \b word boundaries mean inflected forms such as "cameras" are not counted.
for feat in features:
    col = f"mention_{feat}"
    # na=False treats a missing Clean_Text as "not mentioned". Without it,
    # str.contains yields NaN for those rows and .astype(int) raises.
    df[col] = df["Clean_Text"].str.contains(fr"\b{feat}\b", case=False, regex=True, na=False).astype(int)

# Quick check
df.filter(regex="mention_").head(3)

Unnamed: 0,mention_battery,mention_camera,mention_price,mention_display,mention_hinge,mention_performance,mention_durability,mention_software,mention_design,mention_size,mention_weight
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0,1,0,0,0


In [47]:
# One record per Fold version is collected here; the frame is built once at the end
summary = []

# Names of the 0/1 indicator columns, in the same order as `features`
mention_cols = [f"mention_{f}" for f in features]

for version, sub in df.groupby("Version"):
    # Version-wide figures: post count, total feature mentions, mean polarity
    row = {
        "Version": version,
        "TotalPosts": len(sub),
        "OverallMentions": sub[mention_cols].sum().sum(),
        "OverallPolarity": sub["Polarity"].mean(),
    }

    # Per-feature figures: how many posts mention it, and the mean polarity
    # of just those posts (0.0 when no post mentions the feature)
    for feat, mcol in zip(features, mention_cols):
        n_hits = sub[mcol].sum()
        row[f"{feat}_mentions"] = n_hits
        row[f"{feat}_polarity"] = (
            sub.loc[sub[mcol] == 1, "Polarity"].mean() if n_hits > 0 else 0.0
        )

    summary.append(row)

df_feat = pd.DataFrame(summary)
df_feat

Unnamed: 0,Version,TotalPosts,OverallMentions,OverallPolarity,battery_mentions,battery_polarity,camera_mentions,camera_polarity,price_mentions,price_polarity,...,durability_mentions,durability_polarity,software_mentions,software_polarity,design_mentions,design_polarity,size_mentions,size_polarity,weight_mentions,weight_polarity
0,Fold 4,519,595,0.092322,102,0.112972,91,0.124858,68,0.131462,...,21,0.103248,43,0.118108,45,0.123725,50,0.114349,20,0.142453
1,Fold 5,493,592,0.107465,107,0.133532,88,0.137249,63,0.121108,...,22,0.113952,52,0.136064,42,0.141732,47,0.110924,25,0.131902
2,Fold 6,485,487,0.122141,92,0.148525,85,0.153839,47,0.146964,...,20,0.135441,37,0.124256,33,0.123294,49,0.123152,17,0.13752
3,Fold 7,387,386,0.094129,78,0.131849,69,0.126338,41,0.137222,...,12,0.090985,27,0.120748,29,0.132759,32,0.119361,12,0.14037


In [49]:
# The sales table (Model, LaunchDate, FirstMonthSales) labels the model as
# "Model", while the feature table keys on "Version"; align the key name first.
df_sales = df_sales.rename(columns={"Model": "Version"})

# Left join: every version in df_feat is kept, whether or not it has a sales row
df_model = pd.merge(df_feat, df_sales, how="left", on="Version")

# Show the first rows of the combined table
df_model.head()

Unnamed: 0,Version,TotalPosts,OverallMentions,OverallPolarity,battery_mentions,battery_polarity,camera_mentions,camera_polarity,price_mentions,price_polarity,...,software_mentions,software_polarity,design_mentions,design_polarity,size_mentions,size_polarity,weight_mentions,weight_polarity,LaunchDate,FirstMonthSales
0,Fold 4,519,595,0.092322,102,0.112972,91,0.124858,68,0.131462,...,43,0.118108,45,0.123725,50,0.114349,20,0.142453,2022-07-26,300000.0
1,Fold 5,493,592,0.107465,107,0.133532,88,0.137249,63,0.121108,...,52,0.136064,42,0.141732,47,0.110924,25,0.131902,2023-07-28,375000.0
2,Fold 6,485,487,0.122141,92,0.148525,85,0.153839,47,0.146964,...,37,0.124256,33,0.123294,49,0.123152,17,0.13752,2024-07-31,450000.0
3,Fold 7,387,386,0.094129,78,0.131849,69,0.126338,41,0.137222,...,27,0.120748,29,0.132759,32,0.119361,12,0.14037,2025-07-25,


In [51]:
# Write the merged table to disk without the row index, then report where it went
df_model.to_csv(path_or_buf=model_data_out, index=False)
print("Modeling data saved to", model_data_out)

Modeling data saved to /Users/shreychaudhary/Documents/Samsung_Fold7_Sales_Prediction/data/processed/model_data.csv
