# Stage 0 Loading Data
- Outcome: Load whole dataset as `df`

In [1]:
import pandas as pd 
# Location of the full ads training set, relative to the notebook's working directory.
ads_train_csv = '../Data/train_data_ads.csv'

# Read the whole CSV into `df`; the original note on this cell recorded about 31.8 s.
df = pd.read_csv(ads_train_csv)
print("data is loaded...")

data is loaded...


# Stage 1 Pick an Advertisement Task

- Outcome: 
    - Load Task 18800 as `df_18800`
    - Drop the columns that have more than 1000 unique values or that hold non-numerical values.

In [2]:
# Boolean mask marking the rows whose task_id equals 18800; use it to select those rows.
is_task_18800 = df['task_id'] == 18800
df_18800 = df[is_task_18800]

In [3]:
import numpy as np
# Importing the NumPy library, which provides support for large, multi-dimensional arrays and matrices,
# along with a large collection of high-level mathematical functions to operate on these arrays.

# Cardinality cut-off: a column with more distinct values than this is dropped.
max_unique_values = 1000

# Count distinct values for every column at once, then list the columns above the cut-off
# (kept in the frame's original column order).
unique_counts = df_18800.nunique()
columns_to_drop = unique_counts[unique_counts > max_unique_values].index.tolist()

# Remove the high-cardinality columns, then keep only the columns with a numeric dtype.
df_18800 = (
    df_18800
    .drop(columns=columns_to_drop)
    .select_dtypes(include=[np.number])
)

# Show the surviving columns and their dtypes to confirm that only numeric ones remain.
print(df_18800.dtypes)

label                 int64
age                   int64
gender                int64
residence             int64
city                  int64
city_rank             int64
series_dev            int64
series_group          int64
emui_dev              int64
device_name           int64
device_size           int64
net_type              int64
task_id               int64
adv_id                int64
creat_type_cd         int64
adv_prim_id           int64
inter_type_cd         int64
slot_id               int64
site_id               int64
spread_app_id         int64
hispace_app_tags      int64
app_second_class      int64
app_score           float64
u_refreshTimes        int64
u_feedLifeCycle       int64
dtype: object


# Stage 2 Break Dataset into Training, Holdout, Validate
- Outcome: `df_train`, `df_holdout`, `df_val` (saved in Stage 3 as `df_18800_train.csv`, `df_18800_holdout.csv`, `df_18800_val.csv`)

## Step 2.1 Check Positive Sample Size, Negative Sample Size, Label Rate

In [4]:
# Tally how many rows carry each value of the 'label' column of df_18800.
label_counts = df_18800['label'].value_counts()
# Look the counts up with .get and a default of 0, so a class that is absent from the
# data yields a count of 0 instead of raising KeyError.
positive_count = label_counts.get(1, 0)
negative_count = label_counts.get(0, 0)
# The "label rate" reported here is the positive-to-negative ratio; it is reported as 0
# when there are no negative rows, to avoid dividing by zero.
label_rate = positive_count / negative_count if negative_count != 0 else 0
# Print both class sizes and the ratio, the latter formatted to two decimal places.
print("Positive Sample size is {}, Negative Sample size is {}, label rate is {:.2f}".format(positive_count, negative_count, label_rate))


Positive Sample size is 398, Negative Sample size is 13020, label rate is 0.03


In [5]:
def _split_train_holdout_val(frame, train_frac=0.4, holdout_frac=0.4, seed=42):
    """Randomly split ``frame`` into disjoint train, holdout and validation parts.

    Args:
        frame: DataFrame to split.
        train_frac: Fraction of rows drawn for the training part (truncated to an int count).
        holdout_frac: Fraction of rows drawn for the holdout part (truncated to an int count).
        seed: ``random_state`` passed to both sampling calls.

    Returns:
        Tuple ``(train, holdout, val)``; ``val`` holds every row not drawn for the other two.
    """
    n_rows = len(frame)
    n_train = int(train_frac * n_rows)
    n_holdout = int(holdout_frac * n_rows)

    train = frame.sample(n=n_train, random_state=seed)
    # Rows are removed by index label, so this assumes the index labels are unique
    # (TODO confirm; duplicated labels would remove more rows than were sampled).
    remainder = frame.drop(train.index)
    holdout = remainder.sample(n=n_holdout, random_state=seed)
    val = remainder.drop(holdout.index)
    return train, holdout, val


# Separate the two classes so each is split with the same proportions (stratified split).
df_label_0 = df_18800[df_18800['label'] == 0]
df_label_1 = df_18800[df_18800['label'] == 1]

# Split the label == 0 rows into 40% train / 40% holdout / remaining validation.
df_label_0_train, df_label_0_hold, df_label_0_val = _split_train_holdout_val(df_label_0)

# Split the label == 1 rows the same way.
df_label_1_train, df_label_1_hold, df_label_1_val = _split_train_holdout_val(df_label_1)

In [6]:
# For each split, stack the label == 0 part on top of the label == 1 part.
# The pairs are listed in the order train, holdout, validation.
df_train, df_holdout, df_val = (
    pd.concat([negatives, positives])
    for negatives, positives in (
        (df_label_0_train, df_label_1_train),
        (df_label_0_hold, df_label_1_hold),
        (df_label_0_val, df_label_1_val),
    )
)

In [7]:
import pandas as pd
def calculate_label_rate(df):
    """Print the class sizes and the positive-to-negative label ratio of ``df``.

    Args:
        df: DataFrame with a 'label' column. Rows equal to 1 are counted as
            positive and rows equal to 0 as negative.

    Returns:
        None. The summary line is written to stdout.

    Note:
        The value printed as "label rate" is positives divided by negatives
        (not positives divided by all rows). It is reported as 0 when the
        frame has no negative rows.
    """
    per_label = df['label'].value_counts()
    # .get falls back to 0 when a class does not occur in the frame.
    n_positive, n_negative = (per_label.get(value, 0) for value in (1, 0))

    if n_negative != 0:
        ratio = n_positive / n_negative
    else:
        ratio = 0

    summary = "Total Sample size is {}, Positive Sample size is {}, Negative Sample size is {}, label rate is {:.2f}".format(len(df), n_positive, n_negative, ratio)
    print(summary)

In [8]:
# Report the class balance of the full task frame, then of each split, in that order.
for frame in (df_18800, df_train, df_holdout, df_val):
    calculate_label_rate(frame)

Total Sample size is 13418, Positive Sample size is 398, Negative Sample size is 13020, label rate is 0.03
Total Sample size is 5367, Positive Sample size is 159, Negative Sample size is 5208, label rate is 0.03
Total Sample size is 5367, Positive Sample size is 159, Negative Sample size is 5208, label rate is 0.03
Total Sample size is 2684, Positive Sample size is 80, Negative Sample size is 2604, label rate is 0.03


# Stage 3 Save the Three Sample Datasets

In [9]:
# Shuffle the rows of each split and renumber the index from 0.
# A fixed random_state (42, the same seed used when the splits were drawn) makes the
# row order, and therefore the CSV files written afterwards, identical on every run.

# Randomly reorder the rows of df_train.
df_train_shuffled = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Randomly reorder the rows of df_holdout.
df_holdout_shuffled = df_holdout.sample(frac=1, random_state=42).reset_index(drop=True)

# Randomly reorder the rows of df_val.
df_val_shuffled = df_val.sample(frac=1, random_state=42).reset_index(drop=True)

# Shuffling must not change the class balance; print it again as a check.
calculate_label_rate(df_train_shuffled)
calculate_label_rate(df_holdout_shuffled)
calculate_label_rate(df_val_shuffled)

Total Sample size is 5367, Positive Sample size is 159, Negative Sample size is 5208, label rate is 0.03
Total Sample size is 5367, Positive Sample size is 159, Negative Sample size is 5208, label rate is 0.03
Total Sample size is 2684, Positive Sample size is 80, Negative Sample size is 2604, label rate is 0.03


In [10]:
# Map each output path to the shuffled frame that is written there
# (train, holdout, validation, in that order).
shuffled_outputs = {
    '../Data/df_18800_train.csv': df_train_shuffled,
    '../Data/df_18800_holdout.csv': df_holdout_shuffled,
    '../Data/df_18800_val.csv': df_val_shuffled,
}

# Write every frame as CSV without the index column.
for csv_path, shuffled_frame in shuffled_outputs.items():
    shuffled_frame.to_csv(csv_path, index=False)