In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression



pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
ad_test_research = pd.read_excel("data/Datathon@MetuStatClub Final.xlsx", sheet_name="Reklam Test Araştırması")

In [3]:
# clean the data
ad_test_research = ad_test_research.replace(r'^[xX]+$', np.nan, regex=True)


In [4]:
cat_features = [
    "Product status",
    "Ad format",
]

drop_features = [ # all of these features has only NaN values
    "BOX",
    "BOX.1",
    "Portrays Top",
    "Portrays Top2",
    "Will win awards Top",
    "Will win awards Top2",
    "Is interesting Top",
    "Is interesting Top2",
    "Is offensive Top",
    "Is offensive Top2",
    "Is silly Top",
    "Is silly Top2",
    "Is humourus Top",
    "Is humourus Top2",
    "Is clever Top",
    "Is clever Top2",
    "Is lively Top",
    "Is lively Top2",
    "Is imaginative Top",
    "Is imaginative Top2",
    "Is convincing Top",
    "Is convincing Top2",
    "P_Relevance Top",
    "P_Relevance Top2",
    "P_Differentiation Top",
    "P_Differentiation Top2",
    "P_Popularity Top",
    "P_Popularity Top2",
    "P_Familiarity Top",
    "P_Familiarity Top2",
    "P_Quality Top",
    "P_Quality Top2",
    "P_Commitment Top (Only Brand for me)",
    "P_Commitment Top2 (Only Brand for me)",
    "Is warm Top2", # these two are not entirely null but mostly null
    "Is warm Top",
]

df = pd.get_dummies(data=ad_test_research, columns=cat_features, dtype=np.int8)
df = df.drop(drop_features, axis=1)

In [5]:
rows_to_discard = df.isna().sum(axis=1).sort_values(ascending=False)[:2]
print(f"{len(rows_to_discard)} rows discarded that had {list(rows_to_discard)[0] / df.shape[1]}% null values")
df = df.drop(rows_to_discard.index.tolist(), axis=0)


2 rows discarded that had 0.576271186440678% null values


In [6]:
print("Number of empty values in each column:")
df.isna().sum(axis=0).sort_values(ascending=False)[:10]

Number of empty values in each column:


Appetizing Top                    88
Appetizing Top2                   88
Chance_Opinion Top2               14
Chance_Opinion Top                14
Mesaj Hitap (Top2Box)             13
Mesaj İletimi (Top2Box)           12
Mesaj İnandırıcılığı (Top2Box)    12
Told important Top2                2
Told important Top                 2
Likeability Bottom                 2
dtype: int64

In [7]:
# fill the null values with mean
df = df.fillna(df.mean())

In [8]:
df_top = df.filter(regex='Top$', axis=1)
df_top2 = df.filter(regex='Top2$', axis=1)

df_top1 = df_top.rename(columns=lambda x: x.replace(' Top', ''))
df_top2 = df_top2.rename(columns=lambda x: x.replace(' Top2', ''))


In [9]:
df.shape, df_top1.shape, df_top2.shape

((153, 59), (153, 21), (153, 21))

## Correlation 

In [16]:
def plot_correlation(df):
    correlation = df.corr()
    correlation = correlation["Notice & remember"].sort_values(ascending=False)
    fig = px.bar(y=correlation.index, x=correlation.values, orientation="h", title="Correlation with Notice & remember", labels={"x": "Correlation", "y": "Features"}, height=800, width=1000)
    fig.show()
    # save the plot
    fig.write_image("plots/correlation.png")

plot_correlation(df_top1)



According to this graph 3 most highly correlated features are:
- People will talk about
- Is Unique
- Like to see on TV

In [17]:
plot_correlation(df_top2)

On the other hand, according to df_top2 (which means 4 or 5 star rating) 3 most highly correlated features are:
- Likeability
- People will talk about
- Like to see on TV

But on the other hand this graph also show negative correlation
- Is Irritating
- Is Confusing

That means trying to make a less irritating and confusing ad will also help to get a higher "Notice & Remember" rating.

## Feature Importance

In [21]:
# Feature Importance
def plot_feature_importance(df, model):
    X = df.drop(["Notice & remember"], axis=1)
    y = df["Notice & remember"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    feature_importance = pd.DataFrame({"feature": X.columns, "importance": model.feature_importances_}).sort_values(by="importance", ascending=False)

    fig = px.bar(y=feature_importance["feature"], x=feature_importance["importance"], orientation="h", title="Feature Importance", labels={"x": "Importance", "y": "Features"}, height=800, width=1000)
    fig.show()
    # save the plot
    fig.write_image("plots/feature_importance.png")


plot_feature_importance(df_top1, RandomForestRegressor(n_estimators=100, random_state=42))


Feature Importance is a technique to select the most important features from the dataset. It is a statistical method that helps us to select the most important features that we can use to train our model. Feature importance gives you a score for each feature of your data, the higher the score more important or relevant is the feature towards your output variable.

The difference between corralation and Feature Importance is that correlation is a measure of the linear relationship between two continuous variables. Feature Importance is a measure of the information gain from a feature in a dataset.

So if you want to predict the "Notice & Remember" rating of an ad, you should focus on the features that have the highest correlation and feature importance.

In [22]:
plot_feature_importance(df_top2, RandomForestRegressor(n_estimators=100, random_state=42))

## Mutual Information

In [18]:
# lets draw a graph out of mutual information 

def plot_mutual_information(df):
    X = df.drop(["Notice & remember"], axis=1)
    y = df["Notice & remember"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    mi = mutual_info_regression(X_train, y_train)
    mi = pd.Series(mi)
    mi.index = X_train.columns
    results = mi.sort_values(ascending=False)[:10]

    fig = px.bar(y=results.index, x=results.values, orientation="h", title="Mutual Information", labels={"x": "Mutual Information", "y": "Features"}, height=800, width=1000)
    fig.show()
    # save the plot
    fig.write_image("plots/mutual_information.png")
    

plot_mutual_information(df_top1)

Mutual information (MI) between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.

The difference between correlation and mutual information is that correlation measures linear relationship between two continuous variables. Mutual information measures the information that X and Y share: it measures how much knowing one of these variables reduces uncertainty about the other.

But in a semantic way, correlation is a measure of how much two variables change together, while mutual information is a measure of how much knowing one variable tells you about the other.



In [20]:
plot_mutual_information(df_top2)