In [77]:
import numpy as np
from data_clean_utils import *
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
# If working on a Windows machine try %matplotlib inline or %matplotlib qt instead of below line
%matplotlib osx

In [87]:
df = pl.read_csv('data/contacts.csv')
# clean all nulls
df_dropped = drop_columns_that_are_all_null(df,verbose=False)
# convert string date to dates
df_date = convert_dates(df_dropped,verbose=False)
# drop data that doesn't have membership value
df_member = df_date.drop_nulls(subset=['Membership Level'])

In [102]:
df_tsne = df_member.clone()
irrelevant_cols = ['Record ID - Contact', 'Likelihood to close', 'Latitude', 'Longitude', 'Marketing emails bounced'
                   'Event Revenue','Net Basegroup Member Growth Since MF23',
                   'Precise Basegroup Membership Net Growth for Focus Reports','Precise Basegroup Membership Net Growth Since Congress',
                   'Basegroup Membership Net Growth']
num_cols = []
for col in df_tsne:
    if col.name not in irrelevant_cols:
        if col.dtype in [pl.Int64,pl.Int32,pl.Float32,pl.Float64]:
            # numerical values
            tmp_unique = col.unique().drop_nulls()
            if len(tmp_unique) > 2:
                # more than 1 unique value1, should get rid of constants
                num_cols.append(col.name)
df_tsne = df_tsne.select(num_cols)
df_tsne = df_tsne.with_columns(pl.col("*").fill_null(strategy="zero"))

print(len(df_tsne))
df_tsne.describe()

807


describe,Number of Sales Activities,Number of times contacted,Number of Associated Deals,Average Pageviews,Number of Pageviews,Number of Sessions,Number of Form Submissions,Number of Unique Forms Submitted,Marketing emails clicked,Marketing emails delivered,Marketing emails opened,Sends Since Last Engagement,Big Survey - Staff Progress per Employee,Big Survey has Basegroup link,Number of Big Surveys Collected,Respondant Has Big Survey CampaignActivity
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0,807.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",2.105328,1.468401,0.610905,1.443618,85.092937,16.607187,2.561338,2.022305,4.317224,50.449814,16.241636,9.095415,0.063544,2.972739,3.178439,0.57373
"""std""",10.289244,4.525766,3.908035,2.265061,479.979652,54.272569,4.35957,2.608347,7.525374,28.010493,21.002241,12.845216,0.402509,19.358425,20.125299,0.652693
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,1.0,1.0,0.0,0.0,0.0,0.0
"""50%""",1.0,0.0,0.0,1.0,4.0,3.0,1.0,1.0,1.0,54.0,7.0,3.0,0.0,0.0,0.0,0.0
"""75%""",2.0,2.0,0.0,2.0,19.0,10.0,3.0,3.0,6.0,68.0,25.0,12.0,0.0,0.0,0.0,1.0
"""max""",266.0,105.0,74.0,35.0,10308.0,716.0,83.0,35.0,69.0,135.0,128.0,65.0,7.6,380.0,380.0,4.0


In [100]:
df_tsne.columns

['Number of Sales Activities',
 'Number of times contacted',
 'Number of Associated Deals',
 'Average Pageviews',
 'Number of Pageviews',
 'Number of Sessions',
 'Number of Form Submissions',
 'Number of Unique Forms Submitted',
 'Marketing emails clicked',
 'Marketing emails delivered',
 'Marketing emails opened',
 'Sends Since Last Engagement',
 'Big Survey - Staff Progress per Employee',
 'Big Survey has Basegroup link',
 'Number of Big Surveys Collected',
 'Respondant Has Big Survey CampaignActivity']

In [103]:
variables = df_tsne.to_pandas()
targets = df_member['Membership Level'].to_numpy()
likelihood = df_member['Likelihood to close'].to_numpy()
tsne = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=5000)
tsne_results = tsne.fit_transform(variables)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 807 samples in 0.000s...
[t-SNE] Computed neighbors for 807 samples in 0.010s...
[t-SNE] Computed conditional probabilities for sample 807 / 807
[t-SNE] Mean sigma: 5.218514
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.338074
[t-SNE] KL divergence after 3700 iterations: 0.500296


In [110]:
def plot_tSNE(tsne_results:np.ndarray,targets=None,likelihood=None, is2d:str=True):
    f = plt.figure(figsize=(8,8))
    if is2d:
        ax = f.add_subplot(111)
    else:
        ax = f.add_subplot(111,projection='3d')

    if likelihood is None:
        class_labels = np.unique(targets)
        color_dict = dict(zip(class_labels,np.arange(0,len(class_labels))))
        label_vals = [color_dict[k] for k in targets]
        vmin=0
        vmax=len(class_labels)-1
    else:
        label_vals = likelihood
        vmin = 0
        vmax = 100

    cmap = mpl.colormaps.get_cmap('jet')  # viridis is the default colormap for imshow
    cmap.set_bad(color='black')

    if is2d:
        s=ax.scatter(tsne_results[:,0],tsne_results[:,1],c=label_vals, cmap = cmap, vmin=vmin,vmax=vmax)
    else:
        s=ax.scatter(tsne_results[:,0],tsne_results[:,1],tsne_results[:,2],
                     linewidtsh=2,
                     c=label_vals, 
                     edgecolors='k',
                     cmap = 'jet', 
                     vmin=0,vmax=2)
      
    annot = ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"),
                    arrowprops=dict(arrowstyle="->"))
    annot.set_visible(False)
    
    def update_annot(ind):
        pos = s.get_offsets()[ind["ind"][0]]
        annot.xy = pos
        text = f"hello"
        annot.set_text(text)
        annot.get_bbox_patch().set_facecolor('white')
        annot.get_bbox_patch().set_alpha(0.4)


    fontsize = 16
    ax.set_xlabel('tSNE-1', fontsize=fontsize)
    ax.set_ylabel('tSNE-2', fontsize=fontsize)
    if not is2d:
        ax.set_zlabel('tSNE-3', fontsize=fontsize)
    ax.tick_params(labelsize=fontsize,width=1,length=6)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    if likelihood is None:
        plt.legend(s.legend_elements()[0], class_labels,fontsize=fontsize,frameon=False)
    else:
        # colorbar
        cbar =f.colorbar(s)
        cbar.ax.tick_params(labelsize=fontsize,width=1,length=6)
        cbar.ax.set_ylabel('Likelihood', fontsize=fontsize)
        
    f.canvas.mpl_connect("motion_notify_event", hover)


In [107]:
plot_tSNE(tsne_results,targets,likelihood)

## Now do the same on non member data

In [95]:
df_non_member = df_date.filter(pl.col('Membership Level').is_null())

In [97]:
df_tsne_non = df_non_member.clone()
irrelevant_cols = ['Record ID - Contact', 'Likelihood to close', 'Latitude', 'Longitude', 
                   'Event Revenue','Net Basegroup Member Growth Since MF23',
                   'Precise Basegroup Membership Net Growth for Focus Reports','Precise Basegroup Membership Net Growth Since Congress',
                   'Basegroup Membership Net Growth']
num_cols = []
for col in df_tsne_non:
    if col.name not in irrelevant_cols:
        if col.dtype in [pl.Int64,pl.Int32,pl.Float32,pl.Float64]:
            # numerical values
            tmp_unique = col.unique().drop_nulls()
            if len(tmp_unique) > 2:
                # more than 1 unique value1, should get rid of constants
                num_cols.append(col.name)
df_tsne_non = df_tsne_non.select(num_cols)
df_tsne_non = df_tsne_non.with_columns(pl.col("*").fill_null(strategy="zero"))

print(len(df_tsne_non))
df_tsne_non.describe()

19193


describe,Number of Sales Activities,Number of times contacted,Number of Associated Deals,Average Pageviews,Number of Pageviews,Number of Sessions,Number of Form Submissions,Number of Unique Forms Submitted,Marketing emails bounced,Marketing emails clicked,Marketing emails delivered,Marketing emails opened,Sends Since Last Engagement,Big Survey - Staff Progress per Employee,Big Survey has Basegroup link,Number of Big Surveys Collected,Respondant Has Big Survey CampaignActivity
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0,19193.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",0.055385,0.024488,0.010941,0.437034,1.393268,0.795655,0.420153,0.367426,0.01834,0.278383,7.93706,1.541395,3.352889,8.2e-05,0.0124,0.004116,0.041109
"""std""",2.854906,0.272821,0.541701,1.115051,12.245644,3.733269,0.861074,0.651133,0.152023,1.55689,14.675793,5.376059,7.658132,0.004914,0.11573,0.245709,0.221378
"""min""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,0.0
"""max""",391.0,12.0,59.0,49.0,1225.0,334.0,23.0,12.0,6.0,42.0,88.0,73.0,42.0,0.48,3.0,24.0,4.0


In [101]:
df_tsne_non.columns

['Number of Sales Activities',
 'Number of times contacted',
 'Number of Associated Deals',
 'Average Pageviews',
 'Number of Pageviews',
 'Number of Sessions',
 'Number of Form Submissions',
 'Number of Unique Forms Submitted',
 'Marketing emails bounced',
 'Marketing emails clicked',
 'Marketing emails delivered',
 'Marketing emails opened',
 'Sends Since Last Engagement',
 'Big Survey - Staff Progress per Employee',
 'Big Survey has Basegroup link',
 'Number of Big Surveys Collected',
 'Respondant Has Big Survey CampaignActivity']

In [115]:
# sample 2000
sample_df_tsne_non = df_tsne_non.sample(n=2000)

In [111]:
variables_non = sample_df_tsne_non.to_pandas()
likelihood = df_non_member['Likelihood to close'].to_numpy()
tsne_non = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=5000)
tsne_results_non = tsne.fit_transform(variables_non)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 19193 samples in 0.001s...
[t-SNE] Computed neighbors for 19193 samples in 0.397s...
[t-SNE] Computed conditional probabilities for sample 1000 / 19193
[t-SNE] Computed conditional probabilities for sample 2000 / 19193
[t-SNE] Computed conditional probabilities for sample 3000 / 19193
[t-SNE] Computed conditional probabilities for sample 4000 / 19193
[t-SNE] Computed conditional probabilities for sample 5000 / 19193
[t-SNE] Computed conditional probabilities for sample 6000 / 19193
[t-SNE] Computed conditional probabilities for sample 7000 / 19193
[t-SNE] Computed conditional probabilities for sample 8000 / 19193
[t-SNE] Computed conditional probabilities for sample 9000 / 19193
[t-SNE] Computed conditional probabilities for sample 10000 / 19193
[t-SNE] Computed conditional probabilities for sample 11000 / 19193
[t-SNE] Computed conditional probabilities for sample 12000 / 19193
[t-SNE] Computed conditional probabilities for sam

In [114]:
df_tsne_non['Likelihood to close']

nan

In [112]:
plot_tSNE(tsne_results_non,likelihood=likelihood)