In [1]:
import numpy as np
from data_clean_utils import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# The `osx` backend only works on macOS; on Windows or Linux use `%matplotlib inline` or `%matplotlib qt` instead of the line below
%matplotlib osx

## Clean the data 

In [3]:
# Load the raw contacts export, then clean it in three named stages so each
# intermediate frame stays available for inspection:
#   df_member  - rows that have a 'Membership Level' value
#   df_dropped - result of drop_columns_that_are_all_null (helper from
#                data_clean_utils; presumably removes columns that are
#                entirely null - confirm against the helper)
#   df_date    - result of convert_dates (helper from data_clean_utils;
#                presumably parses date strings into dates - confirm)
df = pl.read_csv('data/contacts.csv')
df_member = df.drop_nulls(subset=['Membership Level'])
df_dropped = drop_columns_that_are_all_null(df_member, verbose=False)
df_date = convert_dates(df_dropped, verbose=False)

## Run PCA on the numerical values

In [4]:
df_pca = df_date.clone()

# Hand-picked columns that are left out of the PCA even though they are numeric.
irrelevant_cols = [
    'Record ID - Contact',
    'Likelihood to close',
    'Latitude',
    'Longitude',
    'Event Revenue',
    'Net Basegroup Member Growth Since MF23',
    'Precise Basegroup Membership Net Growth for Focus Reports',
    'Precise Basegroup Membership Net Growth Since Congress',
    'Basegroup Membership Net Growth',
]
numeric_dtypes = [pl.Int64, pl.Int32, pl.Float32, pl.Float64]

# Keep a column only when it is numeric, not black-listed, and has MORE THAN
# TWO distinct non-null values. That threshold removes constant columns and
# also columns with just two distinct values.
num_cols = [
    series.name
    for series in df_pca
    if series.name not in irrelevant_cols
    and series.dtype in numeric_dtypes
    and len(series.unique().drop_nulls()) > 2
]

# Restrict to the selected columns and replace remaining nulls with zero.
df_pca = df_pca.select(num_cols).with_columns(pl.col("*").fill_null(strategy="zero"))

print(len(df_pca))
df_pca.head()

807


Number of Sales Activities,Number of times contacted,Number of Associated Deals,Average Pageviews,Number of Pageviews,Number of Sessions,Number of Form Submissions,Number of Unique Forms Submitted,Marketing emails clicked,Marketing emails delivered,Marketing emails opened,Sends Since Last Engagement,Big Survey - Staff Progress per Employee,Big Survey has Basegroup link,Number of Big Surveys Collected,Respondant Has Big Survey CampaignActivity
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64
7,2,3,6,2735,427,6,6,8,80,38,0,2.84,53,142,1
6,4,0,0,0,0,0,0,0,34,1,11,0.0,0,0,0
0,0,0,1,7,6,1,1,5,59,13,3,0.0,0,0,1
1,1,0,0,0,1,1,1,1,45,1,34,0.0,0,0,0
2,2,0,1,8,6,1,1,9,69,56,0,0.0,0,0,0


In [5]:
# Standardise every selected feature before the decomposition.
variables = df_pca.to_pandas()
x = StandardScaler().fit_transform(variables)

# Fit a 3-component PCA and project the standardised samples.
pca = PCA(n_components=3)
x_new = pca.fit_transform(x)

# Loadings arranged with one row per feature and one column per component.
coeff = pca.components_[0:3,:].T
feature_labels = df_pca.columns

# Membership level of every row, as an (n, 1) array; df_pca was selected from
# df_date, so rows line up with x_new.
target = df_date.select('Membership Level').to_numpy()
# NOTE(review): not read by any cell visible here; plot_PCA derives its own
# legend labels from the targets with np.unique.
class_labels = ['Militant','Basegroup','Consultative']

# Percentage of variance explained by each of the three components.
print(pca.explained_variance_ratio_*100)

[36.27529294 17.34346206 14.14349248]


## Plot PCA scores and feature loadings

In [6]:
def plot_PCA(score:np.ndarray,
             coeff:np.ndarray,
             targets:np.ndarray,
             num_features:int=None,
             feature_labels=None) -> None:
    """Draw a 3-D PCA biplot: class-coloured scores plus the strongest loadings.

    Parameters
    ----------
    score : np.ndarray
        Projected samples; columns 0, 1 and 2 are plotted as PC 1, PC 2, PC 3.
        Each axis is rescaled independently by the inverse of its value range.
    coeff : np.ndarray
        Loadings with one row per feature and (at least) three columns.
    targets : np.ndarray
        Class label of every sample, either flat or as an ``(n, 1)`` column
        (for 2-D input the first column is used).
    num_features : int, optional
        Number of loading arrows to draw, chosen by largest squared norm over
        the components. Defaults to every feature; larger values are clamped
        to the number of rows in ``coeff``.
    feature_labels : sequence of str, optional
        Feature names indexed like the rows of ``coeff``. When omitted, arrows
        are labelled ``"Var<k>"`` where ``k`` is the 1-based row index.

    Returns
    -------
    None
        The figure is created through ``plt`` as a side effect.
    """
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1,projection='3d')

    coeff = np.asarray(coeff)
    n_total = len(coeff)
    if num_features is None:
        num_features = n_total
    # Clamp so that a request larger than the feature count cannot over-index.
    num_features = max(0, min(int(num_features), n_total))

    # Row indices of the strongest loadings (ascending strength).
    strength_order = (coeff ** 2).sum(axis=1).argsort()
    tops = strength_order[n_total - num_features:]
    arrows = coeff[tops]

    def _unit_range(values):
        # Scale by 1/range; leave a constant axis untouched instead of dividing by zero.
        span = values.max() - values.min()
        return values * (1.0 / span) if span else values

    xs = _unit_range(score[:,0])
    ys = _unit_range(score[:,1])
    zs = _unit_range(score[:,2])

    # Map every class label to an integer colour code.
    targets_arr = np.asarray(targets)
    class_labels = np.unique(targets_arr)
    color_dict = dict(zip(class_labels, range(len(class_labels))))
    per_sample = targets_arr[:,0] if targets_arr.ndim > 1 else targets_arr
    label_vals = [color_dict[k] for k in per_sample]

    # Colour range follows the number of classes (at least 1 wide so that a
    # single class does not give vmin == vmax).
    s = ax.scatter(xs, ys, zs, c=label_vals, cmap = 'jet',
                   vmin=0, vmax=max(len(class_labels) - 1, 1))

    # Loading arrows are drawn unscaled; each is labelled with the feature it
    # actually belongs to (its row index in coeff, not its rank among the arrows).
    for arrow, feature_idx in zip(arrows, tops):
        ax.quiver(0, 0, 0, arrow[0], arrow[1], arrow[2], color = 'r',alpha = 0.5)
        if feature_labels is None:
            label = "Var"+str(feature_idx+1)
        else:
            label = feature_labels[feature_idx]
        ax.text(arrow[0]* 1.15, arrow[1] * 1.15, arrow[2]*1.15, label,
                color = 'k', ha = 'center', va = 'center')

    fontsize = 16
    ax.set_xlabel('PC 1', fontsize=fontsize)
    ax.set_ylabel('PC 2', fontsize=fontsize)
    ax.set_zlabel('PC 3', fontsize=fontsize)
    ax.tick_params(labelsize=fontsize,width=3,length=8)

    plt.legend(s.legend_elements()[0], class_labels,fontsize=fontsize,frameon=False)


In [7]:
# 3-D biplot of the PCA scores coloured by membership level, requesting the
# seven strongest feature loadings as arrows.
plot_PCA(x_new, coeff, target, num_features=7, feature_labels=feature_labels)

In [8]:
# Heatmap of the loading matrix: one row per feature, one column per component.
ff, ax = plt.subplots(figsize=(8,8))
s = ax.imshow(coeff, cmap='twilight_shifted')

# Columns are the three principal components, rows the PCA input features.
component_names = ['PCA1','PCA2','PCA3']
ax.set_xticks(list(range(len(component_names))))
ax.set_xticklabels(component_names)
ax.set_yticks(list(range(len(df_pca.columns))))
ax.set_yticklabels(df_pca.columns)

# Title reports the total variance captured by the three components.
ax.set_title(f'Explained Variance: {round(np.sum(pca.explained_variance_ratio_*100),2)}%')

# colorbar
fontsize = 12
cbar = ff.colorbar(s)
cbar.ax.tick_params(labelsize=fontsize, width=1, length=6)
cbar.ax.set_ylabel('Effect in dimension', fontsize=fontsize)

Text(0, 0.5, 'Effect in dimension')