# Community Analysis

## Table of Contents

   1. [imports](#imports)
   2. [analyze mean similarity communities](#analyze-mean-similarity-communities)
   3. [analyze feature group communities](#analyze-feature-group-communities)

## imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import plotly.express as px

import re

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP
from sklearn.decomposition import PCA



In [2]:
# Load the pickled `metascripts` object from the data directory.
# presumably a pandas DataFrame (per the `_df` in the file name) — confirm.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/metascripts_df_sentiment.pickle', 'rb') as file:
    metascripts = pickle.load(file)

In [3]:
# Load the pickled `features_dict` object from the data directory.
# presumably a dict (per the name); key/value schema is not visible here — confirm.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/features_dict.pickle', 'rb') as file:
    features_dict = pickle.load(file)

In [4]:
# Load the pickled `feature_cosims_dict` object from the data directory.
# presumably per-feature cosine similarities (per the name) — confirm schema.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/feature_cosims_dict.pickle', 'rb') as file:
    feature_cosims_dict = pickle.load(file)

In [116]:
# Load the pickled `metacosims` object from the data directory.
# presumably a pandas DataFrame (per the `_df` in the file name) — confirm.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/metacosims_df.pickle', 'rb') as file:
    metacosims = pickle.load(file)

In [111]:
# Load the pickled `feature_coms` object from the feature_communities directory.
# presumably a dict (per the `_dict` in the file name) — confirm schema.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/feature_communities/feature_coms_dict.pickle', 'rb') as file:
    feature_coms = pickle.load(file)

In [112]:
# Load the pickled `all_coms` object from the feature_communities directory.
# presumably a pandas DataFrame (per the `_df` in the file name) — confirm.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/feature_communities/all_coms_df.pickle', 'rb') as file:
    all_coms = pickle.load(file)

## exploration of principal components and feature (show) weights for feature similarities

In [None]:
def pca_component_weights(feature_mean_com_group, component = 0):
    """
    Fit a PCA on a feature mean community group and rank its input columns
    by their weight on one principal component.

    Parameters
    ----------
    feature_mean_com_group : pandas.DataFrame
        Must contain 'description' and 'community' columns. Both are dropped
        and every remaining column is passed to the PCA as a feature.
        Presumably the remaining columns are named after shows
        (descriptions) -- confirm against the caller.
    component : int, default 0
        Row index into the fitted PCA's ``components_`` array.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'description' (the name of each input column) and
        'coefficient' (that column's weight on the selected component),
        sorted by 'coefficient' from largest to smallest.

    Raises
    ------
    KeyError
        If 'description' or 'community' is missing from the input frame.
    """
    X = feature_mean_com_group.drop(columns = ['description', 'community'])

    # Only the fitted loadings are read below, so fit without projecting X.
    mapper = PCA().fit(X)

    # scikit-learn only sets feature_names_in_ when X has string column names.
    components = pd.DataFrame({
        'description': mapper.feature_names_in_,
        'coefficient': mapper.components_[component]
    })

    return components.sort_values('coefficient', ascending=False)

def feature_reduction_map(feature_mean_com_group, mapping = 'umap', show_component_weights = False, component_x = 0, component_y = 1):
    """
    Reduce a feature mean community group with UMAP or PCA and return a
    scatter chart of two reduced components, colored by community.

    Parameters
    ----------
    feature_mean_com_group : pandas.DataFrame
        Must contain 'description' and 'community' columns. Both are dropped
        before fitting; 'community' colors the points and 'description' is
        shown on hover.
    mapping : str, default 'umap'
        'umap' selects UMAP; any other value selects PCA.
    show_component_weights : bool, default False
        When True, print the output of ``pca_component_weights`` for
        ``component_x`` and ``component_y``.
    component_x, component_y : int, default 0 and 1
        Column indices of the reduced array plotted on the x and y axes.

    Returns
    -------
    The figure object returned by ``px.scatter``.
    """
    if mapping == 'umap':
        # UMAP embeds into 2 dimensions by default, so size the embedding to
        # cover the requested axes; the default (0, 1) still yields 2.
        n_dims = max(2, component_x + 1, component_y + 1)
        mapper = UMAP(n_components = n_dims)
    else:
        mapper = PCA()

    X = feature_mean_com_group.drop(columns = ['description', 'community'])
    feature_group_map = mapper.fit_transform(X)

    fig = px.scatter(x = feature_group_map[:,component_x],
               y = feature_group_map[:,component_y],
               color=feature_mean_com_group['community'].astype(str),
               hover_data = [feature_mean_com_group['description']],
               template = 'simple_white'
               )

    if show_component_weights:
        # These weights come from a separate PCA fit inside
        # pca_component_weights, even when the chart itself uses UMAP.
        comp_x_weights = pca_component_weights(feature_mean_com_group, component_x)
        comp_y_weights = pca_component_weights(feature_mean_com_group, component_y)
        print(comp_x_weights,'\n\n', comp_y_weights)

    return fig

In [None]:
# Plot PCA components 0 and 1 for the 'sur' entry of `feature_mean_coms`
# (missing values replaced with 0) and print the PCA weights for both axes.
# NOTE(review): `feature_mean_coms` is not defined in the visible cells (only
# `feature_coms` is loaded above) -- confirm it exists before this cell runs.
feature_reduction_map(feature_mean_coms['sur'].fillna(0), 
                      mapping = 'pca', 
                      show_component_weights = True,
                      component_x = 0, 
                      component_y = 1)

## unsupervised analysis of mean cosine similarities and communities

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP

In [None]:
# Attach each row's community label by joining on the index (descom_df is
# re-indexed by 'description' first), then store the label as a string.
cosims_means_com = (
    cosims_means_df
    .merge(descom_df.set_index('description'), left_index=True, right_index=True)
    .assign(community=lambda merged: merged['community'].astype(str))
)

# Feature matrix: every column except the community label.
X = cosims_means_com.drop(columns='community')

### pca & feature importances