# Community Analysis

## Table of Contents

   1. [imports](#imports)
   2. [make dataframes](#make-dataframes)
   3. [analyze mean similarity communities](#analyze-mean-similarity-communities)
   4. [analyze feature group communities](#analyze-feature-group-communities)

## imports

In [None]:
import pandas as pd
import numpy as np

import re

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP

## make dataframes
build from

   - metascripts
   - feature cosims dict
   - features dict

### descriptions and communities

In [None]:
# Extract one (description, community) pair from every entry in column 's' of
# the exported graph file, then attach the community to the graph edge table.
#
# The patterns are raw strings so that `\d` reaches the regex engine as written
# (in a normal string literal it is an invalid escape sequence), and they are
# compiled once because each is applied to every row.
_DESCRIPTION_RE = re.compile(r'(?<=":).+(?=,")')
_COMMUNITY_RE = re.compile(r'(?<="meanCommunity":)\d+')

# Read the file a single time; both lists are derived from the same strings.
node_strings = pd.read_csv('../data/graph_coms_df.csv')['s'].values

# Greedy match: everything between the first `":` and the last `,"`.
descriptions = [_DESCRIPTION_RE.search(string).group(0) for string in node_strings]
# The digits that directly follow `"meanCommunity":`.
communities = [int(_COMMUNITY_RE.search(string).group(0)) for string in node_strings]

descom_df = pd.DataFrame({
        'description': descriptions,
        'community': communities
    })

# NOTE(review): `graph_df` is not defined in this part of the notebook; it is
# assumed to be in scope already with a 'description selection' column.
graph_coms_df = (graph_df
                 .merge(descom_df, left_on = 'description selection', right_on = 'description')
                 .drop(columns = 'description'))
graph_coms_df.to_csv('../data/graph_coms_meta.csv', index = False)

In [None]:
# Relative size of each community: share of rows in descom_df per community
# label (normalize=True returns proportions instead of raw counts).
descom_df['community'].value_counts(normalize = True)

In [None]:
# Peek at one community: take the first community label encountered and show
# rows 40-59 of its members, ordered alphabetically by description.
community_list = descom_df['community'].unique()
com = community_list[0]
in_community = descom_df['community'].eq(com)
descom_df.loc[in_community].sort_values(by = 'description').iloc[40:60]

In [None]:
# How many artists have shows that landed in more than one community?
# NOTE(review): `metascripts` is not defined in this part of the notebook; it is
# assumed to carry 'artist', 'fullTitle' and a column shared with descom_df.
communities_per_artist = (metascripts.merge(descom_df)
                          .groupby('artist')['community']
                          .nunique())
comedians_between = (communities_per_artist > 1).sum()

# Artists with more than one (non-null) title in the dataset.
titles_per_artist = metascripts.groupby('artist')['fullTitle'].count()
comedians_multiple = (titles_per_artist > 1).sum()

print(f'{comedians_between} out of {comedians_multiple}, i.e, {round(100*comedians_between/comedians_multiple,2)}%, of comedians with multiple shows in the dataset had shows in different communities.')

### all similarities with communities
depends upon metascripts and feature_cosims_dict

In [None]:
# Show the keys of feature_cosims_dict; each key is used below as a
# feature-group name when building the '<group> similarity' columns.
feature_cosims_dict.keys()

In [None]:
def _melt_cosims(feature_group):
    """Return one feature group's similarity table in long, pairwise form.

    The table stored under ``feature_cosims_dict[feature_group]`` is indexed by
    ``metascripts['description']`` and melted so that each row holds
    'description selection', 'description comparison' and
    '<feature_group> similarity'.
    """
    return (feature_cosims_dict[feature_group]
                .set_index(metascripts['description'])
                .melt(var_name = 'description comparison',
                      value_name = f'{feature_group} similarity',
                      ignore_index = False)
                .reset_index()
                .rename(columns = {'description': 'description selection'})
            )


# Columns that identify a pair of shows; every melted table shares exactly these.
_pair_keys = ['description selection', 'description comparison']

# Build one wide table with a similarity column per feature group.
# The first group seeds the table and later groups are merged onto it. This
# replaces a bare `except:` that handled the first iteration but would also
# have hidden any genuine failure and reset the table to a single group.
all_features = None
for feature_group in feature_cosims_dict:
    features = _melt_cosims(feature_group)
    if all_features is None:
        all_features = features
    else:
        all_features = all_features.merge(features, on = _pair_keys)

# Keep the original result for an empty dictionary: an empty DataFrame.
if all_features is None:
    all_features = pd.DataFrame()

In [None]:
# Row-wise mean across the per-feature-group similarity columns.
# numeric_only=True is required because the two description columns hold
# strings: pandas >= 2.0 raises a TypeError on them, whereas older versions
# silently skipped non-numeric columns.
all_features['mean similarity'] = all_features.mean(axis = 1, numeric_only = True)

In [None]:
# example
# all_features[all_features['description selection'] == 'Dave Chappelle: 8:46 – Transcript'].nlargest(6, 'mean similarity')

In [None]:
# Metadata columns pulled from `metascripts` for both sides of every pair.
metacols = [
    'description', 'link', 'script characters', 'id',
    'artist', 'title', 'fullTitle', 'year', 'image', 'releaseDate',
    'runtimeMins', 'runtimeStr', 'awards', 'genres',
    'companies', 'contentRating', 'imDbRating',
    'imDbRatingVotes', 'similars', 'languages',
]

# The same metadata slice (without 'similars') is joined twice, so build it once.
show_meta = metascripts[metacols].drop(columns = 'similars')

# First join: metadata of the selected show, keyed on 'description selection'.
with_selection_meta = (all_features
                       .merge(show_meta,
                              left_on = 'description selection',
                              right_on = 'description')
                       .drop(columns = 'description'))

# Second join: metadata of the compared show. Columns present on both sides get
# the " selection" / " comparison" suffixes.
metacosims = with_selection_meta.merge(show_meta,
                                       left_on = 'description comparison',
                                       right_on = 'description',
                                       suffixes = (" selection", " comparison"))

# Drop the column at position 1 and keep everything else, by position.
# NOTE(review): position 1 is presumably 'description comparison', which the
# un-suffixed 'description' column from the second join duplicates — confirm.
kept_positions = [pos for pos in range(len(metacosims.columns)) if pos != 1]
metacosims = metacosims.iloc[:, kept_positions]

In [None]:
# with sqlite3.connect('../StandupRecommenderShiny/data/metacosims_recommender.sqlite') as db:
#      metacosims.to_sql('metacosims', db, if_exists = 'append', index = False)

In [None]:
# Edge list for the similarity graph, written to disk for later use.
# Rows with a mean similarity of 0.99999 or more are excluded (presumably each
# show compared with itself — confirm).
below_threshold = all_features['mean similarity'] < 0.99999
ranked_pairs = all_features.loc[below_threshold].sort_values(
    by = ['mean similarity', 'description selection', 'description comparison'],
    ascending = [False, True, True],
)

# Keep the first row for every distinct mean-similarity value.
# NOTE(review): this presumably collapses mirrored (A, B) / (B, A) pairs, but it
# would equally drop unrelated pairs that happen to share the same value.
graph_df_all_features = ranked_pairs.drop_duplicates(subset = 'mean similarity', keep = 'first')
graph_df_all_features.to_csv('../data/graph_df_all_features.csv', index = False)

### feature-groups with communities
depends upon metascripts and features_dict

In [None]:
def _with_communities(group_features):
    """Append the description column to a feature table and join communities.

    The description column is concatenated column-wise (aligned on the index)
    and the result is merged with ``descom_df`` on their shared columns.
    """
    described = pd.concat([group_features, metascripts['description']], axis = 1)
    return described.merge(descom_df)


# One community-labelled feature table per feature group.
feature_mean_coms = {name: _with_communities(frame) for name, frame in features_dict.items()}
feature_mean_coms.keys()

In [None]:
def feature_reduction_map(feature_mean_com_group, mapping = 'umap', show_component_weights = False, component_x = 0, component_y = 1):
    """Scatter-plot a feature table after dimensionality reduction.

    Parameters
    ----------
    feature_mean_com_group : DataFrame
        Feature columns plus 'description' and 'community' columns; the latter
        two are removed before fitting and used for hover text and colour.
    mapping : str
        'umap' selects UMAP; any other value selects PCA.
    show_component_weights : bool
        When true, also print the tables returned by ``pca_component_weights``
        for ``component_x`` and ``component_y``. Those tables come from a
        separate PCA fit, whichever ``mapping`` was chosen.
    component_x, component_y : int
        Column indices of the reduced data shown on the x and y axes.

    Returns
    -------
    The figure created by ``px.scatter``.
    """
    # NOTE(review): `px` is not imported in the visible part of the notebook;
    # presumably `plotly.express as px` is already in scope — confirm.
    reducer = UMAP() if mapping == 'umap' else PCA()

    feature_matrix = feature_mean_com_group.drop(columns = ['description', 'community'])
    embedding = reducer.fit_transform(feature_matrix)

    fig = px.scatter(
        x = embedding[:, component_x],
        y = embedding[:, component_y],
        color = feature_mean_com_group['community'].astype(str),
        hover_data = [feature_mean_com_group['description']],
        template = 'simple_white',
    )

    if show_component_weights:
        x_weights = pca_component_weights(feature_mean_com_group, component_x)
        y_weights = pca_component_weights(feature_mean_com_group, component_y)
        print(x_weights, '\n\n', y_weights)

    return fig

def pca_component_weights(feature_mean_com_group, component = 0):
    """Return the PCA loadings of one principal component, largest first.

    Parameters
    ----------
    feature_mean_com_group : DataFrame
        Feature columns plus 'description' and 'community' columns; the latter
        two are removed before fitting.
    component : int
        Index of the principal component whose coefficients are returned.

    Returns
    -------
    DataFrame
        Columns 'description' (the feature name) and 'coefficient' (that
        feature's weight in the chosen component), sorted by coefficient in
        descending order.
    """
    X = feature_mean_com_group.drop(columns = ['description', 'community'])

    # Only the fitted estimator is needed; the projected data is not used here.
    mapper = PCA().fit(X)

    # Read the attributes from the estimator fitted above. The previous code
    # referenced an undefined name `pca`, which fails with a NameError or picks
    # up whatever unrelated global `pca` happens to exist in the notebook.
    components = pd.DataFrame({
        'description': mapper.feature_names_in_,
        'coefficient': mapper.components_[component]
    })

    return components.sort_values('coefficient', ascending=False)
    
# Example: PCA map of the 'sur' feature group on its first two components,
# with missing feature values replaced by 0 and component weights printed.
sur_features = feature_mean_coms['sur'].fillna(0)
feature_reduction_map(
    sur_features,
    mapping = 'pca',
    show_component_weights = True,
    component_x = 0,
    component_y = 1,
)

## unsupervised analysis of mean cosine similarities and communities

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP

In [None]:
# Attach each description's community to the mean-similarity table by matching
# the table's index against the description values.
# NOTE(review): `cosims_means_df` is not defined in this part of the notebook;
# it is assumed to be indexed by description.
community_by_description = descom_df.set_index('description')
cosims_means_com = cosims_means_df.merge(community_by_description,
                                         left_index = True,
                                         right_index = True)

# Store the community as a string so it is treated as a label, not a number.
cosims_means_com['community'] = cosims_means_com['community'].astype(str)

# Feature matrix for the unsupervised steps: everything except the label.
X = cosims_means_com.drop(columns = 'community')

### pca & feature importances