# Community Analysis

## Table of Contents

   1. [imports](#imports)
   2. [make dataframes](#make-dataframes)

## imports

In [1]:
import pandas as pd
import numpy as np
import pickle
import plotly.express as px

import re

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from umap import UMAP
from sklearn.decomposition import PCA



In [2]:
# Load the show-level dataframe (provides 'description', 'artist', 'fullTitle', ... used below).
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open('../data/metascripts_df_sentiment.pickle', mode='rb') as pickle_file:
    metascripts = pickle.load(pickle_file)

In [3]:
# Load the dictionary of per-feature-group dataframes (keyed by feature-group name).
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open('../data/features_dict.pickle', mode='rb') as pickle_file:
    features_dict = pickle.load(pickle_file)

In [4]:
# Load the dictionary of per-feature-group similarity dataframes (keyed by feature-group name).
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open('../data/feature_cosims_dict.pickle', mode='rb') as pickle_file:
    feature_cosims_dict = pickle.load(pickle_file)

## make dataframes

   - descom_df: show descriptions and mean communities from the Louvain algorithm
   - all_features: each show pair and all feature-group similarities
   - metacosims: all_features + metadata
   - graph_all_features: all_features deduplicated for graph database upload
   - feature_mean_coms: each feature group with description (show) labels and the corresponding mean communities from the Louvain algorithm
   - feature_coms: feature_mean_coms plus each feature group's own community labels
   - all_coms: each show description with its mean community and every feature-group community

### descom_df: descriptions and communities

In [5]:
# Read the exported graph records once; each value of column 's' is one serialized record.
graph_records = pd.read_csv('../data/graph_coms_df.csv')['s'].values

# Raw strings keep the regex escapes (e.g. \d) intact without invalid-escape warnings.
# description: text after the first '":' up to the last ',"' in the record
# community:   the digits that follow '"meanCommunity":'
description_pattern = re.compile(r'(?<=":).+(?=,")')
community_pattern = re.compile(r'(?<="meanCommunity":)\d+')

descriptions = [description_pattern.search(record).group(0) for record in graph_records]
communities = [int(community_pattern.search(record).group(0)) for record in graph_records]
descom_df = pd.DataFrame({
        'description': descriptions,
        'community': communities
    })

### all_features: each show pair and all feature-group similarities

In [7]:
# create the all_features dataframe by
# melting each cosims dataframe into long form and joining the results on the description pair
pair_columns = ['description selection', 'description comparison']
similarity_columns = []
all_features = None
for feature_group, cosims in feature_cosims_dict.items():
    similarity_column = f'{feature_group} similarity'
    # label the rows with the show descriptions, then melt so that every
    # (row description, column label) pair becomes one row
    features = (cosims
                    .set_index(metascripts['description'])
                    .melt(var_name = 'description comparison',
                          value_name = similarity_column,
                          ignore_index = False)
                    .reset_index()
                    .rename(columns = {'description': 'description selection'})
               )
    similarity_columns.append(similarity_column)
    if all_features is None:
        # first feature group: nothing to join against yet
        all_features = features
    else:
        # explicit join keys: a failure here now surfaces instead of being
        # swallowed and silently resetting all_features
        all_features = all_features.merge(features, on = pair_columns)

if all_features is None:
    # no feature groups at all: keep an empty frame with the expected key columns
    all_features = pd.DataFrame(columns = pair_columns)

# create the mean similarity column from the similarity columns only
# (the description columns are strings and must not take part in the mean)
all_features['mean similarity'] = all_features[similarity_columns].mean(axis = 1)

### metacosims: all_features + metadata

In [8]:
# use all_features together with metascripts to create metacosims
# metadata columns taken from metascripts for both shows of every pair
metacols = ['description', 'link', 'script characters', 'id',
           'artist', 'title', 'fullTitle', 'year', 'image', 'releaseDate',
           'runtimeMins', 'runtimeStr', 'awards', 'genres',
           'companies', 'contentRating', 'imDbRating',
           'imDbRatingVotes', 'similars', 'languages']

# the same metadata slice (without 'similars') is attached twice: once per show of the pair
show_metadata = metascripts[metacols].drop(columns = 'similars')

# attach metadata of the selected show; its 'description' key repeats 'description selection'
selection_side = all_features.merge(show_metadata,
                                    left_on = 'description selection',
                                    right_on = 'description')
selection_side = selection_side.drop(columns = 'description')

# attach metadata of the compared show; overlapping metadata names get the suffixes
metacosims = selection_side.merge(show_metadata,
                                  left_on = 'description comparison',
                                  right_on = 'description',
                                  suffixes = (" selection", " comparison"))

# drop the column at position 1 (expected to be 'description comparison', whose values
# are repeated by the 'description' key column the second merge brought in)
keepind = [position for position in range(metacosims.shape[1]) if position != 1]
metacosims = metacosims.iloc[:, keepind]

In [9]:
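# metacosims is sent to a sqlite db and pulled down into R for the Shiny dashboard.
# NOTE: running the lines below requires `import sqlite3` (it is not among this notebook's imports),
# and `if_exists = 'append'` adds the rows again each time the cell is run.

# with sqlite3.connect('../StandupRecommenderShiny/data/metacosims_recommender.sqlite') as db:
#      metacosims.to_sql('metacosims', db, if_exists = 'append', index = False)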

In [116]:
# Persist metacosims so it can be reloaded without rebuilding it.
with open('../data/metacosims_df.pickle', mode='wb') as pickle_file:
    pickle.dump(metacosims, pickle_file)

### graph_all_features: all_features deduplicated for graph database upload

In [10]:
# create graph_all_features, a deduplicated all_features that also removes self-relations
# graph_all_features is used for Neo4J importation
selection_col, comparison_col = 'description selection', 'description comparison'

# remove self-relations by comparing the two descriptions directly rather than by a
# similarity threshold, which could also drop distinct shows that are nearly identical
non_self_pairs = all_features.loc[all_features[selection_col] != all_features[comparison_col]]

# strongest relations first; ties are ordered alphabetically so the kept row is deterministic
ranked_pairs = non_self_pairs.sort_values(['mean similarity', selection_col, comparison_col],
                                          ascending = [False, True, True])

# (A, B) and (B, A) describe the same relation: key every row by its unordered pair and keep
# the first occurrence, instead of relying on float equality of the mean similarity
pair_keys = pd.Series([frozenset(pair) for pair in ranked_pairs[[selection_col, comparison_col]].to_numpy()],
                      dtype = object)
graph_all_features = ranked_pairs.loc[~pair_keys.duplicated(keep = 'first').to_numpy()]
graph_all_features.to_csv('../data/graph_df_all_features.csv', index = False)

### feature_mean_coms: feature groups with mean communities

In [72]:
# two feature-group dfs had their indices reset; give them the index of the 'sur' group
# again so that the upcoming concat lines the rows up
dfs_with_bad_indices = ['pos_props', 'pov_props']
reference_index = features_dict['sur'].index
features_dict.update({
    group_name: features_dict[group_name].set_index(reference_index)
    for group_name in dfs_with_bad_indices
})

In [76]:
# for every feature group: append the show description column, join the mean community of
# each description from descom_df, and rename that column to 'mean community'
feature_mean_coms = {
    feature_group: (
        pd.concat([feature_frame, metascripts['description']], axis = 1)
          .merge(descom_df)
          .rename(columns = {'community': 'mean community'})
    )
    for feature_group, feature_frame in features_dict.items()
}
feature_mean_coms.keys()

dict_keys(['metadata', 'pos_props', 'pov_props', 'word_sentence_lengths', 'profanity', 'sentiment', 'sur'])

In [107]:
# preview the first rows of the 'pos_props' feature group with its mean community labels
feature_mean_coms['pos_props'].head()

Unnamed: 0,VERB,PRON,INTJ,NOUN,ADV,AUX,ADJ,PART,ADP,DET,SCONJ,CCONJ,PROPN,NUM,description,mean community
0,0.14651,0.184963,0.017147,0.145698,0.062906,0.10623,0.053673,0.037845,0.081981,0.077922,0.024046,0.027902,0.025974,0.007204,Jim Gaffigan: Comedy Monster (2021) | Transcript,2
1,0.14002,0.206552,0.016381,0.133737,0.057669,0.106361,0.057556,0.039942,0.076517,0.074274,0.030405,0.029507,0.021317,0.008639,Louis C. K.: Sorry (2021) | Transcript,1
2,0.136785,0.216978,0.043468,0.119326,0.058519,0.116918,0.053341,0.040698,0.083925,0.060566,0.027694,0.026851,0.007586,0.007345,Drew Michael: Drew Michael (2018) | Transcript,0
3,0.140336,0.199783,0.022306,0.133947,0.055658,0.107959,0.052518,0.045479,0.087168,0.069193,0.026096,0.02902,0.021332,0.009204,Drew Michael: Red Blue Green (2021) | Transcript,0
4,0.134704,0.207911,0.027945,0.133917,0.061694,0.1043,0.051855,0.031388,0.085309,0.065729,0.01781,0.026567,0.041622,0.009151,Mo Amer: Mohammed in Texas (2021) | Transcript,1


### feature_group_coms: feature groups with group communities

In [80]:
import glob
# glob returns paths in arbitrary, filesystem-dependent order; sort them because the
# cells below rely on position (dropping the first file and zipping against sorted keys)
communities_files = sorted(glob.glob("../data/feature_communities/*.csv"))
# the feature-group name is the part of the file name between 'communities_' and '.csv'
com_file_names = [re.search(r"communities_(.+)\.csv", file).group(1) for file in communities_files]
com_file_names[1:]

['meta', 'pos_prop', 'pov_prop', 'profanity', 'sentiment', 'sur', 'wslength']

In [81]:
# take all communities except bow (presumably the alphabetically first file name — confirm)
# and map each file-name label to the feature-group key at the same alphabetical position
file_group_names = sorted(com_file_names)[1:]
feature_group_keys = sorted(feature_mean_coms.keys())
if len(file_group_names) != len(feature_group_keys):
    # zip would silently truncate and leave some feature groups unmapped
    raise ValueError(
        f'cannot pair {len(file_group_names)} community files with '
        f'{len(feature_group_keys)} feature groups'
    )
keys_dict = dict(zip(file_group_names, feature_group_keys))

In [82]:
# create dictionary of feature-group community labels
# files are selected by name (those with an entry in keys_dict) rather than by their
# position in the glob result, so the outcome does not depend on directory listing order
community_file_pattern = re.compile(r"communities_(.+)\.csv")
feature_group_coms = {}
for file in sorted(communities_files):
    file_group_name = community_file_pattern.search(file).group(1)
    if file_group_name in keys_dict:
        feature_group_coms[keys_dict[file_group_name]] = pd.read_csv(file)

In [85]:
# join each feature-group df (already carrying the mean community) with that
# group's own community labels; the merge uses the columns the two frames share
feature_coms = {
    feature_group: mean_com_frame.merge(feature_group_coms[feature_group])
    for feature_group, mean_com_frame in feature_mean_coms.items()
}

In [108]:
# preview the first rows of the 'pos_props' feature group with mean and group community labels
feature_coms['pos_props'].head()

Unnamed: 0,VERB,PRON,INTJ,NOUN,ADV,AUX,ADJ,PART,ADP,DET,SCONJ,CCONJ,PROPN,NUM,description,mean community,pos_propCommunity
0,0.14651,0.184963,0.017147,0.145698,0.062906,0.10623,0.053673,0.037845,0.081981,0.077922,0.024046,0.027902,0.025974,0.007204,Jim Gaffigan: Comedy Monster (2021) | Transcript,2,54
1,0.14002,0.206552,0.016381,0.133737,0.057669,0.106361,0.057556,0.039942,0.076517,0.074274,0.030405,0.029507,0.021317,0.008639,Louis C. K.: Sorry (2021) | Transcript,1,45
2,0.136785,0.216978,0.043468,0.119326,0.058519,0.116918,0.053341,0.040698,0.083925,0.060566,0.027694,0.026851,0.007586,0.007345,Drew Michael: Drew Michael (2018) | Transcript,0,68
3,0.140336,0.199783,0.022306,0.133947,0.055658,0.107959,0.052518,0.045479,0.087168,0.069193,0.026096,0.02902,0.021332,0.009204,Drew Michael: Red Blue Green (2021) | Transcript,0,113
4,0.134704,0.207911,0.027945,0.133917,0.061694,0.1043,0.051855,0.031388,0.085309,0.065729,0.01781,0.026567,0.041622,0.009151,Mo Amer: Mohammed in Texas (2021) | Transcript,1,241


In [111]:
# Persist the dictionary of feature-group dataframes with community labels.
with open('../data/feature_communities/feature_coms_dict.pickle', mode='wb') as pickle_file:
    pickle.dump(feature_coms, pickle_file)

### all_coms: all communities for each show

In [109]:
def merge_dfs(starting_df, dict_of_dfs):
    """Fold every dataframe of a dictionary into a starting dataframe.

    The values of ``dict_of_dfs`` are merged one after another, in dictionary
    order, using ``DataFrame.merge`` with its defaults (inner join on the
    columns both frames share). The dictionary keys are not used.

    Parameters
    ----------
    starting_df : pandas.DataFrame
        Frame the merging starts from.
    dict_of_dfs : dict
        Dataframes to merge in, keyed by any label.

    Returns
    -------
    pandas.DataFrame
        The merged frame; ``starting_df`` itself when the dictionary is empty.
    """
    merged = starting_df
    for frame in dict_of_dfs.values():
        merged = merged.merge(frame)
    return merged

# one row per show: description, mean community, and every feature-group community
all_coms = merge_dfs(starting_df = descom_df, dict_of_dfs = feature_group_coms)
all_coms.head()

Unnamed: 0,description,community,metaCommunity,pos_propCommunity,pov_propCommunity,profanityCommunity,sentimentCommunity,surCommunity,wslengthCommunity
0,AMY SCHUMER: THE LEATHER SPECIAL (2017) – Full...,0,33,269,21,288,159,80,51
1,MICHELLE WOLF: NICE LADY (2017) – Full Transcript,0,148,93,244,288,159,80,51
2,Joe Rogan: Strange Times (2018) – Full Transcript,1,51,51,281,192,109,274,51
3,Eddie Izzard – Glorious (1997) – Transcript,2,238,68,28,192,9,274,242
4,JIM JEFFERIES: FREEDUMB (2016) – Full Transcript,1,148,37,15,192,109,274,253


In [112]:
# Persist the per-show table of all community labels.
with open('../data/feature_communities/all_coms_df.pickle', mode='wb') as pickle_file:
    pickle.dump(all_coms, pickle_file)

# appendix

In [None]:
# manually explore the shows in each community

def check_community_shows(com_index, show_range, df=None):
    """Return an ordered subset of shows in a selected community.

    Parameters
    ----------
    com_index : int
        Position of the community within the unique community values of the
        frame, in order of first appearance.
    show_range : slice
        Positional selection applied after sorting by description, e.g.
        ``slice(40, 60)``. Any indexer accepted by ``DataFrame.iloc`` works.
    df : pandas.DataFrame, optional
        Frame with 'description' and 'community' columns. Defaults to the
        notebook-level ``descom_df``.

    Returns
    -------
    pandas.DataFrame
        Rows of the chosen community, sorted by description and restricted to
        ``show_range``.
    """
    if df is None:
        df = descom_df
    community_list = df['community'].unique()
    com = community_list[com_index]
    community_shows = df[df['community'] == com].sort_values('description')
    # .iloc makes the selection explicitly positional (the sorted index is shuffled)
    return community_shows.iloc[show_range]

# `40:60` is only valid inside square brackets; pass the range as a slice object instead
check_community_shows(com_index = 0, show_range = slice(40, 60))

In [None]:
# number of distinct communities each artist's shows fall into
communities_per_artist = (metascripts.merge(descom_df)
                          .groupby('artist')['community']
                          .nunique())
# artists whose shows are spread over more than one community
comedians_between = (communities_per_artist > 1).sum()

# artists with more than one (non-null) fullTitle in metascripts
titles_per_artist = metascripts.groupby('artist')['fullTitle'].count()
comedians_multiple = (titles_per_artist > 1).sum()
print(f'{comedians_between} out of {comedians_multiple}, i.e, {round(100*comedians_between/comedians_multiple,2)}%, of comedians with multiple shows in the dataset had shows in different communities.')