In [None]:
import os
import sys

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

sys.path.append('..')
from py_files.michele_algo import *

# Import Data

In [None]:
def inspect_network(folder):
    """Summarise every GEXF graph in ``folder`` as one table row per graph.

    Each ``*.gexf`` file is read, converted to an undirected graph and reduced
    to its largest connected component; the measures below are computed on
    that component only.

    Parameters
    ----------
    folder : str
        Path to a ``.../<scrape_date>/graphs/`` directory. The name of the
        parent directory is stored as the scrape date. The path may be given
        with or without a trailing slash.

    Returns
    -------
    pandas.DataFrame
        One row per graph file with the columns ``name`` (file name without
        extension), ``scrape_date``, ``#nodes``, ``#edges``,
        ``average_shortest_path_length``, ``Generalized_euclidean`` (result of
        ``do_ge``) and ``Our_polarity_score`` (result of ``driver_g``). Both
        helpers come from ``py_files.michele_algo``; their definitions are not
        visible in this notebook. The frame is empty, with the same columns,
        when the folder holds no usable graph.
    """
    print(folder)
    columns = ['name', 'scrape_date', '#nodes', '#edges',
               'average_shortest_path_length', 'Generalized_euclidean',
               'Our_polarity_score']

    # The scrape date is the directory that contains the graphs folder.
    # normpath drops a trailing slash, so both 'x/graphs/' and 'x/graphs' work.
    creation_date = os.path.basename(os.path.dirname(os.path.normpath(folder)))

    records = []
    # Sorted so that the row order does not depend on the file system.
    for file_name in sorted(os.listdir(folder)):
        name, extension = os.path.splitext(file_name)
        if extension.lower() != '.gexf':
            # Ignore stray files (e.g. .DS_Store) instead of failing to parse them.
            continue

        graph = nx.to_undirected(nx.read_gexf(os.path.join(folder, file_name)))
        if graph.number_of_nodes() == 0:
            # An empty graph has no connected component to analyse.
            print('Skipping', file_name, '- graph has no nodes')
            continue

        # Keep only the largest connected component; average_shortest_path_length
        # is then computed on a connected graph.
        largest_component = max(nx.connected_components(graph), key=len)
        graph = graph.subgraph(largest_component).copy()

        records.append({
            'name': name,
            'scrape_date': creation_date,
            '#nodes': graph.number_of_nodes(),
            '#edges': graph.number_of_edges(),
            'average_shortest_path_length': nx.average_shortest_path_length(graph),
            'Generalized_euclidean': do_ge(graph),
            'Our_polarity_score': driver_g(graph),
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns=columns)

In [None]:
main_dir = '../data/date_folders/'

# Build one summary frame per date folder and combine them with a single
# concat (DataFrame.append was removed in pandas 2.0).
frames = []
for folder in sorted(os.listdir(main_dir)):
    # The trailing slash is kept so inspect_network receives the same path
    # format as before.
    graphs_dir = main_dir + folder + '/graphs/'
    if not os.path.isdir(graphs_dir):
        # Stray entries in main_dir without a graphs folder are not an error.
        print('Skipping', graphs_dir, '- not a directory')
        continue
    try:
        summary = inspect_network(graphs_dir)
    except Exception as error:
        # Still best effort: one broken folder must not abort the whole scan,
        # but the failure is reported instead of being silently dropped.
        print('Skipping', graphs_dir, '-', repr(error))
        continue
    if not summary.empty:
        frames.append(summary)

# ignore_index gives every row a unique label across folders.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
# Optional: index the summary table by subreddit name. Left disabled because
# the plotting cells below select and plot the 'name' column directly.
#df = df.set_index('name')

In [None]:
# Display the combined summary table (one row per graph file and scrape date).
df

In [None]:
# Pairwise plots of the columns of the summary table.
# Colouring by subreddit is available via hue='name' (currently disabled).
sns.pairplot(data=df)

# Barplots of main measures

In [None]:
# General seaborn settings for every figure below.
# set_theme applies the 'darkgrid' style and the rc overrides in one call
# (a separate set_style call before it is redundant); figures are rendered
# and saved at 300 dpi.
sns.set_theme(style='darkgrid', rc={'figure.dpi': 300, 'savefig.dpi': 300})

In [None]:
# Generalized Euclidean score (michele measure 1) for a fixed list of
# subreddits, sorted by ascending score.
df_GE = (
    df.copy()
    .sort_values('Generalized_euclidean')
    .loc[lambda frame: frame['name'].isin([
        'PoliticalDiscussion',
        'Republican',
        'communism',
        'democrats',
        'ukpolitics',
        'politics',
        'worldnews',
        'News',
        'antiwork',
    ])]
)

# Horizontal bars: score on the x axis, one subreddit per row on the y axis.
bar_GE = sns.barplot(data=df_GE, x='Generalized_euclidean', y='name', color='tab:blue')
bar_GE.set_xlabel('Generalized Euclidean Measure')
bar_GE.set_ylabel('Subreddit')
bar_GE.set_title('GE score of subreddit networks')

fig = bar_GE.figure
fig.savefig('../data/plots/bar_GE.png', bbox_inches='tight')

In [None]:
# Correlation between ideological distance and comment sentiment, for the same
# subreddits as the GE bar plot, sorted by ascending score.
# sort_values already returns a new frame, so df_GE itself is left untouched.
df_correlation = df_GE.sort_values('Our_polarity_score')

bar_correlation = sns.barplot(data=df_correlation, x='Our_polarity_score', y='name', color='tab:blue')
bar_correlation.set_xlabel('Correlation Score')
bar_correlation.set_ylabel('Subreddit')
bar_correlation.set_title('Correlation score between ideological distance and comment sentiment')

fig = bar_correlation.figure
fig.savefig('../data/plots/bar_correlation.png', bbox_inches='tight')

### Scatter plot: GE score vs. correlation score

In [None]:
# Scatter plot of GE score against correlation score, coloured by subreddit.
# The rename goes into a new frame instead of mutating df_GE in place: the
# correlation bar-plot cell reads df_GE['name'], so an in-place rename makes
# that cell fail with a KeyError when it is re-run after this one.
df_scatter = df_GE.rename(columns={'name': 'Subreddit'})
scatter = sns.scatterplot(data=df_scatter, x='Generalized_euclidean', y='Our_polarity_score', hue='Subreddit')

# Dashed reference line at a correlation score of zero, drawn on this plot's
# own axes rather than on whatever pyplot considers the current axes.
scatter.axhline(y=0, color='black', ls='dashed')

# Anchor the legend's upper-left corner at the top-right corner of the axes,
# i.e. outside the plotting area.
sns.move_legend(scatter, "upper left", bbox_to_anchor=(1, 1), frameon=False)

scatter.set(title='GE Score vs Correlation Score',
            xlim=(20, 100),
            ylim=(-0.4, 0.4))
scatter.set_xlabel('GE Score', size=11)
scatter.set_ylabel('Correlation Score', size=11)

fig = scatter.get_figure()
fig.savefig('../data/plots/scatter.png', bbox_inches='tight')

# Calculate stats per Network

# Check Correlations

# lineplots?