In [30]:
import json
import sys

import pandas as pd

# Extend the module search path BEFORE importing the project-local modules.
# The original cell appended the path only after those imports had already
# run, so on a fresh kernel the imports could fail if the modules are only
# reachable through this directory (presumably the case -- TODO confirm where
# Main_community_detection / Community_visualization_caract actually live).
# The membership guard keeps sys.path from growing each time the cell is re-run.
if 'IMI_project_2023-1/' not in sys.path:
    sys.path.append('IMI_project_2023-1/')

from Main_community_detection import read_csv_and_return_variables
from Community_visualization_caract import reverse_label_encoding

In [11]:
def filter_features(community_features, exclusion_set):
    """Return the features that are not listed in ``exclusion_set``.

    Parameters
    ----------
    community_features : iterable
        Features of one community, in ranked order.
    exclusion_set : iterable
        Features to drop. Any iterable is accepted (list, set, generator, ...).

    Returns
    -------
    list
        The features of ``community_features`` that are absent from
        ``exclusion_set``, in their original order (duplicates preserved).
    """
    # Materialise both inputs once. A one-shot iterator passed as
    # `exclusion_set` would otherwise be exhausted by the first membership
    # test, silently disabling the filter for every later feature.
    features = list(community_features)
    excluded = list(exclusion_set)

    try:
        # Hash-based lookup: O(1) per feature instead of a linear scan.
        lookup = frozenset(excluded)
        return [feature for feature in features if feature not in lookup]
    except TypeError:
        # Unhashable exclusions or features: fall back to the linear scan,
        # which only relies on equality.
        return [feature for feature in features if feature not in excluded]

In [8]:
# Load the per-community feature lists from JSON. The cells below iterate the
# result with `data.items()` as community -> list of feature strings.
file_path = 'Community_top4_features.json'

with open(file_path, 'r') as file:
    data = json.load(file)

In [11]:
# Combinations to drop from every community's feature list.
# These match the 16 most frequent combinations in the value counts printed
# when the CSV is loaded (from "monde UPDATE" down to "culture ANALYSIS").
# Note: despite the name, this is a list, not a set.
exclusion_set = [
    "monde UPDATE", "suisse UPDATE", "monde ANALYSIS", "suisse ANALYSIS", 
    "economie UPDATE", "sciences-tech ANALYSIS", "sciences-tech UPDATE", 
    "economie ANALYSIS", "culture UPDATE", "sciences-tech EDUCATE", 
    "culture DIVERT", "monde EDUCATE", "suisse EDUCATE", "economie EDUCATE", 
    "suisse TREND", "culture ANALYSIS"
]

In [9]:
# Empty exclusion list: with this binding no feature is filtered out.
# NOTE(review): when the notebook is run top to bottom this rebinding replaces
# the populated list defined in the previous cell. The execution counts
# (In [9] here vs In [11] above) suggest the populated list was the one in
# effect for the table shown below -- confirm which variant is intended and
# delete the other cell.
exclusion_set = []

In [12]:
# Apply the exclusion filter to every community's feature list.
filtered_communities = {
    community: filter_features(features, exclusion_set)
    for community, features in data.items()
}

# One column per community. Wrapping each list in a Series lets communities
# with different numbers of remaining features share a single frame
# (shorter columns are padded with NaN).
filtered_df = pd.DataFrame(
    {
        community: pd.Series(kept_features)
        for community, kept_features in filtered_communities.items()
    }
)

In [13]:
# Display the filtered table: one column per community, one row per feature rank.
filtered_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,culture INSPIRE,sciences-tech INSPIRE,culture INSPIRE,economie TREND,sciences-tech TREND,sciences-tech DIVERT,monde DIVERT,suisse DIVERT,culture EDUCATE,monde TREND,monde INSPIRE,culture TREND,monde DIVERT,culture INSPIRE,sport UPDATE,culture INSPIRE
1,suisse INSPIRE,monde DIVERT,culture EDUCATE,monde DIVERT,monde DIVERT,monde DIVERT,monde TREND,monde DIVERT,monde DIVERT,monde DIVERT,monde DIVERT,monde DIVERT,monde TREND,monde DIVERT,culture INSPIRE,economie INSPIRE
2,monde DIVERT,monde TREND,monde TREND,monde TREND,monde TREND,monde TREND,culture INSPIRE,culture EDUCATE,monde TREND,culture INSPIRE,monde TREND,monde TREND,culture INSPIRE,monde TREND,suisse INSPIRE,monde DIVERT
3,culture EDUCATE,culture INSPIRE,sciences-tech INSPIRE,culture INSPIRE,culture INSPIRE,sciences-tech INSPIRE,sciences-tech INSPIRE,sciences-tech INSPIRE,culture INSPIRE,culture EDUCATE,culture INSPIRE,culture EDUCATE,sciences-tech TREND,culture EDUCATE,culture EDUCATE,culture EDUCATE


In [14]:
# Keep at most the first four features of each community (one column per
# community), then flip the frame so that each community becomes a row.
df5 = pd.DataFrame({k: pd.Series(v[:4]) for k, v in data.items()})
df5_transposed = df5.T

# Render as a LaTeX tabular: community ids are kept as the row index,
# the column header row is omitted.
latex_table = df5_transposed.to_latex(header=False, index=True)

In [15]:
# Save the LaTeX table next to the notebook (overwrites any existing file).
with open('community_features_table.tex', 'w') as tex_file:
    tex_file.write(latex_table)

In [34]:
def extract_combinations_for_communities(leiden_membership, df_rts_temp, user_ip_encoder,
                                         combination_encoder,
                                         output_file='Community_all_features.json'):
    """Rank every combination inside each community by its total connections.

    For each community, the rows of ``df_rts_temp`` belonging to its members
    are selected, ``Connections`` is summed per ``Combination_Code``, and the
    combination codes are sorted by that sum in descending order and decoded
    with ``reverse_label_encoding``. ALL combinations are kept, not only the
    top few.

    Parameters
    ----------
    leiden_membership : array-like or pandas.Series
        Community id of each node. It is wrapped in a ``pandas.Series`` and
        the index labels of that Series are matched against ``User_IP_Code``.
        NOTE(review): for a plain array this assumes position i is the
        community of the user whose encoded id is i -- confirm against the
        code that writes the membership file.
    df_rts_temp : pandas.DataFrame
        Must provide the columns ``User_IP_Code``, ``Combination_Code`` and
        ``Connections``.
    user_ip_encoder : object
        Unused; kept so existing callers keep working.
    combination_encoder : object
        Passed through to ``reverse_label_encoding`` to decode the codes.
    output_file : str or None, optional
        Path of the JSON file to write. Pass ``None`` to skip writing.
        Defaults to ``'Community_all_features.json'``.

    Returns
    -------
    dict
        Community id -> list of decoded combinations, most connected first.
        In the JSON file the community ids are stored as strings.
    """
    membership = pd.Series(leiden_membership)
    communities_combinations = {}

    for community_id in set(membership):
        # Index labels of the members of this community.
        member_codes = membership[membership == community_id].index

        # Rows of the members. A single mask is enough: re-filtering on the
        # user codes found in those rows would select exactly the same rows.
        member_rows = df_rts_temp.loc[
            df_rts_temp['User_IP_Code'].isin(member_codes),
            ['Combination_Code', 'Connections'],
        ]

        # Total connections per combination within the community.
        summed_combinations = (
            member_rows.groupby('Combination_Code')['Connections'].sum().reset_index()
        )

        # Combination codes ordered from most to least connected.
        ranked_codes = summed_combinations.sort_values(
            by='Connections', ascending=False
        )['Combination_Code']

        # Decode back to the original combination labels.
        decoded = reverse_label_encoding(ranked_codes, combination_encoder)
        communities_combinations[community_id] = decoded.tolist()

    if output_file is not None:
        with open(output_file, 'w') as f:
            json.dump(communities_combinations, f, indent=4)
        print(f"All combinations for each community saved to {output_file}")

    return communities_combinations

In [43]:
# Load the dataset and its label encoders through the project helper.
# NOTE(review): 100000 is presumably a row/sample limit -- confirm against
# read_csv_and_return_variables and move it to a named constant.
df_rts, ips, combos, user_ip_encoder, combination_encoder,_ = read_csv_and_return_variables(100000)
print("Loading Leiden communities from file...")
leiden_partition = pd.read_csv('Community_detection/leiden_communities.csv')
# Community id of each row of the partition file, as a NumPy array.
leiden_membership = leiden_partition['community_id'].values
print(f"Leiden communities: {len(set(leiden_membership))}")

# Extract combinations for each community (also writes Community_all_features.json)
communities_combinations = extract_combinations_for_communities(leiden_membership, df_rts, user_ip_encoder, combination_encoder)


Lecture du CSV...
Combination
monde UPDATE              18902
suisse UPDATE             18861
monde ANALYSIS            18607
suisse ANALYSIS           18598
economie UPDATE           17720
sciences-tech ANALYSIS    17309
sciences-tech UPDATE      16938
economie ANALYSIS         16843
culture UPDATE            16447
sciences-tech EDUCATE     15039
culture DIVERT            14817
monde EDUCATE             14761
suisse EDUCATE            13699
economie EDUCATE          10760
suisse TREND              10295
culture ANALYSIS           8639
monde DIVERT               6875
monde TREND                6196
culture INSPIRE            5750
culture EDUCATE            4941
sciences-tech INSPIRE      4537
sciences-tech TREND        3256
suisse DIVERT              3186
suisse INSPIRE             2749
monde INSPIRE              2379
economie TREND             2083
sciences-tech DIVERT       1898
culture TREND              1507
sport UPDATE                319
sport INSPIRE                48
economie I

In [44]:
# Reload the full per-community ranking written by
# extract_combinations_for_communities.
# Note: this rebinds `file_path` and `data`, which earlier cells used for the
# top-4 feature file; cells above must be re-run to get the old values back.
file_path = 'Community_all_features.json'

with open(file_path, 'r') as file:
    data = json.load(file)

In [41]:
def find_lowest_index_analysis(data):
    """Find, per topic, where its "ANALYSIS" entry is ranked best.

    ``data`` maps a community key to a ranked list of strings. Every string
    containing "ANALYSIS" is attributed to a topic (its first
    whitespace-separated word). For each topic the entry with the smallest
    list index across all communities is kept; on equal indices the community
    encountered first wins.

    Returns a dict: topic -> {'community', 'index', 'topic_analysis'}.
    """
    best_by_topic = {}

    for community, ranked_values in data.items():
        for position, entry in enumerate(ranked_values):
            # Only "<topic> ANALYSIS" style entries are of interest.
            if "ANALYSIS" not in entry:
                continue

            topic = entry.split()[0]
            current_best = best_by_topic.get(topic)

            # Keep the existing record unless this position is strictly better.
            if current_best is not None and position >= current_best['index']:
                continue

            best_by_topic[topic] = {
                'community': community,
                'index': position,
                'topic_analysis': entry,
            }

    return best_by_topic

In [46]:
# For every topic, locate the community in which its "ANALYSIS" combination
# appears earliest in the ranked list loaded into `data`.
lowest_index_communities = find_lowest_index_analysis(data)

# Output the results (community keys are strings because they come from JSON)
for topic, info in lowest_index_communities.items():
    print(f"First occurrence of '{info['topic_analysis']}' is in community {info['community']} at index {info['index']}")

First occurrence of 'monde ANALYSIS' is in community 0 at index 2
First occurrence of 'suisse ANALYSIS' is in community 15 at index 2
First occurrence of 'sciences-tech ANALYSIS' is in community 4 at index 5
First occurrence of 'economie ANALYSIS' is in community 15 at index 5
First occurrence of 'culture ANALYSIS' is in community 0 at index 15
First occurrence of 'sport ANALYSIS' is in community 10 at index 29
