In [4]:
import json
import pandas as pd

input_path = "/mnt/c/Users/Topcu/Desktop/240416_network/merged_network_predictions_users.jsonl"
output_csv_path = "/mnt/c/Users/Topcu/Desktop/gephi_retweet_network_data_single_category.csv"

main_categories = {
    'interest rates': 'Interest Rates',
    'tax': 'Tax',
    'foreign exchange market': 'Foreign Exchange Market',
    'inflation and prices': 'Inflation and Prices',
    'raise increases': 'Raise Increases',
    'minimum wage and wages': 'Minimum Wage and Wages',
    'retirement pensions': 'Retirement Pensions',
    'unemployment': 'Unemployment',
}

main_categories_blame_type = {
    "presidents' achievements": "Presidents' Achievements",
    "presidents' failures": "Presidents' Failures",
    'blaming external actors': 'Blaming External Actors',
  
}

def process_entries(entries, categories):
    if not entries:
        return []
    processed_entries = [categories.get(entry.strip().lower()) for entry in entries if entry.strip().lower() in categories]
    return list(filter(None, processed_entries))

def prepare_for_gephi(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            tweet = json.loads(line)
            if tweet['tweet_type'] == 'retweet' and tweet['ref_user_id']:
                module_1 = tweet.get('module_1') or []
                module_2 = tweet.get('module_2') or []
                
                modules = process_entries(module_1 + module_2, main_categories)
                modules = [m for m in modules if m != 'no category']
                
                if len(modules) != 1:
                    continue

                blame_types = process_entries(tweet.get('blame_type', []), main_categories_blame_type)
                if not blame_types or len(blame_types) > 1:
                    continue

                data.append({
                    'Source': tweet['user_id'],
                    'Target': tweet['ref_user_id'],
                    'Module': modules[0],
                    'Blame_Type': ", ".join(blame_types),
                    'Ideology': tweet.get('user_ideology', 'Unknown')
                })

    df = pd.DataFrame(data)
    return df

df = prepare_for_gephi(input_path)
df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

print("Columns in DataFrame:", df.columns)
print("DataFrame shape:", df.shape)
print(df.head())

Columns in DataFrame: Index(['Source', 'Target', 'Module', 'Blame_Type', 'Ideology'], dtype='object')
DataFrame shape: (18732, 5)
                Source               Target                   Module  \
0            256116639             68034431                      Tax   
1            271035921           2298453350  Foreign Exchange Market   
2            433673143  1488614157071097861  Foreign Exchange Market   
3  1362156660677083145             68034431      Retirement Pensions   
4             96767975  1126745319901765633     Inflation and Prices   

                 Blame_Type             Ideology  
0      Presidents' Failures  turkish_nationalism  
1      Presidents' Failures  turkish_nationalism  
2   Blaming External Actors  turkish_nationalism  
3  Presidents' Achievements  turkish_nationalism  
4  Presidents' Achievements           liberalism  


In [5]:
import networkx as nx
from community import community_louvain

def process_entries(entries, main_categories):
    return [entry for entry in entries if entry in main_categories]

def create_network(df):
    G = nx.DiGraph()
    for _, row in df.iterrows():
        G.add_edge(str(row['Source']), str(row['Target']), module=row['Module'], blame_type=row['Blame_Type'], ideology=row['Ideology'])
        G.nodes[str(row['Source'])]['ideology'] = row['Ideology']
        G.nodes[str(row['Source'])]['module'] = row['Module']
        G.nodes[str(row['Source'])]['blame_type'] = row['Blame_Type']
    return G

def detect_communities(G):
    partition = community_louvain.best_partition(G.to_undirected())
    return partition

def create_dataframe(G, partition):
    nx.set_node_attributes(G, partition, 'community')
    data = [{'node': node, 'community': data['community'], 'blame_type': data.get('blame_type', 'No Blame'), 'ideology': data.get('ideology', 'Unknown')}
            for node, data in G.nodes(data=True)]
    return pd.DataFrame(data)

def prepare_for_gephi(file_path):
    total_count = 0
    retweet_count = 0
    ref_user_id_count = 0
    valid_module_count = 0
    valid_blame_type_count = 0
    data = []
    missing_ref_user_id = []

    with open(file_path, 'r', encoding='utf-8') as infile:
        for line in infile:
            total_count += 1
            tweet = json.loads(line)

            if tweet['tweet_type'] == 'retweet':
                retweet_count += 1
                if tweet['ref_user_id']:
                    ref_user_id_count += 1
                    module_1 = tweet.get('module_1') or []
                    module_2 = tweet.get('module_2') or []
                    
                    modules = process_entries(module_1 + module_2, main_categories)
                    modules = [m for m in modules if m != 'no category']
                    
                    if len(modules) == 1:
                        valid_module_count += 1
                        blame_types = process_entries(tweet.get('blame_type', []), main_categories_blame_type)
                        
                        if not blame_types or len(blame_types) > 1:
                            continue
                        valid_blame_type_count += 1

                        data.append({
                            'Source': tweet['user_id'],
                            'Target': tweet['ref_user_id'],
                            'Module': modules[0],
                            'Blame_Type': ", ".join(blame_types),
                            'Ideology': tweet.get('user_ideology', 'Unknown')
                        })
                else:
                    missing_ref_user_id.append(tweet)

    print(f"Total tweets: {total_count}")
    print(f"Retweets: {retweet_count}")
    print(f"With reference user ID: {ref_user_id_count}")
    print(f"With valid module: {valid_module_count}")
    print(f"With valid blame type: {valid_blame_type_count}")
    print(f"Tweets with missing ref_user_id: {len(missing_ref_user_id)}")


    with open("/mnt/c/Users/Topcu/Desktop/missing_ref_user_id.json", 'w', encoding='utf-8') as outfile:
        json.dump(missing_ref_user_id, outfile, ensure_ascii=False, indent=4)

    df = pd.DataFrame(data)
    return df

if __name__ == "__main__":
 
    input_path = "/mnt/c/Users/Topcu/Desktop/240416_network/merged_network_predictions_users.jsonl"
    output_csv_path = "/mnt/c/Users/Topcu/Desktop/gephi_retweet_network_data_single_category.csv"

    df = prepare_for_gephi(input_path)
    df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

    print("Columns in DataFrame:", df.columns)
    print("DataFrame shape:", df.shape)
    print(df.head())

    G = create_network(df)

    print(f"Number of nodes: {G.number_of_nodes()}")
    print(f"Number of edges: {G.number_of_edges()}")

    isolated_nodes = [node for node in G.nodes if G.degree(node) == 0]
    print(f"Number of isolated nodes: {len(isolated_nodes)}")

    partition = detect_communities(G)
    df_with_communities = create_dataframe(G, partition)

    output_json_path = "/mnt/c/Users/Topcu/Desktop/network_metrics.json"
    metrics = {
        'modularity': community_louvain.modularity(partition, G.to_undirected()),
        'graph_density': nx.density(G),
        'number_of_nodes': G.number_of_nodes(),
        'number_of_edges': G.number_of_edges(),
        'average_clustering': nx.average_clustering(G.to_undirected()),
        'average_shortest_path_length': nx.average_shortest_path_length(G.to_undirected()) if nx.is_connected(G.to_undirected()) else None,
        'diameter': nx.diameter(G.to_undirected()) if nx.is_connected(G.to_undirected()) else None
    }
    with open(output_json_path, 'w') as f:
        json.dump(metrics, f, indent=4)

    print(f"Modularity: {metrics['modularity']}")
    print(f"Graph Density: {metrics['graph_density']}")
    print(f"Average Clustering Coefficient: {metrics['average_clustering']}")
    print(f"Average Shortest Path Length: {metrics['average_shortest_path_length']}")
    print(f"Diameter: {metrics['diameter']}")
    
    assortativity_ideology = nx.attribute_assortativity_coefficient(G, 'ideology')
    print(f"Assortativity based on ideology: {assortativity_ideology}")
    assortativity_blame_type = nx.attribute_assortativity_coefficient(G, 'blame_type')
    print(f"Assortativity based on blame type: {assortativity_blame_type}")
    assortativity_module = nx.attribute_assortativity_coefficient(G, 'module')
    print(f"Assortativity based on module: {assortativity_module}")


Total tweets: 257941
Retweets: 95037
With reference user ID: 94990
With valid module: 40864
With valid blame type: 18732
Tweets with missing ref_user_id: 47
Columns in DataFrame: Index(['Source', 'Target', 'Module', 'Blame_Type', 'Ideology'], dtype='object')
DataFrame shape: (18732, 5)
                Source               Target                   Module  \
0            256116639             68034431                      tax   
1            271035921           2298453350  foreign exchange market   
2            433673143  1488614157071097861  foreign exchange market   
3  1362156660677083145             68034431      retirement pensions   
4             96767975  1126745319901765633     inflation and prices   

                 Blame_Type             Ideology  
0      presidents' failures  turkish_nationalism  
1      presidents' failures  turkish_nationalism  
2   blaming external actors  turkish_nationalism  
3  presidents' achievements  turkish_nationalism  
4  presidents' achievemen

In [3]:
unique_sources = df['Source'].nunique()
unique_targets = df['Target'].nunique()
print(f"Unique Sources: {unique_sources}")
print(f"Unique Targets: {unique_targets}")
retweets_per_user = df.groupby('Source').size()
multiple_retweets = retweets_per_user[retweets_per_user > 1].count()
print(f"Users who retweeted multiple different users: {multiple_retweets}")
retweeted_by_multiple = df.groupby('Target').size()
multiple_retweeted = retweeted_by_multiple[retweeted_by_multiple > 1].count()
print(f"Users who were retweeted by multiple different users: {multiple_retweeted}")

Unique Sources: 15902
Unique Targets: 4204
Users who retweeted multiple different users: 2058
Users who were retweeted by multiple different users: 1635
