Link to GitHub repository [here](https://github.com/jesp9435/ComSocSci)

Group member contributions: Both group members contributed equally to all parts of the assignment, and we worked collaboratively throughout.

# Part 1: Properties of the real-world network of Computational Social Scientists

In [55]:
# Relevant imports:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv
import networkx as nx
import netwulf as nw
import numpy as np
import pickle

In [None]:
# Size of the Computational Social Scientists network (values from Assignment 1)
amount_of_nodes = 14196
amount_of_edges = 54304

# Equation 3.2 from the book, solved for p: the observed number of edges
# divided by the number of distinct node pairs.
possible_pairs = (amount_of_nodes * (amount_of_nodes - 1)) / 2
probability = amount_of_edges / possible_pairs
print(f"Probability: {probability}")

# Equation 3.3 from the book gives the average degree.
k = probability * (amount_of_nodes - 1.0)
print(f"Average degree: {k}")

# Random network with as many nodes as the real network, wired with the
# probability computed above; the fixed seed keeps the graph reproducible.
RG = nx.gnp_random_graph(
    amount_of_nodes,
    probability,
    seed=1000,
    directed=False,
)
# Optional interactive view (left disabled):
#nw.interactive.visualize(RG)

Answer the following questions (max 200 words in total):

What regime does your random network fall into? Is it above or below the critical threshold?
According to the textbook, what does the network's structure resemble in this regime?
Based on your visualizations, identify the key differences between the actual and the random networks. Explain whether these differences are consistent with theoretical expectations.

In [None]:
# Degree of every node in the random network, plus 20 histogram bin edges
degrees_random = [deg for _, deg in RG.degree()]
hist, bins = np.histogram(degrees_random, bins=20, density=True)

# Draw the normalised histogram on an explicit Axes object
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(degrees_random, bins=bins, density=True, alpha=0.7, edgecolor="black")
ax.set_xlabel('Degree')
ax.set_xlim(0, 20)
ax.set_ylabel('Frequency')
ax.set_title('Degree Distribution of Random Network')
ax.grid(True)
plt.show()

# Restore the graph saved in Assignment 1.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("my_network.pickle", "rb") as f:
    G = pickle.load(f)

# Degree of every node in the social network, plus 200 histogram bin edges
degrees_social = [deg for _, deg in G.degree()]
hist, bins = np.histogram(degrees_social, bins=200, density=False)

# Draw the normalised histogram on an explicit Axes object
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(degrees_social, bins=bins, density=True, alpha=0.7, edgecolor="black")
ax.set_xlabel('Degree')
ax.set_xlim(0, 40)
ax.set_ylabel('Frequency')
ax.set_title('Degree Distribution of Social Network')
ax.grid(True)
plt.show()




In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pickle

# Function to generate a random network
def generate_random_network(node_count, p):
    """Build an undirected random graph by testing every node pair once.

    Nodes are labelled ``0 .. node_count - 1``. Each unordered pair
    ``(i, j)`` with ``i < j`` receives an edge when a fresh draw from
    ``np.random.uniform(0, 1)`` is strictly below ``p``.

    Args:
        node_count: Number of nodes to create.
        p: Threshold compared against each uniform draw.

    Returns:
        The resulting ``nx.Graph``.
    """
    network = nx.Graph()
    network.add_nodes_from(range(node_count))

    # The generator walks the pairs row by row and draws exactly one random
    # number per pair, so edges are sampled and inserted in that order.
    sampled_edges = (
        (source, target)
        for source in range(node_count)
        for target in range(source + 1, node_count)
        if np.random.uniform(0, 1) < p
    )
    network.add_edges_from(sampled_edges)
    return network

# Size of the social network and the matching edge probability
amount_of_nodes = 14196
amount_of_edges = 54304
possible_pairs = (amount_of_nodes * (amount_of_nodes - 1)) / 2
probability = amount_of_edges / possible_pairs

# Random counterpart of the social network (fixed seed for reproducibility)
RG = nx.gnp_random_graph(
    amount_of_nodes,
    probability,
    seed=1000,
    directed=False,
)
degrees_random = [deg for _, deg in RG.degree()]

# Social network saved earlier as a pickle.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open("my_network.pickle", "rb") as f:
    G = pickle.load(f)
degrees_social = [deg for _, deg in G.degree()]

# Mean degree of each network
avg_degree_random = np.mean(degrees_random)
avg_degree_social = np.mean(degrees_social)

# Plot the sorted degree sequence of each network on shared axes
plt.figure(figsize=(10, 6))
plt.plot(sorted(degrees_random), label='Random Network')
plt.plot(sorted(degrees_social), label='Social Network')
plt.xlabel('Node Index (Sorted by Degree)')
plt.ylabel('Degree')
plt.title('Degree Distribution Comparison')
plt.grid(True)

# Horizontal dashed reference lines marking each network's average degree
plt.axhline(y=avg_degree_random, color='r', linestyle='--', label=f'Average Degree Random: {avg_degree_random:.2f}')
plt.axhline(y=avg_degree_social, color='g', linestyle='--', label=f'Average Degree Social: {avg_degree_social:.2f}')

plt.xlim(0, len(degrees_random))  # x-axis spans the random network's node count

# Build the legend once, after every labelled line has been added, so that
# the average-degree entries are included.
plt.legend()
plt.show()


# Part 2: Network Analysis in Computational Social Science

In [None]:
# Part 1: Assortativity Coefficient
# This function calculates the Assortativity Coefficient for the network based on the country of each node:
def assortativity_coefficient(G, attribute):
    """Return Newman's assortativity coefficient for a categorical node attribute.

    The graph is treated as undirected: every edge contributes one "end" to
    the attribute value of each of its two endpoints. With

    * ``e_ii`` = fraction of edges whose endpoints share attribute value ``i``
    * ``a_i``  = fraction of edge ends attached to nodes with value ``i``

    the coefficient is::

        r = (sum_i e_ii - sum_i a_i**2) / (1 - sum_i a_i**2)

    ``r`` is 1 when every edge joins nodes with the same value, close to 0
    when edges ignore the attribute, and negative when edges preferentially
    join nodes with different values.

    Args:
        G: Graph-like object exposing ``G.edges()`` (an iterable of
            ``(u, v)`` pairs) and ``G.nodes[node][attribute]``.
        attribute: Name of the node attribute to group by. Its values must
            be hashable.

    Returns:
        The coefficient as a float, or ``nan`` when it is undefined: the
        graph has no edges, or every edge end carries the same value.

    Raises:
        KeyError: If an endpoint of some edge lacks ``attribute``.
    """
    edge_count = 0
    same_value_edges = 0
    end_counts = {}  # attribute value -> number of edge ends carrying it

    for u, v in G.edges():
        value_u = G.nodes[u][attribute]
        value_v = G.nodes[v][attribute]
        edge_count += 1
        if value_u == value_v:
            same_value_edges += 1
        end_counts[value_u] = end_counts.get(value_u, 0) + 1
        end_counts[value_v] = end_counts.get(value_v, 0) + 1

    if edge_count == 0:
        return float("nan")

    total_ends = 2 * edge_count
    # Observed fraction of edges that stay inside one attribute group.
    observed = same_value_edges / edge_count
    # Fraction expected if edge ends were paired at random.
    expected = sum((count / total_ends) ** 2 for count in end_counts.values())

    if expected == 1:
        # Only one attribute value is present, so the ratio would be 0/0.
        return float("nan")
    return (observed - expected) / (1 - expected)

# Assortativity of the loaded network with respect to each node's country
country_attribute = 'country_code'
assortativity = assortativity_coefficient(G, country_attribute)
print("Assortativity coefficient based on country attribute:", assortativity)


# Part 3: Words that characterize Computational Social Science communities

##### Exercise 1: TF-IDF and the Computational Social Science communities

* TF-IDF is a statistical measure used in natural language processing and information retrieval to evaluate the importance of a word in a document within a larger collection of documents. The main purpose of TF-IDF is to highlight words that are significant to a specific document while downplaying words that are common across many documents.
* TF stands for Term Frequency and is simply, as the name suggests, a calculation of the frequency of a term in a specific document. 
* IDF stands for Inverse Document Frequency and is a measure of a term's rarity across all documents in the collection.