In [1]:
!pip install --user networkx[default]



In [222]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
cip_soc = pd.read_csv("CIP_SOC.csv", encoding='cp1252')
cip_soc_matches = cip_soc[cip_soc['CIP2020Title'] != "NO MATCH"]
cip_soc_matches = cip_soc_matches.astype(str)
cip_soc_matches

Unnamed: 0,CIP2020Code,CIP2020Title,SOC2018Code,SOC2018Title
0,1.0,"Agriculture, General.",19-1011,Animal Scientists
1,1.0,"Agriculture, General.",19-1012,Food Scientists and Technologists
2,1.0,"Agriculture, General.",19-1013,Soil and Plant Scientists
3,1.0,"Agriculture, General.",19-4012,Agricultural Technicians
4,1.0,"Agriculture, General.",25-1041,"Agricultural Sciences Teachers, Postsecondary"
...,...,...,...,...
5912,61.2899,"Urology Residency/Fellowship Programs, Other.",29-1249,"Surgeons, All Other"
5913,61.9999,"Medical Residency/Fellowship Programs, Other.",19-1042,"Medical Scientists, Except Epidemiologists"
5914,61.9999,"Medical Residency/Fellowship Programs, Other.",25-1071,"Health Specialties Teachers, Postsecondary"
5915,61.9999,"Medical Residency/Fellowship Programs, Other.",29-1229,"Physicians, All Other"


In [3]:
cip_soc_np = cip_soc_matches.to_numpy()
match_edges = [(str(row[0]), str(row[2])) for row in cip_soc_np]
len(match_edges)

5917

In [7]:
G = nx.Graph()
G.add_edges_from(match_edges)
    

In [113]:
#nx.draw(G) # Not helpful

In [286]:
from networkx.algorithms.community import greedy_modularity_communities, louvain
communities = louvain.louvain_communities(G, resolution=15) #better split than grreedy


In [287]:
help(louvain)

Help on module networkx.algorithms.community.louvain in networkx.algorithms.community:

NAME
    networkx.algorithms.community.louvain

DESCRIPTION
    Function for detecting communities based on Louvain Community Detection
    Algorithm

FUNCTIONS
    louvain_communities(G, weight='weight', resolution=1, threshold=1e-07, seed=None)
        Find the best partition of a graph using the Louvain Community Detection
        Algorithm.
        
        Louvain Community Detection Algorithm is a simple method to extract the community
        structure of a network. This is a heuristic method based on modularity optimization. [1]_
        
        The algorithm works in 2 steps. On the first step it assigns every node to be
        in its own community and then for each node it tries to find the maximum positive
        modularity gain by moving each node to all of its neighbor communities. If no positive
        gain is achieved the node remains in its original community.
        
        Th

In [288]:
len(communities)

190

In [289]:
for community in communities:
    print(len(community))

11
37
16
13
16
18
195
4
36
26
4
18
3
4
22
17
24
25
15
6
55
65
10
6
30
25
19
14
23
7
20
8
20
16
11
2
16
3
15
11
2
22
27
23
24
9
22
5
14
9
30
12
62
24
15
14
10
30
2
23
10
34
2
10
12
2
39
11
9
6
31
11
5
9
25
11
6
19
15
21
36
10
3
6
17
20
16
2
45
17
13
27
12
36
2
23
39
7
6
2
2
2
6
2
2
2
2
3
26
2
2
12
2
4
3
3
6
9
2
2
6
35
10
42
2
7
22
3
3
4
2
2
2
2
9
18
2
2
2
2
6
3
2
23
2
23
2
2
2
2
6
10
8
10
18
9
19
45
19
4
25
12
2
2
4
34
11
2
2
12
9
3
13
34
19
5
79
17
2
15
15
26
29
3
15
15
8
5
16
18


In [277]:
#First make sure no single codes are left without community
import random
n = random.randint(0, len(communities)-1)
for code in communities[n]:
    name = None
    code_type ="CIP"
    if "." in code:
        aoi = cip_soc[cip_soc['CIP2020Code'].astype(str) == str(code)]['CIP2020Title'].reset_index(drop=True)
        name = aoi[0]
    elif "-" in code:
        aoi =  cip_soc[cip_soc['SOC2018Code'] == code]['SOC2018Title'].reset_index(drop=True)
        name = aoi[0]
        code_type = "SOC"
    print(code, code_type, name)

9.0499 CIP Journalism, Other.
13-1131 SOC Fundraisers
9.0903 CIP Advertising.
9.0406 CIP Cultural Journalism.
9.0701 CIP Radio and Television.
9.0904 CIP Political Communication.
9.01 CIP Communication, General.
27-3031 SOC Public Relations Specialists
11-2033 SOC Fundraising Managers
9.0102 CIP Mass Communication/Media Studies.
9.09 CIP Public Relations, Advertising, and Applied Communication.
11-2032 SOC Public Relations Managers
9.0402 CIP Broadcast Journalism.
25-1122 SOC Communications Teachers, Postsecondary
9.0907 CIP International and Intercultural Communication.
9.0101 CIP Speech Communication and Rhetoric.
9.0905 CIP Health Communication.
27-3011 SOC Broadcast Announcers and Radio Disc Jockeys
52.0501 CIP Business/Corporate Communications, General.
9.0902 CIP Public Relations/Image Management.
11-2011 SOC Advertising and Promotions Managers
27-3023 SOC News Analysts, Reporters, and Journalists
9.0909 CIP Communication Management and Strategic Communications.
52.0502 CIP Grant

In [296]:
rows = []
for idx in range(len(communities)):
    cips = []
    socs = []
    for code in communities[idx]:
        name = None
        code_type ="CIP"
        if "." in code:
            aoi = cip_soc[cip_soc['CIP2020Code'].astype(str) == str(code)]['CIP2020Title'].reset_index(drop=True)
            name = aoi[0]
            cips.append({"code": code, "name": name})
        elif "-" in code:
            aoi =  cip_soc[cip_soc['SOC2018Code'] == code]['SOC2018Title'].reset_index(drop=True)
            name = aoi[0]
            code_type = "SOC"
            socs.append({"code": code, "name": name})
    rows.append({"idx":idx, "cips":cips, "socs":socs})

In [297]:
rows[:5]

[{'idx': 0,
  'cips': [{'code': '26.0301', 'name': 'Botany/Plant Biology.'},
   {'code': '26.0399', 'name': 'Botany/Plant Biology, Other.'},
   {'code': '1.0', 'name': 'Agriculture, General.'},
   {'code': '1.1199', 'name': 'Plant Sciences, Other.'},
   {'code': '1.1104',
    'name': 'Agricultural and Horticultural Plant Breeding.'},
   {'code': '1.1202', 'name': 'Soil Chemistry and Physics.'},
   {'code': '1.1299', 'name': 'Soil Sciences, Other.'},
   {'code': '26.0308', 'name': 'Plant Molecular Biology.'},
   {'code': '26.0307', 'name': 'Plant Physiology.'},
   {'code': '26.0305', 'name': 'Plant Pathology/Phytopathology.'}],
  'socs': [{'code': '19-1013', 'name': 'Soil and Plant Scientists'}]},
 {'idx': 1,
  'cips': [{'code': '1.1105',
    'name': 'Plant Protection and Integrated Pest Management.'},
   {'code': '1.0101',
    'name': 'Agricultural Business and Management, General.'},
   {'code': '1.0304', 'name': 'Crop Production.'},
   {'code': '1.0308', 'name': 'Agroecology and Sust

In [298]:
df = pd.DataFrame(rows)

In [302]:
import json 
file_path = "uoa.json"
with open(file_path, "w") as json_file:
    json.dump(rows, json_file, indent=4)