<style>
div.output_area pre {
    margin: 0;
    padding: 1px 0 1px 0;
    border: 0;
    vertical-align: baseline;
    color: black;
    background-color: transparent;
    border-radius: 0;
    height: auto;
    max-height: 350px;
    overflow: auto;
}
</style>

# 資料變項說明

[資料下載](https://minhaskamal.github.io/DownGit/#/home?url=https://github.com/liao961120/collegeSNA/blob/master/network_data)

#### [`ntuNetwork_edges.csv`](https://github.com/liao961120/collegeSNA/blob/master/network_data/ntuNetwork_edges.csv)
1. `node1`, `node2`: 臺大各校系的 ID
1. `edgeWeight`: 同時申請 `node1` 和 `node2` 的數量


#### [`ntuNetwork_attr.csv`](https://github.com/liao961120/collegeSNA/blob/master/network_data/ntuNetwork_attr.csv)

1. `id`: 對應到 `ntuNetwork_edges.csv` 中的 `node1`, `node2`
1. `department`: 臺灣大學各系
1. `recruit_quota`: 招生名額 (外加名額已併入)
1. `applied_region`: 申請者來自的地區 (依准考證號碼)。這個變項我沒有再繼續處理下去 (如果有分析上的需要，可以用 R 處理成可用的資訊?)


In [1]:
import json
# Parse the scraped network file into `data`; the file is opened in binary mode
# and handed straight to json.load.
with open('college_network/ntuNetwork.json', 'rb') as json_file:
    data = json.load(json_file)

## Data Cleaning

In [2]:
def clean_record(record):
    """Return a cleaned copy of one scraped department record.

    The input record is not modified. In the returned dict:

    * 'college' and 'department' hold the first element of the original lists
      ('department' with '\\xa0' characters removed).
    * 'id' is ``[url suffix, college + ' ' + department]``, where the URL suffix
      is the first 'url' entry with the site prefix stripped.
    * 'department_attr' entries are each label joined with the value at the same
      position in 'department_attr_val' ('\\xa0' removed).
    * 'overlap_college' entries are each name joined with the value at the same
      position in 'overlap_college_num', followed by the last
      'overlap_college_num' element appended on its own.
    * 'applied_region' is the list joined with ', ' and stripped of the scraped
      whitespace run and of ': '.
    * the helper keys 'department_attr_val' and 'overlap_college_num' are removed.

    A record in which both helper keys are already absent is treated as
    already cleaned and returned unchanged, so re-running the cell is harmless.
    """
    if 'department_attr_val' not in record and 'overlap_college_num' not in record:
        return record

    cleaned = dict(record)  # shallow copy; existing keys keep their positions
    cleaned['college'] = record['college'][0]
    cleaned['department'] = record['department'][0].replace('\xa0', '')
    cleaned['id'] = [
        record['url'][0].replace('https://freshman.tw/cross/104/', ''),
        cleaned['college'] + ' ' + cleaned['department'],
    ]

    # Index into the value lists (rather than zip) so a too-short value list
    # still raises IndexError instead of silently dropping entries.
    attr_values = record['department_attr_val']
    cleaned['department_attr'] = [
        (label + attr_values[pos]).replace('\xa0', '')
        for pos, label in enumerate(record['department_attr'])
    ]

    overlap_counts = record['overlap_college_num']
    overlaps = [
        name + overlap_counts[pos]
        for pos, name in enumerate(record['overlap_college'])
    ]
    # The count list carries one trailing element with no matching name; it is
    # kept as a stand-alone entry.
    overlaps.append(overlap_counts[-1])
    cleaned['overlap_college'] = overlaps

    cleaned['applied_region'] = (
        ', '.join(record['applied_region'])
        .replace('\t\t\t    \r\n\t\t\t  ', '')
        .replace(': ', '')
    )

    for helper_key in ('department_attr_val', 'overlap_college_num'):
        del cleaned[helper_key]

    return cleaned


# Replace each record in place so `data` stays the same list object.
for i in range(len(data)):
    data[i] = clean_record(data[i])

data[1]

{'college': '國立臺灣大學',
 'department': '護理學系(公費生)',
 'department_attr': ['招生名額：2',
  '實際錄取：1',
  '預計甄試人數：6',
  '實際甄試人數：8',
  '第二階段正取人數：2',
  '第二階段備取人數：0'],
 'overlap_college': ['臺北醫學大學 護理學系(公費生): 3', '其他(3個以下的校系): 10'],
 'applied_region': '台北x3, 桃園x1, 新竹x1, 台中x1, 宜蘭x1',
 'url': ['https://freshman.tw/cross/104/001602'],
 'rtrv_date': ['2019-03-24 11:14:24'],
 'id': ['001602', '國立臺灣大學 護理學系(公費生)']}

### Retrieve Node ID Lookup Table

In [3]:
# Lookup table: every record's 'id' value, sorted with Python's default list
# ordering (element by element).
node_id = sorted(record['id'] for record in data)

def lookup_id(name):
    """Return the ID paired with `name` in the module-level `node_id` table.

    `node_id` is scanned in order; each entry is read as ``entry[0]`` (the ID
    that is returned) and ``entry[1]`` (the value compared against `name`).
    The first matching entry wins. Returns None when no entry matches.
    """
    return next((entry[0] for entry in node_id if name == entry[1]), None)

## Construct Network Data

In [4]:
# Build the edge list: one [{id_a, id_b}, weight] entry per overlap string whose
# name part resolves to a known ID.
network_data = []

for record in data:
    for entry in record['overlap_college']:
        # Text before the first ':' is the name; text after it is the count.
        colon_pos = entry.find(':')

        source_id = record['id'][0]
        target_id = lookup_id(entry[:colon_pos])
        weight = int(entry[colon_pos + 1:])

        # Names that lookup_id cannot resolve are left out of the network.
        # An earlier draft considered linking the '其他(3個以下的校系)' entry to an
        # 'others' node; that idea was dropped, so it is skipped as well.
        if target_id is not None:
            network_data.append([{source_id, target_id}, weight])

network_data

[[{'001012', '001032'}, 16],
 [{'001012', '001022'}, 7],
 [{'001012', '001082'}, 3],
 [{'001012', '001162'}, 3],
 [{'001012', '001062'}, 3],
 [{'001012', '001422'}, 3],
 [{'001012', '001072'}, 3],
 [{'001312', '001592'}, 10],
 [{'001112', '001592'}, 9],
 [{'001362', '001592'}, 9],
 [{'001582', '001592'}, 8],
 [{'001252', '001592'}, 6],
 [{'001132', '001592'}, 5],
 [{'001542', '001592'}, 5],
 [{'001342', '001592'}, 3],
 [{'001282', '001592'}, 3],
 [{'001422', '001592'}, 3],
 [{'001352', '001592'}, 3],
 [{'001372', '001592'}, 3],
 [{'001282', '001582'}, 9],
 [{'001372', '001582'}, 8],
 [{'001582', '001592'}, 8],
 [{'001402', '001582'}, 6],
 [{'001352', '001582'}, 5],
 [{'001412', '001582'}, 5],
 [{'001522', '001582'}, 4],
 [{'001382', '001582'}, 4],
 [{'001272', '001582'}, 3],
 [{'001342', '001582'}, 3],
 [{'001132', '001582'}, 3],
 [{'001292', '001582'}, 3],
 [{'001452', '001582'}, 3],
 [{'001562', '001572'}, 23],
 [{'001552', '001572'}, 18],
 [{'001492', '001572'}, 16],
 [{'001482', '0

## Remove Duplicate Links

In [5]:
# Drop repeated dyads from `network_data` in place, keeping the first occurrence
# of each node pair.
#
# Every index is recorded at most once. A pairwise scan would record the same
# index several times when a dyad occurs three or more times, and the deletion
# loop below would then remove unrelated links or raise IndexError.
seen_dyads = set()
idx_to_remove = []
for idx, link in enumerate(network_data):
    dyad = frozenset(link[0])  # hashable form of the node-pair set
    if dyad in seen_dyads:
        idx_to_remove.append(idx)
    else:
        seen_dyads.add(dyad)

# Indices were collected in ascending order; delete from the back so earlier
# deletions do not shift the positions still to be removed.
for idx in reversed(idx_to_remove):
    del network_data[idx]

network_data

[[{'001012', '001032'}, 16],
 [{'001012', '001022'}, 7],
 [{'001012', '001082'}, 3],
 [{'001012', '001162'}, 3],
 [{'001012', '001062'}, 3],
 [{'001012', '001422'}, 3],
 [{'001012', '001072'}, 3],
 [{'001312', '001592'}, 10],
 [{'001112', '001592'}, 9],
 [{'001362', '001592'}, 9],
 [{'001582', '001592'}, 8],
 [{'001252', '001592'}, 6],
 [{'001132', '001592'}, 5],
 [{'001542', '001592'}, 5],
 [{'001342', '001592'}, 3],
 [{'001282', '001592'}, 3],
 [{'001422', '001592'}, 3],
 [{'001352', '001592'}, 3],
 [{'001372', '001592'}, 3],
 [{'001282', '001582'}, 9],
 [{'001372', '001582'}, 8],
 [{'001402', '001582'}, 6],
 [{'001352', '001582'}, 5],
 [{'001412', '001582'}, 5],
 [{'001522', '001582'}, 4],
 [{'001382', '001582'}, 4],
 [{'001272', '001582'}, 3],
 [{'001342', '001582'}, 3],
 [{'001132', '001582'}, 3],
 [{'001292', '001582'}, 3],
 [{'001452', '001582'}, 3],
 [{'001562', '001572'}, 23],
 [{'001552', '001572'}, 18],
 [{'001492', '001572'}, 16],
 [{'001482', '001572'}, 10],
 [{'001462', '

## Write Data

### Edge Data

In [6]:
# Write one "node1,node2,edgeWeight" row per link.
# Each dyad is a set, whose iteration order for strings can differ between
# interpreter runs; sorting the two IDs makes the column order reproducible.
with open('network_data/ntuNetwork_edges.csv', 'w', encoding='utf-8') as f:
    f.write('node1,node2,edgeWeight\n')
    for dyad, weight in network_data:
        f.write(','.join(sorted(dyad)) + ',' + str(weight) + '\n')

### Attribute Data

In [7]:
# Write one attribute row per record: id, department, recruit_quota, applied_region.
# The encoding is set explicitly because the rows contain non-ASCII text and the
# default for open() depends on the platform.
with open('network_data/ntuNetwork_attr.csv', 'w', encoding='utf-8') as f:
    f.write('id,department,recruit_quota,applied_region\n')

    for dic in data:
        # recruit_quota: take the first department_attr entry, skip its first
        # five characters (the '招生名額：' label in the sample output) and drop
        # the '(外加)' marker.
        quota_expr = dic['department_attr'][0][5:].replace('(外加)', '')
        # NOTE(review): eval() executes scraped text as Python. Presumably the
        # remaining text is a number or a sum such as '40+2' -- TODO confirm and
        # replace with an explicit parser before running on untrusted input.
        recruit_quota = str(eval(quota_expr))

        # applied_region contains commas, so it is wrapped in double quotes;
        # the other fields are written unquoted.
        row = ','.join([
            dic['id'][0],
            dic['department'],
            recruit_quota,
            '"' + dic['applied_region'] + '"',
        ])
        f.write(row + '\n')