New Journal IP Extraction
===

This notebook reads the JSON journal records in the new (2019) dataset and writes their IP metadata to a CSV file.


In [1]:
# Notebook setup: (re)load the autoreload extension and set mode 2, which
# re-imports modules before each cell executes; render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter
import sqlite3
from nltk import word_tokenize
from html.parser import HTMLParser
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [9]:
import subprocess
from pathlib import Path

# Resolve the top-level directory of the git repository containing the current
# working directory. check_output raises CalledProcessError when git exits
# non-zero (e.g. not inside a repository), so a failure cannot end up being
# interpreted as a path.
git_root_dir = Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"],
        universal_newlines=True,
    ).strip()
)
git_root_dir

PosixPath('/panfs/roc/groups/3/srivbane/levon003/repos/sna-social-support')

In [10]:
import sys

# Make the local checkout of caringbridge_core importable (cbcore is imported
# from it). The membership check keeps this cell idempotent: re-running it does
# not append the same entry to sys.path again.
caringbridge_core_path = "/home/srivbane/levon003/repos/caringbridge_core"
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)

In [11]:
import cbcore.data.paths as paths
import cbcore.data.dates as dates

In [12]:
# Location of the raw 2019 data as defined by cbcore (presumably a directory,
# since a filename is joined onto it to build journal_filepath).
raw_data_dir = paths.raw_data_2019_filepath

In [13]:
# Path to journal.json inside raw_data_dir; the extraction loop reads it line
# by line, parsing each line as one JSON document.
journal_filepath = os.path.join(raw_data_dir, "journal.json")

In [14]:
# Output directory for this notebook; created (including parents) if missing.
working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/geo_data"
os.makedirs(working_dir, exist_ok=True)
# Fail fast if the directory still does not exist.
assert os.path.exists(working_dir)

### Load and convert journal file

In [15]:
# Destination CSV for the extracted rows (written without a header line).
output_filepath = os.path.join(working_dir, "journal_ip_raw.csv")

In [16]:
def extract_long(json_value):
    """Coerce a JSON-decoded numeric field to a Python int.

    Dispatch is on the exact type of ``json_value`` (subclasses such as bool
    are not treated as int):

    * int   -> returned unchanged
    * str   -> parsed with ``int()``
    * dict  -> the value stored under ``'$numberLong'`` is parsed with ``int()``
    * float -> truncated with ``int()``, except 0.0 (see below)

    Returns:
        The integer value, or None when the value is not an int/str/dict and
        compares equal to 0.0.

    Raises:
        ValueError: if the value has any other type (or a str/float that
            ``int()`` cannot convert).
    """
    value_type = type(json_value)

    if value_type is int:
        return json_value
    if value_type is str:
        return int(json_value)
    if value_type is dict:
        return int(json_value['$numberLong'])

    # Special case seen for userId: a 0.0 is treated as no value at all
    # (assumed to mean "missing/unknown" or a deleted user -- unconfirmed).
    # This must be tested before the generic float branch.
    if json_value == 0.0:
        return None
    if value_type is float:
        return int(json_value)

    raise ValueError(f"Type '{type(json_value)}' not implemented: {str(json_value)}")

In [18]:
# Dotted-quad shape check for IPv4 addresses (octet ranges are not validated).
# Compiled once outside the loop and applied with fullmatch(): unlike match()
# with a trailing "$", fullmatch() also rejects a value that ends in "\n",
# which would otherwise inject a line break into the CSV output.
ipv4_pattern = re.compile(r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+")

# Values of journal['ip'] that failed the shape check, kept for inspection.
bad_ips = []
with open(output_filepath, 'w') as outfile, \
        open(journal_filepath, encoding='utf-8') as infile:
    # Each input line is one JSON document. `total` is a fixed line count that
    # only feeds the progress bar.
    for line in tqdm(infile, total=19137078):
        journal = json.loads(line)

        # Journals without an "ip" key are skipped and are not recorded as bad.
        if "ip" not in journal:
            continue
        ip = journal['ip']
        if not ipv4_pattern.fullmatch(ip):
            bad_ips.append(ip)
            continue

        journal_oid = journal['_id']['$oid']
        site_id = extract_long(journal['siteId'])
        # extract_long returns None for a 0.0 value; that is written out as the
        # literal text "None" in the user_id column.
        user_id = extract_long(journal['userId'])

        created_at = dates.get_date_from_json_value(journal['createdAt'])
        updated_at = dates.get_date_from_json_value(journal['updatedAt'])

        # Headerless CSV row: user_id, ip, site_id, journal_oid, created_at, updated_at.
        outfile.write(f"{user_id},{ip},{site_id},{journal_oid},{created_at},{updated_at}\n")

100%|██████████| 19137078/19137078 [28:25<00:00, 11221.57it/s]


In [19]:
# Number of journals whose "ip" value was rejected by the IPv4 pattern check.
len(bad_ips)

170220

In [20]:
# The ten most frequent rejected "ip" values with their counts. Passing n to
# most_common() selects the top entries directly instead of sorting every
# distinct value and slicing afterwards; the result is the same.
Counter(bad_ips).most_common(10)

[('', 159585),
 ('unknown', 2789),
 ('70.41.106.10, 1', 205),
 ('10.0.0.109, unk', 204),
 ('10.0.0.75, unkn', 149),
 ('10.0.0.108, unk', 100),
 ('127.0.0.1, 127.', 73),
 ('10.56.26.39, 10', 71),
 ('unknown, 204.18', 57),
 ('192.168.3.22, 1', 54)]

In [21]:
# Count the lines in the output CSV (one line is written per journal kept).
!wc -l {output_filepath}

13255023 /home/srivbane/shared/caringbridge/data/projects/sna-social-support/geo_data/journal_ip_raw.csv
