In [2]:
import pandas as pd
import numpy as np
import json
import glob, os, subprocess
import geocoder
import time
import requests
import inspect


In [3]:
logs = pd.read_csv('/home/kin672/gentb-summer22/Geography/gentb.haproxy.log', sep="\t", header=None)
logs.columns = ["original"]

In [5]:
# Isolate the ip addresses
temp_ip = logs['original'].str.split(':',expand=True)
logs['ip'] = ''
for index, row in temp_ip.iterrows():
    row = np.array(row)
    check = 'messages' in row[0]
    if check:
        ip = row[4]
    else:
        ip = row[3]
    if ip == '':
        print(row)
    ip = ip.replace(' ', '')
    logs.at[index, 'ip'] = ip

In [4]:
# Isolate the UUIDs
temp_uuid= logs['original'].str.split('/',expand=True)
logs['uuid'] = ''
for index, row in temp_uuid.iterrows():
    row = np.array(row)
    row_len = np.array([len(str(x)) for x in row])
    uuids = list(row[np.where(row_len == 32)])
    if len(uuids) == 1:
        uuid = uuids[0]
        if ' ' in uuid or '=' in uuid or '-' in uuid:
            uuid = 'No UUID'
        logs.at[index, 'uuid'] = uuid
    elif len(uuids) == 0:
        logs.at[index, 'uuid'] = 'No UUID'
    else:
        logs.at[index, 'uuid'] = 'Too many UUIDs?'

In [None]:
# Download the unique ips as a csv
pd.DataFrame(logs.ip.unique()).to_csv('/home/kin672/gentb-summer22/Geography/unique_ips.csv', index = False)

In [None]:
# Convert a list of ip addresses to full set of location info [NOT WORKING]
def get_locations(fNames):

    latlng = []
    country = []
    region = []
    city = []
    
    for ip in fNames:
        try:
            proc = subprocess.Popen(f"curl ipinfo.io/{ip}?token=80efdb52ccb9b2", shell=True, encoding='utf8', stdout=subprocess.PIPE)
            output = proc.communicate()[0]

            latlng.append(output.loc)
            country.append(output.country)
            region.append(output.region)
            city.append(output.city)
        except:
            latlng.append('Failed')
            country.append('Failed')
            region.append('Failed')
            city.append('Failed')
        
    return pd.DataFrame({"IP": fNames, "LatLng": latlng, 'Country': country, 'region': region, 'city' : city})


In [None]:
# Alternative method if you have enough calls to ipinfo (how to use API key?) 
ip_guide = {}
for ip in logs.ip.unique():
    temp = {'latlng':[], 'country':[], 'city':[], 'region': []}
    geoitem = geocoder.ip(ip)
    time.sleep(30)
    temp['latlng'] = geoitem.latlng
    temp['country'] = geoitem.country
    temp['city'] = geoitem.city
    temp['region'] = geoitem.region
    ip_guide[ip] = temp

In [None]:
# Sanjana's Method that actually works
# Pulling location info from IPs
unique_ips = pd.read_csv("unique_ips.csv", header=None)
unique_ips.columns = ["ip"]

addresses = []

with requests.Session() as session:
    for _, row in unique_ips.iterrows():
        
        # store addresses so that they don't have to be searched again
        addresses.append(geocoder.ip(row["ip"], session=session))
        
        # pause to reduce the likelihood of having too many requests per minute
        time.sleep(0.1)
        
        
for i, row in unique_ips.iterrows():

    try:
        lat_lng = addresses[i].latlng
        unique_ips.loc[i, "lat"] = lat_lng[0]
        unique_ips.loc[i, "lng"] = lat_lng[1]

        unique_ips.loc[i, "location"] = ", ".join([addresses[i].city, addresses[i].state, addresses[i].country])
    except:
        print(addresses[i])
        unique_ips.loc[i, "location"] = None
        unique_ips.loc[i, "lat"] = np.nan
        unique_ips.loc[i, "lng"] = np.nan

In [6]:
ip_to_location = pd.read_csv('/home/kin672/gentb-summer22/Geography/unique_ips_geocoder.csv')

Unnamed: 0,ip,location,lat,lng
0,40.77.167.99,"Boydton, Virginia, US",36.6676,-78.3875
1,40.77.167.67,"Boydton, Virginia, US",36.6676,-78.3875
2,84.88.186.196,"Barcelona, Catalonia, ES",41.3888,2.1590
3,52.43.50.21,"Boardman, Oregon, US",45.8399,-119.7006
4,145.255.74.252,"Muscat, Muscat, OM",23.5841,58.4078
...,...,...,...,...
611,120.235.173.52,"Shenzhen, Guangdong, CN",22.5455,114.0683
612,51.158.108.61,"Paris, Île-de-France, FR",48.8534,2.3488
613,17.121.112.23,"Dearing, Kansas, US",37.0587,-95.7133
614,207.244.224.209,"St. Louis, Missouri, US",38.6273,-90.1979


In [7]:
logs[:, ['uuid', 'ip']].to_csv

Unnamed: 0,original,uuid,ip
0,messages-20220703.gz:Jun 26 03:48:37 dmzlb-hap...,No UUID,40.77.167.99
1,messages-20220703.gz:Jun 26 03:48:48 dmzlb-hap...,No UUID,40.77.167.67
2,messages-20220703.gz:Jun 26 04:31:26 dmzlb-hap...,No UUID,40.77.167.99
3,messages-20220703.gz:Jun 26 04:31:32 dmzlb-hap...,No UUID,40.77.167.67
4,messages-20220703.gz:Jun 26 05:02:44 dmzlb-hap...,No UUID,40.77.167.67
...,...,...,...
88861,Jul 27 14:38:36 dmzlb-haproxy-prod01 haproxy[2...,No UUID,207.244.224.209
88862,Jul 27 14:38:37 dmzlb-haproxy-prod01 haproxy[2...,No UUID,207.244.224.209
88863,Jul 27 14:41:37 dmzlb-haproxy-prod01 haproxy[2...,No UUID,40.77.167.66
88864,Jul 27 15:28:48 dmzlb-haproxy-prod01 haproxy[2...,No UUID,54.221.184.6
