In [1]:
import pandas as pd

df = pd.read_csv('Fraud_Data.csv', parse_dates=['purchase_time', 'signup_time'])

In [8]:
examples_in = [
    732758368,
    350311387,
    2621473820,
    3840542443,
    415583117,
]

In [9]:
example_out = [
    '43.173.1.96',
    '20.225.83.219',
    '156.64.132.28',
    '228.234.6.235',
    '24.197.75.141',
]

In [31]:
def convert_to_standard(ip_address_int):
    """
    Convert integer format IP addres to the standard 4 octets
    """
    octets = []
    
    ip_address_hex = hex(ip_address_int)
    for index in range(2, 10, 2):
        part = ip_address_hex[index:index + 2]
        octet = int(part, 16)
        octets.append(str(octet))
    
    return ".".join(octets)

In [32]:
test_cases = zip(examples_in, example_out)

for trial, ans in test_cases:
    try:
        assert ans == convert_to_standard(trial)
    except AssertionError:
        print(trial, ans)

In [30]:
convert_to_standard(int(df['ip_address'].iloc[0]))

'43.186.173.208.1.22.96.0'

In [33]:
int(df['ip_address'].iloc[0])

732758368

In [34]:
convert_to_standard(examples_in[0])

'43.173.1.96'

In [45]:
def convert(x):
    x = int(x)
    
    try:
        return convert_to_standard(x)
    except ValueError:
        return None

In [48]:
df['ip_address'].apply(convert).isnull().sum()

634

In [41]:
df['ip_address'].isnullll().sum()

0

In [50]:
100 * 600 / df['ip_address'].count()

0.39705648790301235

In [51]:
df_out = df.assign(ip_address_standard=lambda df: df['ip_address'].apply(convert))

In [52]:
df_out.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_address_standard
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,43.173.1.96
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,20.225.83.219
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,156.64.132.28
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,228.234.6.235
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,24.197.75.141


In [54]:
import requests

ip_address = df_out['ip_address_standard'].iloc[0]

response = requests.get(f'https://ipapi.co/{ip_address}/json/')

In [55]:
response.status_code

200

In [56]:
response.json()

{'ip': '43.173.1.96',
 'version': 'IPv4',
 'city': 'Haidian',
 'region': 'Beijing',
 'region_code': 'BJ',
 'country': 'CN',
 'country_name': 'China',
 'country_code': 'CN',
 'country_code_iso3': 'CHN',
 'country_capital': 'Beijing',
 'country_tld': '.cn',
 'continent_code': 'AS',
 'in_eu': False,
 'postal': None,
 'latitude': 39.9881,
 'longitude': 116.2846,
 'timezone': 'Asia/Shanghai',
 'utc_offset': '+0800',
 'country_calling_code': '+86',
 'currency': 'CNY',
 'currency_name': 'Yuan Renminbi',
 'languages': 'zh-CN,yue,wuu,dta,ug,za',
 'country_area': 9596960.0,
 'country_population': 1392730000,
 'asn': None,
 'org': None}

In [67]:
dir(df_out['ip_address_standard'].str)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__frozen',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_doc_args',
 '_freeze',
 '_get_series_list',
 '_inferred_dtype',
 '_is_categorical',
 '_is_string',
 '_make_accessor',
 '_orig',
 '_parent',
 '_validate',
 '_wrap_result',
 'capitalize',
 'casefold',
 'cat',
 'center',
 'contains',
 'count',
 'decode',
 'encode',
 'endswith',
 'extract',
 'extractall',
 'find',
 'findall',
 'fullmatch',
 'get',
 'get_dummies',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'islower',
 'isnumeric',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'len',
 'ljust',
 'lower',
 'lstrip',
 'match',
 'normalize',
 'pad',
 

In [71]:
df_out = df_out.assign(first_octet=lambda df: df['ip_address_standard'].str.extract(r'^(\d+)\.'))

In [64]:
df_out['ip_address_standard'].iloc[:5]

0      43.173.1.96
1    20.225.83.219
2    156.64.132.28
3    228.234.6.235
4    24.197.75.141
Name: ip_address_standard, dtype: object

In [72]:
df_out['first_octet'].nunique()

240

In [73]:
# Create a mapping between the first part of the octet to the country

In [78]:
df_out.drop_duplicates('first_octet')

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_address_standard,first_octet
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0,43.173.1.96,43
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0,20.225.83.219,20
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1,156.64.132.28,156
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0,228.234.6.235,228
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0,24.197.75.141,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,181644,2015-01-06 21:15:53,2015-01-06 21:15:54,40,LSPKMBDJHUIXL,SEO,Chrome,M,47,6.054872e+07,1,57.190.102.12,57
1292,206628,2015-02-18 16:37:16,2015-02-24 19:45:05,29,HQYYAWLAWESBP,Ads,Chrome,F,37,2.252022e+08,0,214.196.254.9,214
1570,106708,2015-08-16 15:57:32,2015-10-02 00:48:02,11,YRTUQWLPEFIFU,Ads,FireFox,F,21,5.768338e+08,0,34.97.201.31,34
2385,3026,2015-08-03 17:02:53,2015-08-30 15:43:59,25,VZBPOYFPCVHSU,Ads,Chrome,F,38,7.584825e+08,0,45.53.134.94,45


In [79]:
# 1. iterate over this partial result
# 2. get each IP address
# 3. use the API to look up the country
# 4. save the result to a data structure
# e.g. {first_octet: country}

In [85]:
import time

In [87]:
def lookup_country(ip_address):
    time.sleep(2.5)
    response = requests.get(f'https://ipapi.co/{ip_address}/json/')
    
    if response.ok:
        return response.json()['country_name']
    else:
        return None

In [90]:
first_octet_to_country = {}

for row in df_out.drop_duplicates('first_octet').itertuples():
    print(row.ip_address_standard)
    first_octet_to_country[row.first_octet] = lookup_country(row.ip_address_standard)

43.173.1.96
20.225.83.219
156.64.132.28
228.234.6.235


KeyError: 'country_name'

In [91]:
first_octet_to_country

{'43': 'China', '20': 'Canada', '156': 'United States'}

## Decorators

In [97]:
def sub(a, b):
    return a - b


def prob(a, b):
    return a * b

In [96]:
prob(-1, 2)

-2

In [93]:
sub(1, 2)

-1

In [94]:
# what if I never want a non-negative number
# of the result of any arithmatic operation

In [98]:
# what if I want to modify the behavior of these functions
# such that they return 0 if the result is supposed to be negative

In [100]:
def sub_mod(a, b):
    result = a - b
    
    if result < 0:
        return 0
    else:
        return result
    

def prob_sub(a, b):
    result = a * b
    
    if result < 0:
        return 0
    else:
        return result

In [101]:
def enforce_non_negative(result):
    if result < 0:
        return 0
    else:
        return result

In [102]:
def enforce_non_negative(func):
    def wrapper(a, b):
        result = func(a, b)
        
        if result < 0:
            return 0
        else:
            return result
    
    return wrapper

In [103]:
sub_decorated = enforce_non_negative(sub)

In [104]:
sub_decorated(1, 2)

0

In [105]:
@enforce_non_negative
def sub(a, b):
    return a - b

# sub = enforce_non_negative(sub)