# Identifying Fraudulent Activities

In [1]:
import pandas as pd
import numpy as np


## Background

You only have information about each user's first transaction on the site, and based on that you
have to make your classification ("fraud/no fraud").

## Objective

To build a machine learning model that predicts the probability that the first transaction of a new user is fraudulent.

1. Determine each user's country based on the numeric IP address.
2. Build a predictive model and explain the cost of false positives vs. false negatives.
3. What kinds of users are more likely to be classified as at risk? What are their characteristics?
4. From a product perspective, how would you use it?

In [2]:
# Load both input tables: the per-user transaction records and the
# IP-range -> country lookup table.
fraud_data, ip_to_country_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Fraud_Data.csv', 'IpAddress_to_Country.csv')
)

In [3]:
# Preview the first five transaction records.
fraud_data.head(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [4]:
# Preview the first five IP-range -> country rows.
ip_to_country_data.head(n=5)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [5]:
# Sanity check: every user_id appears exactly once in this table.
user_ids = fraud_data.user_id
print(user_ids.size == user_ids.unique().size)
# NOTE(review): both columns come from the same frame, so their lengths are
# necessarily equal -- confirm what this second check was meant to verify.
print(user_ids.size == fraud_data.ip_address.size)

True
True


In [24]:
# Step 1: take each value in the ip_address column and look up which country's IP range it belongs to
# create a column called country
# add it back to the table

def mapping_ip_to_country(df1, df2):
    """Look up the country for every IP address in ``df1``.

    Parameters
    ----------
    df1 : DataFrame
        Must have a numeric ``ip_address`` column.
    df2 : DataFrame
        Must have ``lower_bound_ip_address``, ``upper_bound_ip_address``
        (both bounds inclusive) and ``country`` columns.

    Returns
    -------
    list
        One entry per row of ``df1``, in row order: the ``country`` of the
        range that contains the IP, or ``np.nan`` when no range contains it.
        If several ranges contain the same IP, the first one in ``df2`` row
        order wins.

    Neither input frame is modified.
    """
    ips = df1.ip_address.to_numpy()
    lower = df2.lower_bound_ip_address.to_numpy()
    upper = df2.upper_bound_ip_address.to_numpy()
    country = df2.country.to_numpy()

    # No ranges at all: nothing can match.
    if len(country) == 0:
        return [np.nan] * len(ips)

    # Sort the ranges by lower bound so each IP can be located with a binary
    # search instead of a full scan of the range table.
    order = np.argsort(lower, kind='stable')
    lower_sorted = lower[order]
    upper_sorted = upper[order]
    country_sorted = country[order]

    # The binary search is only valid when the sorted ranges are strictly
    # disjoint (each range ends before the next one starts). Otherwise (this
    # includes NaN bounds, whose comparisons are False) fall back to a
    # per-IP scan that returns the first match in df2 row order.
    if not np.all(upper_sorted[:-1] < lower_sorted[1:]):
        results = []
        for ip in ips:
            matches = country[(lower <= ip) & (upper >= ip)]
            results.append(matches[0] if len(matches) else np.nan)
        return results

    # Index of the last range whose lower bound is <= ip (-1 if none).
    pos = np.searchsorted(lower_sorted, ips, side='right') - 1
    candidate = np.clip(pos, 0, None)  # keep indices valid for the lookups below

    # The candidate is a real match only if the IP is inside its bounds;
    # NaN IPs fail both comparisons and therefore stay unmatched.
    hit = (
        (pos >= 0)
        & (lower_sorted[candidate] <= ips)
        & (ips <= upper_sorted[candidate])
    )

    results = np.full(len(ips), np.nan, dtype=object)
    results[hit] = country_sorted[candidate[hit]]
    return results.tolist()


In [25]:
# Resolve a country (or NaN) for every transaction's IP address.
country_serie = mapping_ip_to_country(df1=fraud_data, df2=ip_to_country_data)

In [26]:
# Wrap the looked-up countries in a single-column frame named 'country'.
country_table = pd.DataFrame({'country': country_serie})

In [27]:
# Preview the first five looked-up countries.
country_table.head(n=5)

Unnamed: 0,country
0,Japan
1,United States
2,United States
3,
4,United States


In [28]:
# Append the country column to the transaction table (aligned on row index).
fraud_data_new = pd.concat(
    objs=[fraud_data, country_table],
    axis='columns',
)

In [29]:
# Preview the first five rows of the table with the country column added.
fraud_data_new.head(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States
