# Imports

In [55]:
import pandas as pd
import numpy as np
import os

# Graphing
import matplotlib.pyplot as plt
import seaborn as sns

# IP Address
import ipaddress as ip

# Encoding
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# Machine Learning
from sklearn.model_selection import train_test_split

# NLP
import texthero as hero
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [56]:
geoip2_df = pd.read_csv("Data/geoip2-ipv4.csv")

In [57]:
ipv4_df = pd.read_csv("Data/ipv4.csv")

In [58]:
ipv6_proxy_df = pd.read_csv("Data/IP2PROXY-IPV6.CSV")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
ipv4_proxy_df = pd.read_csv("Data/IP2PROXY-IPV4.CSV")

In [6]:
ipv4_proxy_df.shape

(1511846, 14)

In [7]:
ipv4_proxy_df.head()

Unnamed: 0,16778241,16778241.1,PUB,AU,Australia,Victoria,Melbourne,WirefreeBroadband Pty Ltd,wirefreebroadband.com.au,ISP,38803,WirefreeBroadband Pty Ltd.1,30,-
0,16778497,16778497,PUB,AU,Australia,Victoria,Melbourne,WirefreeBroadband Pty Ltd,wirefreebroadband.com.au,ISP,38803,WirefreeBroadband Pty Ltd,30,-
1,16783399,16783399,PUB,JP,Japan,Tokyo,Tokyo,I2TS Inc.,i2ts.com,DCH,-,-,30,-
2,16783571,16783571,PUB,JP,Japan,Tokyo,Tokyo,I2TS Inc.,i2ts.com,DCH,-,-,30,-
3,16804078,16804078,PUB,JP,Japan,Shimane,Izumo,Energia Communications Inc.,enecom.co.jp,ISP,18144,Energia Communications Inc.,30,-
4,16809988,16809988,PUB,TH,Thailand,Chiang Rai,Pa Daet,TOT Public Company Limited,tot.co.th,ISP/MOB,23969,TOT Public Company Limited,30,-


In [8]:
ip.IPv4Address(3758089480)

IPv4Address('223.255.229.8')

In [9]:
ipv6_proxy_df.shape

(1508772, 14)

In [10]:
ipv6_proxy_df.head()

Unnamed: 0,281470698521601,281470698521601.1,PUB,AU,Australia,Victoria,Melbourne,WirefreeBroadband Pty Ltd,wirefreebroadband.com.au,ISP,38803,WirefreeBroadband Pty Ltd.1,30,-
0,281470698521857,281470698521857,PUB,AU,Australia,Victoria,Melbourne,WirefreeBroadband Pty Ltd,wirefreebroadband.com.au,ISP,38803,WirefreeBroadband Pty Ltd,30,-
1,281470698526759,281470698526759,PUB,JP,Japan,Tokyo,Tokyo,I2TS Inc.,i2ts.com,DCH,-,-,30,-
2,281470698526931,281470698526931,PUB,JP,Japan,Tokyo,Tokyo,I2TS Inc.,i2ts.com,DCH,-,-,30,-
3,281470698547438,281470698547438,PUB,JP,Japan,Shimane,Izumo,Energia Communications Inc.,enecom.co.jp,ISP,18144,Energia Communications Inc.,30,-
4,281470698553348,281470698553348,PUB,TH,Thailand,Chiang Rai,Pa Daet,TOT Public Company Limited,tot.co.th,ISP/MOB,23969,TOT Public Company Limited,30,-


In [11]:
ip.IPv6Interface(281474439837645)

IPv6Interface('::ffff:dfff:f7cd/128')

In [12]:
geoip2_df.head()

Unnamed: 0,Network,geoname_id,continent_code,continent_name,country_iso_code,country_name,is_anonymous_proxy,is_satellite_provider
0,41.74.160.0/20,49518.0,AF,Africa,RW,Rwanda,False,False
1,41.77.160.0/22,49518.0,AF,Africa,RW,Rwanda,False,False
2,41.138.80.0/21,49518.0,AF,Africa,RW,Rwanda,False,False
3,41.186.0.0/16,49518.0,AF,Africa,RW,Rwanda,False,False
4,41.197.0.0/16,49518.0,AF,Africa,RW,Rwanda,False,False


# EDA

In [13]:
# Looking at column informaion/data types
ipv4_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172754 entries, 0 to 172753
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   IP_Networks         172754 non-null  object
 1   is_anonymous_proxy  172754 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 1.5+ MB


In [14]:
ipv4_df.describe().T

Unnamed: 0,count,unique,top,freq
IP_Networks,172754,172754,206.49.103.0/24,1
is_anonymous_proxy,172754,2,False,172497


In [15]:
ipv4_df["is_anonymous_proxy"] == "False"

0         False
1         False
2         False
3         False
4         False
          ...  
172749    False
172750    False
172751    False
172752    False
172753    False
Name: is_anonymous_proxy, Length: 172754, dtype: bool

# Univariate Anlysis

In [16]:
# Checking how many Ture and False values there are
ipv4_df["is_anonymous_proxy"].unique()

array([False,  True])

In [17]:
ipv4_df["is_anonymous_proxy"].value_counts(normalize=True) # There aren't that many using a proxy

False    0.998512
True     0.001488
Name: is_anonymous_proxy, dtype: float64

In [18]:
ipv4_df["is_anonymous_proxy"].value_counts()

False    172497
True        257
Name: is_anonymous_proxy, dtype: int64

# Cleaning Data

1. Remove and rename columns in ipv4 proxy

In [19]:
ipv4_proxy_df.columns

Index(['16778241', '16778241.1', 'PUB', 'AU', 'Australia', 'Victoria',
       'Melbourne', 'WirefreeBroadband Pty Ltd', 'wirefreebroadband.com.au',
       'ISP', '38803', 'WirefreeBroadband Pty Ltd.1', '30', '-'],
      dtype='object')

In [20]:
# Renaming the two columns that will be kept
ipv4_proxy_df.rename(columns={"16778241": "IP_Networks", "PUB": "is_anonymous_proxy"}, inplace=True)

In [21]:
# Removing all columns except IP_Networks and is_anonymous_proxy
ipv4_proxy_df = ipv4_proxy_df[["IP_Networks", "is_anonymous_proxy"]]

2. Change data types

In [22]:
# Changing data type on ipv4_df in the is_anonymous_proxy to a string
ipv4_df["is_anonymous_proxy"] = ipv4_df["is_anonymous_proxy"].astype(str)

In [23]:
# Changing data type to a string
ipv4_proxy_df["is_anonymous_proxy"] = ipv4_proxy_df["is_anonymous_proxy"].astype(str)

3. Change 2nd columns values in ipv4 proxy and convert first column to IP address

In [24]:
# Converting ipv4 proxy IP_Networks columns in IP addresses
ipv4_proxy_df["IP_Networks"] = ipv4_proxy_df["IP_Networks"].apply(lambda x: ip.IPv4Address(x).exploded)

In [25]:
# Changing the values of the is_anonymous_proxy to True
ipv4_proxy_df["is_anonymous_proxy"] = ipv4_proxy_df["is_anonymous_proxy"].replace(to_replace="PUB", value="True")

In [26]:
ipv4_proxy_df["is_anonymous_proxy"].unique()

array(['True'], dtype=object)

4. Change IP_Networks into IP Addresses

In [27]:
# Using a lambda function to get IP addresses from the specified column
ipv4_df["IP_Networks"] = ipv4_df["IP_Networks"].apply(lambda x: ip.ip_interface(x).ip.exploded)

In [28]:
ipv4_df.head()

Unnamed: 0,IP_Networks,is_anonymous_proxy
0,41.74.160.0,False
1,41.77.160.0,False
2,41.138.80.0,False
3,41.186.0.0,False
4,41.197.0.0,False


6. Combine ipv4 and ipv4 proxy datasets

In [29]:
# Perfoming a concat
ipv4_new_df = pd.concat(objs=[ipv4_df, ipv4_proxy_df], axis=0, ignore_index=True)

In [30]:
ipv4_new_df.head()

Unnamed: 0,IP_Networks,is_anonymous_proxy
0,41.74.160.0,False
1,41.77.160.0,False
2,41.138.80.0,False
3,41.186.0.0,False
4,41.197.0.0,False


In [31]:
ipv4_new_df.shape

(1684600, 2)

In [32]:
# Checking target variable classes
ipv4_new_df["is_anonymous_proxy"].value_counts(normalize=True)

True     0.897604
False    0.102396
Name: is_anonymous_proxy, dtype: float64

7. Downsample/re-sample dataset

In [33]:
# Getting rows that are equal to True and rows that are equal to Flase
ipv4_True = ipv4_new_df.loc[ipv4_new_df["is_anonymous_proxy"] == "True"]
ipv4_False = ipv4_new_df.loc[ipv4_new_df["is_anonymous_proxy"] == "False"]

In [34]:
# Randomly selecting 100_000 rows from each dataframe of True and False
ipv4_random_true = ipv4_True.sample(n=10_000, random_state=1)
ipv4_random_false = ipv4_False.sample(n=10_000, random_state=1)

In [35]:
# Combining True and False datasets
ipv4_final_df = pd.concat(objs=[ipv4_random_false, ipv4_random_true], join="inner", axis=0, ignore_index=True)

In [36]:
ipv4_final_df.duplicated().sum()

0

8. Removing decimal points and creating one number

In [37]:
# Replacing the period with an empty string to combine into one number
ipv4_final_df["IP_Networks"] = ipv4_final_df["IP_Networks"].apply(lambda x: x.replace(".", ""))

In [38]:
ipv4_final_df.head()

Unnamed: 0,IP_Networks,is_anonymous_proxy
0,90361800,False
1,2133887160,False
2,41209640,False
3,11981247128,False
4,19412020,False


10. Creating predictor and target variables and turning it into a numpy array

In [49]:
# Independent and Dependent Variable
X = ipv4_final_df[["IP_Networks"]]
y = ipv4_final_df[["is_anonymous_proxy"]]

In [50]:
# Turning X and y into numpy arrays
# X = X.to_numpy()
# y = y.to_numpy()

11. Train, Test, and Validation Split

In [51]:
# Train, test, and validation split on data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Creating validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [52]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((12000,), (4000,), (12000, 1), (4000, 1))

In [53]:
X_val.shape, y_val.shape

((4000,), (4000, 1))

11. Encoding

In [None]:
# Using LabelBinarizer for IP Addresses
lb = LabelBinarizer()  # Instantiating class
lb_ip = LabelBinarizer()

In [None]:
# Encoding dependent variable
y_train = lb.fit_transform(y_train)
y_test = lb.fit_transform(y_test)
y_val = lb.fit_transform(y_val)

NLP

In [None]:
# Using NLP
# X_train = lb_ip.fit_transform(X_train)
# X_test = lb_ip.fit_transform(X_test)
# X_val = lb_ip.fit_transform(X_val)

# Changing Dataframe into arrays
X_train = X_train.apply(lambda x: np.array(x))  # Changing list into an array of numbers
X_test = X_test.apply(lambda x: np.array(x))
X_val = X_val.apply(lambda x: np.array(x))

In [None]:
X_train.shape

(12000, 11997)