In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import geopandas as gpd
import random
import ipaddress

In [2]:
# Notebook-wide display settings: print NumPy floats without scientific
# notation and let pandas render every column of wide frames.
np.set_printoptions(suppress=True)
pd.options.display.max_columns = None

In [3]:
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\fraud_detection_01\\notebooks'

In [4]:
# Move from the notebooks/ folder up to the project root so that the relative
# ./data/... paths used below resolve.
# Guarded on the folder name so that re-running this cell (without restarting
# the kernel) does not climb one more directory each time.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("..")

In [5]:
# Load the cleaned client table and the district reference table
# (paths are relative to the current working directory).
clients = pd.read_csv("./data/cleaned_data/client_df_clean.csv")
district_ru = pd.read_csv("./data/cleaned_data/district_ru.csv")

# Генерация IP-адресов для клиентов
IP-адреса понадобятся для онлайн-покупок и, возможно, переводов.

In [6]:
# Attach the district-level columns of district_ru to every client row by
# matching clients.district_id against district_ru.district_code.
# validate="many_to_one" makes pandas raise MergeError if a district code occurs
# more than once in district_ru; without it such duplicates would silently
# multiply client rows and break the one-IP-per-client assignment done later.
# NOTE(review): this is an inner join, so clients whose district_id has no
# match in district_ru are dropped -- confirm that is intended.
clients_with_geo = (
    clients
    .merge(
        district_ru,
        left_on="district_id",
        right_on="district_code",
        validate="many_to_one",
    )
    # district_code duplicates district_id after the join; "clients" is not used further here
    .drop(columns=["district_code", "clients"])
)
clients_with_geo.head()

Unnamed: 0,client_id,district_id,birth_date,sex,region,area,timezone,lat,lon,population,geometry
0,1,18,1970-12-13,female,Рязанская,Рязань,UTC+3,54.625457,39.735999,525062,"POLYGON ((39.5366736 54.6385048, 39.5367682 54..."
1,2,1,1945-02-04,male,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542..."
2,3,1,1940-10-09,female,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542..."
3,4,5,1956-12-01,male,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47..."
4,5,5,1960-07-03,female,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47..."


In [7]:
# Distinct values of the "area" column (the clients' cities), wrapped in a
# Series with a fresh default integer index.
ru_77_cities_series = pd.Series(clients_with_geo["area"].unique())
ru_77_cities_series

0               Рязань
1               Москва
2       Ростов-на-Дону
3         Петрозаводск
4            Ульяновск
            ...       
72         Стерлитамак
73    Набережные Челны
74           Астрахань
75            Владимир
76         Новокузнецк
Length: 77, dtype: object

In [8]:
# Read the raw list of IP ranges. header=None: the first line of the file is
# treated as data, not as column names, so the column gets named separately.
ru_ip_ranges = pd.read_csv("./data/raw_data/geo/Russia_and_Belarus_IP_ranges", header=None)

In [9]:
# Give the single, header-less column a meaningful name.
ru_ip_ranges.columns = ["range"]

In [10]:
# Remove leading/trailing whitespace from every range string before it is parsed as a network.
ru_ip_ranges["range"] = ru_ip_ranges["range"].str.strip()

In [11]:
ru_ip_ranges.shape

(22508, 1)

In [12]:
ru_ip_ranges.head()

Unnamed: 0,range
0,2.60.0.0/19
1,2.60.32.0/19
2,2.60.64.0/19
3,2.60.96.0/19
4,2.60.128.0/19


In [13]:
# Keep only the first 50 ranges (as an independent copy); they are the ones
# expanded into individual addresses further down.
ru_ip_ranges_50 = ru_ip_ranges.iloc[:50].copy()
ru_ip_ranges_50.shape

(50, 1)

In [14]:
# Helper that expands an IP range into the individual addresses it contains

def get_ips_from_range(ip_range):
    """Expand a network given in CIDR notation into its host addresses.

    Parameters
    ----------
    ip_range : str
        Network specification accepted by ``ipaddress.ip_network``,
        e.g. ``"2.60.0.0/19"``.

    Returns
    -------
    pandas.Series
        One string per address yielded by ``network.hosts()``, in the order
        the standard library produces them (for ordinary IPv4 prefixes this
        leaves out the network and broadcast addresses).

    Raises
    ------
    ValueError
        Propagated from ``ipaddress.ip_network`` when ``ip_range`` is not a
        valid network specification.
    """
    host_addresses = ipaddress.ip_network(ip_range).hosts()
    return pd.Series([str(address) for address in host_addresses])

In [15]:
# Expand every selected range with get_ips_from_range and stack the results
# into one long Series with a continuous 0..n-1 index

unpacked_ips_glob = pd.concat(
    list(map(get_ips_from_range, ru_ip_ranges_50["range"])),
    ignore_index=True,
)
unpacked_ips_glob.nunique()

606876

In [16]:
unpacked_ips_glob.tail()

606871    5.8.31.250
606872    5.8.31.251
606873    5.8.31.252
606874    5.8.31.253
606875    5.8.31.254
dtype: object

## Присвоение IP-адресов клиентам и потенциальным мошенникам

In [17]:
clients_with_geo.head()

Unnamed: 0,client_id,district_id,birth_date,sex,region,area,timezone,lat,lon,population,geometry
0,1,18,1970-12-13,female,Рязанская,Рязань,UTC+3,54.625457,39.735999,525062,"POLYGON ((39.5366736 54.6385048, 39.5367682 54..."
1,2,1,1945-02-04,male,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542..."
2,3,1,1940-10-09,female,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542..."
3,4,5,1956-12-01,male,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47..."
4,5,5,1960-07-03,female,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47..."


In [18]:
# Number of client rows, i.e. how many home IP addresses have to be handed out.
clients_rows = len(clients_with_geo)
clients_rows

5369

In [19]:
# Give every client one home IP, taken in order from the start of the pool.
# Fail loudly if the pool is too small instead of leaving some clients with NaN.
assert clients_rows <= len(unpacked_ips_glob), "not enough generated IP addresses for all clients"
# .to_numpy() makes the assignment purely positional, so the result does not
# depend on clients_with_geo happening to carry a default 0..n-1 index.
clients_with_geo["home_ip"] = unpacked_ips_glob.iloc[:clients_rows].to_numpy()
clients_with_geo.head()

Unnamed: 0,client_id,district_id,birth_date,sex,region,area,timezone,lat,lon,population,geometry,home_ip
0,1,18,1970-12-13,female,Рязанская,Рязань,UTC+3,54.625457,39.735999,525062,"POLYGON ((39.5366736 54.6385048, 39.5367682 54...",2.60.0.1
1,2,1,1945-02-04,male,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542...",2.60.0.2
2,3,1,1940-10-09,female,Москва,Москва,UTC+3,55.753879,37.620373,11514330,"MULTIPOLYGON (((37.290502 55.8019897, 37.29542...",2.60.0.3
3,4,5,1956-12-01,male,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47...",2.60.0.4
4,5,5,1960-07-03,female,Ростовская,Ростов-на-Дону,UTC+3,47.222436,39.718787,1091544,"POLYGON ((39.4709911 47.2048727, 39.4838092 47...",2.60.0.5


### Выгрузка clients_with_geo в csv

In [20]:
# this data will be needed in other notebooks; the index is not written to the file

clients_with_geo.to_csv("./data/cleaned_data/clients_with_geo.csv", index=False)

In [21]:
ru_77_cities_series.head()

0            Рязань
1            Москва
2    Ростов-на-Дону
3      Петрозаводск
4         Ульяновск
dtype: object

In [22]:
# Repeat the list of cities 100 times: one row per fraudulent/suspicious IP
# address that is going to be added (7,700 rows for the 77 cities)

online_fraud_cities = pd.concat([ru_77_cities_series] * 100, ignore_index=True)
print(online_fraud_cities.shape)
online_fraud_cities.tail()

(7700,)


7695         Стерлитамак
7696    Набережные Челны
7697           Астрахань
7698            Владимир
7699         Новокузнецк
dtype: object

In [23]:
# Take one IP per row of online_fraud_cities from the END of the pool.
# Client home IPs were taken from the START of the same pool, so the two sets
# stay disjoint as long as the pool is long enough -- checked explicitly here
# rather than assumed.
assert len(clients_with_geo) + len(online_fraud_cities) <= len(unpacked_ips_glob), (
    "IP pool too small: fraud IPs would overlap with client home IPs"
)
# .tail(n) returns the last n rows in their original order; the index is reset
# so it lines up with online_fraud_cities (0..n-1).
fraud_ip_ser = unpacked_ips_glob.tail(len(online_fraud_cities)).reset_index(drop=True)
fraud_ip_ser

0       5.3.252.223
1       5.3.252.224
2       5.3.252.225
3       5.3.252.226
4       5.3.252.227
           ...     
7695     5.8.31.250
7696     5.8.31.251
7697     5.8.31.252
7698     5.8.31.253
7699     5.8.31.254
Length: 7700, dtype: object

In [24]:
# Put city names and IP addresses side by side; both Series share the same
# 0..n-1 index, so rows are paired by position

fraud_ips = pd.concat(
    [online_fraud_cities, fraud_ip_ser],
    axis=1,
    keys=["area", "fraud_ip"],
)
print(fraud_ips.shape)
fraud_ips.tail()

(7700, 2)


Unnamed: 0,area,fraud_ip
7695,Стерлитамак,5.8.31.250
7696,Набережные Челны,5.8.31.251
7697,Астрахань,5.8.31.252
7698,Владимир,5.8.31.253
7699,Новокузнецк,5.8.31.254


### Выгрузка fraud_ips в csv

In [25]:
# Save the city / fraud-IP pairs to CSV; the index is not written to the file.
fraud_ips.to_csv("./data/cleaned_data/fraud_ips.csv", index=False)