In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import requests
import seaborn as sns
import colorcet as cc
import time

In [None]:
# Parse the space-separated auth log into one row per login attempt.
raw_columns = ["month", "day", "time", "user", "ip", "port"]
df = pd.read_csv("logs.txt", sep=" ", names=raw_columns)

# The log lines carry no year, so a fixed "2023-" prefix is glued on before parsing.
timestamp_text = "2023-" + df["month"].astype(str) + "-" + df["day"].astype(str) + "T" + df["time"].astype(str)
df["dateTime"] = pd.to_datetime(timestamp_text)

# The three source columns are now redundant with dateTime.
df = df.drop(columns=["month", "day", "time"])

In [None]:
def get_location(ip, country, city):
    """Look up the country and city of one IP address and record them.

    Parameters
    ----------
    ip : str
        IP address to geolocate.
    country, city : dict
        Accumulators keyed by IP; both are updated in place.

    Returns
    -------
    tuple of (dict, dict)
        The same ``country`` and ``city`` objects that were passed in.

    Raises
    ------
    RuntimeError
        If the ``ABSTRACT_API_KEY`` environment variable is not set.
    requests.HTTPError
        If the service answers with an error status (e.g. rate limiting),
        instead of silently storing ``None`` for that IP.
    """
    import os  # local import: the notebook's import cell does not provide os

    # The key must not live in the notebook source; read it from the environment.
    api_key = os.environ.get("ABSTRACT_API_KEY")
    if not api_key:
        raise RuntimeError("Set the ABSTRACT_API_KEY environment variable before calling get_location")

    response = requests.get(
        "https://ipgeolocation.abstractapi.com/v1/",
        params={"api_key": api_key, "ip_address": ip},  # let requests do the URL encoding
        timeout=10,  # never hang the notebook on a stalled connection
    )
    response.raise_for_status()
    payload = response.json()

    country[ip] = payload.get("country")
    city[ip] = payload.get("city")
    return country, city

#ips = df.ip.unique()
#country = {}
#city = {}
#for ip in ips:
#    country, city = get_location(ip, country, city)
#    time.sleep(1.1)
    
#df["country"] = df.ip.map(country)
#df["city"] = df.ip.map(city)


In [None]:
#df.to_csv("ramka.csv", index=False)

In [None]:
# Reload the cached, geolocated log instead of querying the API again.
df = pd.read_csv("ramka.csv")

# Rows whose city is Nîmes get their country set to France
# (presumably the lookup returned a wrong or missing country -- TODO confirm).
is_nimes = df.city == "Nîmes"
df.loc[is_nimes, "country"] = "France"
print(df[is_nimes])

In [None]:
# Index the log by timestamp (the column is kept as well) so it can be binned by time.
df["dateTime"] = pd.to_datetime(df["dateTime"])
df = df.set_index("dateTime", drop=False)

# Count attempts in consecutive 60-minute bins and draw them as bars.
hourly_attacks = df["dateTime"].groupby(pd.Grouper(freq="60Min")).count()
hourly_attacks.plot(kind="bar", title="Liczba ataków w przedziałach czasowych (1h)")

In [None]:
# datetime(year, month, day, hour, minute, second)
# Attack count per country for each of 34 consecutive one-hour windows
# starting at 2023-03-27 09:00.
b = datetime(2023, 3, 27, 9, 0, 0)

window_labels = []
window_counts = []
for i in range(0, 34):
    window_start = b + timedelta(hours=i)
    window_end = window_start + timedelta(hours=1)
    # Half-open window [start, end): an attempt stamped exactly on the hour
    # belongs to exactly one window instead of being dropped by both.
    in_window = df[(df.dateTime >= window_start) & (df.dateTime < window_end)]
    window_labels.append(f"{window_start}")
    window_counts.append(in_window.groupby("country")["user"].count())

# Concatenate once instead of growing a frame inside the loop.
countries = pd.concat(
    window_counts, keys=window_labels, names=["fromDateTime", "country"]
).to_frame("attack_count")
display(countries)

In [None]:
# One dot per attempt: country on x, time on y, coloured by city.
# Rows with any missing value are left out of the plot.
palette = sns.color_palette(cc.glasbey, n_colors=45)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 10))
sns.stripplot(
    x="country",
    y="dateTime",
    hue="city",
    data=df.dropna(),
    palette=palette,
    alpha=0.5,
    ax=ax,
)
ax.legend(ncols=3)
ax.set_title("Time of attacks")

In [None]:
# Per-IP feature table, one row per distinct source IP in order of first appearance.
ips = df.ip.unique()

# Number of logged attempts per IP; mapping the Series directly replaces the
# hand-built lookup dict and yields NaN (rather than a KeyError) for a missing ip.
attacks_per_ip = df.groupby("ip").size()

ipDf = pd.DataFrame({"ip": ips})
ipDf["attackCount"] = ipDf.ip.map(attacks_per_ip)
display(ipDf.head())

In [None]:
# Number of distinct usernames each IP tried; mapped straight from the grouped
# Series so the cell no longer depends on the `ips` array of another cell.
users_per_ip = df.groupby("ip")["user"].nunique()
ipDf["usedUser"] = ipDf.ip.map(users_per_ip)
display(ipDf.head())

In [None]:
import tqdm

#passwords = pd.read_csv("1000000-password-seclists.txt", names=["password"])
#var = df.groupby("ip")["user"].unique()
#dictionary = {}
#for ip in tqdm.tqdm(ips):
#    suma = 0
#    for password in var[ip]:
#        if password in passwords.password.unique():
#            suma += 1
#    dictionary[ip] = suma
#ipDf["accuracy"] = ipDf.ip.map(dictionary)
#ipDf.accuracy /= ipDf.usedUser
#display(ipDf)

In [None]:
#ipDf.to_csv("ipDf.csv", index=False)

In [None]:
# Replace the in-memory per-IP table with the cached copy from ipDf.csv.
# NOTE(review): later cells read an `accuracy` column from this frame, so the
# cache presumably holds the output of the commented-out accuracy cell -- confirm.
ipDf = pd.read_csv("ipDf.csv")
display(ipDf)

In [None]:
# 3D view of the per-IP features: attempts, distinct usernames, accuracy.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
ax.scatter(ipDf["attackCount"], ipDf["usedUser"], ipDf["accuracy"])
ax.set_xlabel("attack count")
ax.set_ylabel("unique user")
ax.set_zlabel("accuracy (compared with file of the passwords)")
ax.set_box_aspect(aspect=None, zoom=0.8)
ax.set_title("Ips describe in 3D")

In [None]:
from sklearn.mixture import BayesianGaussianMixture

# Cluster on the three behaviour features by name. The previous positional
# slice (every column after the first) also picked up `cluster`, `dict` and
# `country` once those columns existed, so re-running the cell changed or broke
# the fit.
FEATURE_COLUMNS = ["attackCount", "usedUser", "accuracy"]

# Rows with missing values cannot be clustered; copy so the new column is
# written to an independent frame.
ipDf = ipDf.dropna().copy()

mixture = BayesianGaussianMixture(
    n_components=3,
    covariance_type='diag',
    n_init=100,
    weight_concentration_prior_type="dirichlet_distribution",
    init_params="random_from_data",
    random_state=0,  # fixed seed: later cells hard-code the meaning of labels 0/1/2
)
ipDf["cluster"] = mixture.fit_predict(ipDf[FEATURE_COLUMNS])

In [None]:
# Same 3D feature view as before, now coloured by the mixture cluster label.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
ax.scatter(ipDf["attackCount"], ipDf["usedUser"], ipDf["accuracy"], c=ipDf["cluster"])
ax.set_xlabel("attack count")
ax.set_ylabel("unique user")
ax.set_zlabel("accuracy (compared with file of the passwords)")
ax.set_box_aspect(aspect=None, zoom=0.8)
ax.set_title("Clustered ips")

In [None]:
# Split the per-IP table by cluster label: 0 -> grozni, 1 -> szkodliwi, 2 -> nieszkodliwi.
grozni, szkodliwi, nieszkodliwi = (ipDf[ipDf.cluster == label] for label in (0, 1, 2))

In [None]:
# Summary statistics of the three features for the "nieszkodliwi" group.
summary_rows = ["mean", "min", "max", "count"]
feature_cols = ["attackCount", "usedUser", "accuracy"]
nieszkodliwi.describe().loc[summary_rows, feature_cols]

In [None]:
# Summary statistics of the three features for the "szkodliwi" group.
summary_rows = ["mean", "min", "max", "count"]
feature_cols = ["attackCount", "usedUser", "accuracy"]
szkodliwi.describe().loc[summary_rows, feature_cols]

In [None]:
# Summary statistics of the three features for the "grozni" group.
summary_rows = ["mean", "min", "max", "count"]
feature_cols = ["attackCount", "usedUser", "accuracy"]
grozni.describe().loc[summary_rows, feature_cols]

In [None]:
# Attach each IP's cluster label to every log row from that IP.
# Mapping through an ip-indexed Series replaces the per-IP filter loop, which
# was quadratic and relied on int(<one-element Series>), deprecated in recent pandas.
cluster_by_ip = ipDf.drop_duplicates("ip").set_index("ip")["cluster"]

# IPs without a cluster (rows dropped before clustering) map to NaN.
df["cluster"] = df.ip.map(cluster_by_ip)
display(df.head())

In [None]:
# One row per distinct (ip, country, cluster) combination, renumbered from 0.
ip_profile = df[["ip", "country", "cluster"]]
var = ip_profile.drop_duplicates().reset_index(drop=True)

In [None]:
# Number of distinct IPs per country and behaviour cluster.
# Columns are pinned to labels 0/1/2 before renaming, so a cluster with no
# members becomes a zero column instead of a length-mismatch error.
CLUSTER_NAMES = {0: "grozni", 1: "szkodliwi", 2: "nieszkodliwi"}

ip_counts = var.groupby(["country", "cluster"])["ip"].count().unstack(fill_value=0)
data = ip_counts.reindex(columns=list(CLUSTER_NAMES), fill_value=0)
data.columns = list(CLUSTER_NAMES.values())
display(data)

In [None]:
# Grouped bar chart: for every country, one bar per behaviour cluster.
barWidth = 0.2
# plt.subplots returns (figure, axes); unpack both instead of binding the tuple to `fig`.
fig, ax = plt.subplots(figsize=(22, 5))

# Abbreviations used on the x axis for the longest country names.
SHORT_COUNTRY_NAMES = {"United Kingdom": "UK", "United States": "USA", "South Korea": "SK"}
# (column in `data`, bar colour), in left-to-right drawing order within a group.
bar_series = [("nieszkodliwi", "r"), ("szkodliwi", "g"), ("grozni", "b")]

# Positions come from `data` itself so they always match the plotted heights.
group_positions = np.arange(len(data.index))
for slot, (category, color) in enumerate(bar_series):
    ax.bar(group_positions + slot * barWidth, data[category], color=color,
           width=barWidth, edgecolor='grey', label=category)

ax.set_xlabel('Country')
ax.set_ylabel('Count')
# Centre the tick under the middle bar of each group.
ax.set_xticks(group_positions + barWidth)
ax.set_xticklabels([SHORT_COUNTRY_NAMES.get(name, name) for name in data.index])
ax.set_xlim([-0.5, len(data.index)])

ax.set_title("ip classification")
ax.legend()
plt.show()

In [None]:
# Number of log rows (attack attempts) attributed to each cluster.
attacks_per_cluster = df.groupby("cluster")["ip"].count()
display(attacks_per_cluster)

In [None]:
# Share of all attack attempts per cluster; labels and colours are positional,
# i.e. they follow the sorted cluster labels of the grouped counts.
colors = ["b", "g", "r"]
category_labels = ["grozni", "szkodliwi", "nieszkodliwi"]
attack_share = df.groupby("cluster")["ip"].count()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
ax.pie(attack_share, labels=category_labels, colors=["b", "g", "r"])
ax.set_title("Udział danej kategorii we wszystkich atakach")

In [None]:
from matplotlib.lines import Line2D

# One dot per attempt: country on x, time on y, coloured by behaviour cluster.
palette = ["blue", "green", "red"]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 10))
g = sns.stripplot(
    x="country",
    y="dateTime",
    hue="cluster",
    data=df.dropna(),
    palette=palette,
    alpha=0.5,
    ax=ax,
)

# Hand-built legend: (label, marker face colour) per cluster, in palette order.
legend_spec = [("Grozni", "b"), ("szkodliwi", "g"), ("nieszkodliwi", "r")]
legend_elements = [
    Line2D([0], [0], marker='o', color='w', label=label, markerfacecolor=face, markersize=15)
    for label, face in legend_spec
]
ax.legend(handles=legend_elements, ncols=3)
ax.set_title("Time of attacks")

In [None]:
# For every IP (grouped, hence in sorted-ip order) collect the array of distinct
# usernames it tried; `words` holds one such array per row in a "set" column.
var = df.groupby(["ip"])["user"].unique()
docs = list(var)
words = var.reset_index(drop=True).to_frame(name="set")

In [None]:
import Levenshtein

# Pairwise dissimilarity between the username sets of two IPs:
# for each name of set A take the edit distance to its closest name in set B,
# then average over A. Column `index1` of `words` receives the distances from
# row `index1` to every row; an exact 0 is stored as NaN.
user_sets = list(words["set"])
for index1, names_a in zip(words.index, user_sets):
    values = []
    for names_b in user_sets:
        suma = sum(
            min(Levenshtein.distance(str(word1), str(word2)) for word2 in names_b)
            for word1 in names_a
        )
        suma /= len(names_a)
        values.append(np.nan if suma == 0 else suma)
    words[index1] = values

In [None]:
# Zero-distance entries were stored as NaN; turn them back into 0.
words = words.fillna(0)

# Extra feature: base-3 logarithm of the number of usernames each IP tried.
words["sizeD"] = [np.emath.logn(3, len(names)) for names in words.set]

# Min-max scale every column after "set" to the [0, 1] range.
numeric_part = words.iloc[:, 1:]
col_min = numeric_part.min()
col_max = numeric_part.max()
words.iloc[:, 1:] = (numeric_part - col_min) / (col_max - col_min)

In [75]:
from sklearn.cluster import SpectralClustering

# Features are every column except the raw username arrays and, on a re-run,
# the previous labels -- the positional slice used to feed old labels back in.
# A plain float array is passed because the frame mixes integer and string
# column names, which newer scikit-learn versions refuse as feature names.
spectral_features = words.drop(columns=["set", "cluster"], errors="ignore").to_numpy(dtype=float)

words["cluster"] = SpectralClustering(
    n_clusters=6, assign_labels='cluster_qr', random_state=0
).fit_predict(spectral_features)




In [None]:
# dicts[i]: the distinct usernames tried by any IP assigned to word-cluster i.
dicts = [
    list(set([x for elem in words[words.cluster == i].set for x in elem]))
    for i in range(6)
]

In [None]:
# Preview the first rows of the columns at positions 1..81 (everything after "set").
# NOTE(review): the hard-coded 82 presumably stops just before the "cluster"
# column, which only holds for exactly 80 IPs -- confirm.
display(words.iloc[:,1:82].head())

In [None]:
from sklearn.manifold import TSNE
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Embed the per-IP username-distance features in 3D to eyeball the word clusters.
tsne = TSNE(n_components=3, perplexity=10, init="pca", learning_rate="auto", random_state=0)

# Select features by name rather than the hard-coded 1:82 slice (which matches
# only when there are exactly 80 IPs), and pass a float array because the mixed
# int/str column names are rejected by newer scikit-learn versions.
tsne_features = words.drop(columns=["set", "cluster"]).to_numpy(dtype=float)
tsnePlot = tsne.fit_transform(tsne_features)

fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(projection='3d')
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer matplotlib no longer provides.
p = ax.scatter(xs=tsnePlot[:,0], ys=tsnePlot[:,1], zs=tsnePlot[:,2],
               c=words.cluster, cmap=plt.get_cmap('turbo', 6))
ax.set_title("Visualize dict clusters")
fig.colorbar(p)

In [None]:
# Attach the word-cluster ("dict") label of each IP to the per-IP table.
# The rows of `words` were built from df.groupby("ip"), i.e. in sorted-ip order,
# whereas `ips` is in order of first appearance; pairing the two by position
# assigned labels to the wrong IPs. Rebuild the matching order explicitly.
ip_order = df.groupby("ip")["user"].unique().index
if len(ip_order) != len(words):
    raise ValueError(
        f"words has {len(words)} rows but the log contains {len(ip_order)} distinct IPs"
    )

dictionary = dict(zip(ip_order, words["cluster"]))
ipDf["dict"] = ipDf.ip.map(dictionary)

In [None]:
# Number of IPs per (behaviour cluster, word-cluster "dict") pair.
# Rows are pinned to cluster labels 0/1/2 and columns to dict labels 0..5, so an
# empty combination shows up as zeros instead of breaking the positional
# relabelling here or the column-by-position plotting that follows.
CLUSTER_NAMES = {0: "grozni", 1: "szkodliwi", 2: "nieszkodliwi"}
N_DICTS = 6

pair_counts = ipDf.groupby(["cluster", "dict"])["ip"].count().unstack(fill_value=0)
data = pair_counts.reindex(index=list(CLUSTER_NAMES), columns=range(N_DICTS), fill_value=0)
data.index = list(CLUSTER_NAMES.values())
display(data)

In [None]:
from matplotlib.ticker import MaxNLocator

# Grouped bars: for each behaviour cluster, how many IPs used each of the 6 dicts.
barWidth = 0.1
colors = ["red","green","blue","magenta","orange","gray"]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))

# Six sets of x positions, each shifted one bar width to the right of the previous.
br1 = np.arange(3)
br = [br1]
for _ in range(5):
    br.append([x + barWidth for x in br[-1]])

# Rows of `data` are read bottom-to-top so the groups match the tick labels below.
for i, (positions, color) in enumerate(zip(br, colors)):
    ax.bar(positions, data.iloc[::-1, i], color=color, width=barWidth,
           edgecolor='grey', label=f'dict {i}')

ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_xlabel("Attack type")
ax.set_ylabel("ip number")
ax.set_title("Typ ataku, a używany słownik")
plt.xticks([r + barWidth*2.5 for r in range(len(br1))], ["nieszkodliwi","szkodliwi","grozni"])
plt.xlim([-0.5,3])
plt.legend()

In [None]:
# Show every username collected for word-cluster 0.
dicts[0]

In [None]:
# Show every username collected for word-cluster 1.
dicts[1]

In [None]:
# Word-cluster 2: print only the first ten usernames, then the total count.
print(dicts[2][:10])
print(len(dicts[2]))

In [None]:
# Word-cluster 3: print only the first ten usernames, then the total count.
print(dicts[3][:10])
print(len(dicts[3]))

In [None]:
# Word-cluster 4: print only the first ten usernames, then the total count.
print(dicts[4][:10])
print(len(dicts[4]))

In [None]:
# Show every username collected for word-cluster 5.
dicts[5]

In [None]:
# How many usernames of word-cluster 2 do not occur in word-cluster 3.
sum(1 for x in dicts[2] if x not in dicts[3])

In [None]:
# How many usernames of word-cluster 4 do not occur in word-cluster 3.
sum(1 for x in dicts[4] if x not in dicts[3])

In [None]:
# How many usernames of word-cluster 2 do not occur in word-cluster 4.
sum(1 for x in dicts[2] if x not in dicts[4])

In [None]:
# Look up each IP's country from the log; if an IP appears with several
# countries, the last distinct (country, ip) pair wins.
country_ip_pairs = df[["country", "ip"]].drop_duplicates()
dictionary = dict(zip(country_ip_pairs["ip"], country_ip_pairs["country"]))
ipDf["country"] = ipDf.ip.map(dictionary)

# Number of IPs per (country, word-cluster "dict") pair.
ips_per_country_dict = ipDf.groupby(["country", "dict"])["ip"].count()
data = ips_per_country_dict.unstack(fill_value=0)
display(data)

In [None]:
import random
from itertools import compress

# One pie per word-cluster ("dict"): which countries its IPs come from.
barWidth = 0.2
fig, ax = plt.subplots(2, 3, figsize=(20, 10))

# One reproducible random colour per country, so a country keeps the same
# colour in every pie.
random.seed(0)
random_colors = ["#"+''.join([random.choice('ABCDEF0123456789') for i in range(6)]) for j in range(len(data))]

# Titles for dict 0, 1 and 5 are hard-coded descriptions of the cluster content;
# they are not recomputed from `dicts`.
titles = [
    "wqmarlduiqkmgs"+"  dict 0",
    "['sap_adm01', 'Admin', 'centos', 'temp1', 'admin']"+"  dict 1",
    "len: "+str(len(dicts[2]))+"  dict 2",
    "len: "+str(len(dicts[3]))+"  dict 3",
    "len: "+str(len(dicts[4]))+"  dict 4",
    "['ubnt', 'root']"+"  dict 5",
]

# A single loop over the six panels (row-major) replaces six copy-pasted calls.
for k, (panel, title) in enumerate(zip(ax.flat, titles)):
    shares = data.iloc[:, k]
    present = (shares != 0).to_numpy()  # countries with no IP in this dict are skipped
    panel.pie(shares[present], labels=data.index[present],
              colors=list(compress(random_colors, present)))
    panel.set_title(title)

In [None]:
# Show the per-IP rows whose word-cluster ("dict") label is 5.
display(ipDf[ipDf.dict==5])

In [None]:
# For log rows located in France: number of rows with a non-null city per cluster.
df[df.country=="France"].groupby(["country","cluster"])["city"].count()

In [None]:
# All log rows whose attempted username is exactly "kiosk".
df[df.user=="kiosk"]

In [None]:
# Show every username collected for word-cluster 4.
dicts[4]