# IT Academy - Data Science amb Python
## Tasca 7: Estructura de control

###  [Github Registre de Logs](https://github.com/jesussantana/Registre_de_logs)

###  Exercici 1
- Normalitza, identifica i enumera cada un dels atributs / variables de l'estructura de l'arxiu "Web_access_log-akumenius.com" que trobaràs al repositori de GitHub "Data-sources".

In [None]:
import numpy as np
import pandas as pd
import itertools
import requests
import pickle
import json
import time
import re
import io 

import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from datetime import datetime
from datetime import timedelta
from joblib import Parallel, delayed
from pandas import json_normalize
from shapely.geometry import Point, Polygon
#from data_cleaner import DataCleaner

pd.set_option('display.max_columns', None)

- We load the data to check how it is distributed across the columns
  - The column names that we will use:
    - 'DNS','ip','Date','Time','Request','Status','Size','Referer','UserAgent'

In [None]:
# Location of the raw access log, relative to this notebook.
path = '../Data/Web_access_log-akumenius.com.txt'

# The separator is a regular expression (hence engine='python'): a line is cut
# wherever a whitespace character is followed by a space, at " - ", or at a
# space followed by a double quote. It is written as a raw string so that
# `\s` and `\-` reach the regex engine untouched instead of being treated as
# (invalid) Python string escapes.
Logs_raw = pd.read_csv(
    path,
    sep=r'\s | \- | "',
    names=['DNS', 'ip', 'Date', 'Time', 'Request', 'Status', 'Size', 'Referer', 'UserAgent'],
    engine='python',
)

# Work on a copy so the raw parse stays available for comparison.
Logs_copy = Logs_raw.copy()

Logs_copy.head()

###  Exercici 2
- Neteja, preprocesa, estructura i transforma (dataframe) les dades del registre d'Accés a la web.

- Check rows and columns

In [None]:
Logs_copy.tail()

In [None]:
Logs_copy.shape

- Check for null data

In [None]:
Logs_copy.isnull().sum()

In [None]:
Logs_copy.head()

In [None]:
Logs_copy.tail()

- Reorder columns of data

In [None]:
# Shift the parsed fields into the columns they belong to. The pairs are
# applied strictly in this order because each source column is overwritten by
# a later step.
for _target, _source in (
    ('UserAgent', 'Request'),
    ('Request', 'Date'),
    ('Referer', 'Time'),
    ('Date', 'ip'),
):
    Logs_copy[_target] = Logs_copy[_source]

# Time and ip are filled in by later cells; blank them for now.
Logs_copy['Time'] = np.nan
Logs_copy['ip'] = np.nan

Logs_copy.head()

- DNS & IP data split

In [None]:
# The DNS column still holds "<host> <ip>"; split on whitespace and keep the
# first two pieces. Raw string so `\s` is a regex class, not a string escape.
Logs_copy[['DNS','ip']] = Logs_copy.DNS.str.split(r'\s', expand = True).get([0, 1])

In [None]:
Logs_copy.head()

- Check how many different IPs exist

In [None]:
Logs_copy.DNS.unique()

In [None]:
len(Logs_copy.ip.unique())

- Time data extraction

In [None]:
# Capture everything from HH:MM:SS up to the closing bracket of the timestamp
# held in Date. Raw string so the `\d` classes are not read as string escapes.
Logs_copy.Time = Logs_copy.Date.str.extract(r':(\d{2}:\d{2}:\d{2}.*)\]')

Logs_copy.head()

- Date data extraction

In [None]:
# Keep only the dd/Mon/yyyy part of the timestamp (raw-string regex).
Logs_copy.Date = Logs_copy.Date.str.extract(r'(\d+/\w+/\d+)')

# Convert the extracted text to datetime64 using the day/abbreviated-month/year layout.
Logs_copy.Date = pd.to_datetime(Logs_copy.Date, format = '%d/%b/%Y')

In [None]:
Logs_copy.head()

- Request & Status data split

In [None]:
# Cut the Request text at double quotes: piece 0 becomes the request line and
# piece 1 becomes the raw status text.
_request_pieces = Logs_copy.Request.str.split('"', expand = True)
Logs_copy[['Request','Status']] = _request_pieces.get([0, 1])

In [None]:
Logs_copy.tail()

In [None]:
Logs_copy.head()

- Size data extraction

In [None]:
# Size is the run of digits at the very end of the raw status text
# (raw-string regex so `\d` is not read as a string escape).
Logs_copy.Size = Logs_copy.Status.str.extract(r'(\d+$)')
Logs_copy

- Clean Size data

In [None]:
%%time

Logs_copy.Size = Logs_copy.Size.apply(lambda x: (np.nan if x == '200' else x))

In [None]:
Logs_copy

- Status data extraction

In [None]:
# Status is the first three-digit run in the raw status text (raw-string regex).
Logs_copy.Status = Logs_copy.Status.str.extract(r'(\d{3})')
Logs_copy.tail()

- Clean Referer data

In [None]:
%%time

Logs_copy.Referer = Logs_copy.Referer.apply(lambda x: (np.nan if re.search('-"', x) else x.rstrip(x[-1])))

In [None]:
Logs_copy

###  Exercici 3
- Geolocalitza les IP's. Aqui tens una pagina de interes:
  - [freegeoip](https://freegeoip.app/)

- We export the IPs to a file as a backup

In [None]:
# `replace` returns a new Series, so Logs_copy.ip itself is left untouched.
# Empty strings are written out as the text 'null'.
Ips_export = Logs_copy.ip.replace('', 'null')

Ips_export.to_csv('../Data/Ips_export.csv', index = False)

- Ips file recovery

In [None]:
# Re-read the IP list written by the export cell.
path = '../Data/Ips_export.csv'

# With engine='python' a multi-character `sep` is treated as a regex, so the
# file is only split where the literal text "delimiter" occurs.
# NOTE(review): presumably chosen so that each line is read as one single
# column — confirm; the default separator may be enough for this file.
Ips_raw = pd.read_csv(path, sep= 'delimiter', engine='python')

Ips_raw.head()

In [None]:
Ips_raw.tail()

- Build a DataFrame with the unique IPs to be used and check it

In [None]:
# One row per distinct IP. Missing values are dropped first: NaN is not an
# address and must not be sent to the geolocation service.
ips_unique = pd.DataFrame({"ip": Ips_raw.ip.dropna().unique()})

In [None]:
len(ips_unique)

In [None]:
ips_unique

- Function to extract information from freegeoip

In [None]:
# Disabled code: the whole cell is one string literal, so nothing in it runs.
# It sketches an `extract_info(ip)` helper that requests `url + ip`, appends
# the eval'd response text to the global `geolocation` list and returns the
# result of `list.append` (always None), or NaN when any exception is raised.
# NOTE(review): if this is re-enabled, avoid `eval` on a network response and
# avoid the bare `except`.
'''localhost = "127.0.0.1"
geolocation = []
url = "https://freegeoip.app/json/"
headers = {
    'accept': "application/json",
    'content-type': "application/json"
    }

def extract_info(ip):

    try:
        response = requests.request("GET", url + ip)
        return geolocation.append(eval(response.text))

    except:
        return np.nan'''

- Ips Information Extraction

In [None]:
# Disabled code (string literal, never executed): would run `extract_info`
# over every entry of `ips_unique.ip` through joblib with 8 jobs and the
# "multiprocessing" backend, rebinding `geolocation` to the list of results.
# NOTE(review): `extract_info` as sketched returns None or NaN, so the
# rebound list would not hold the geolocation records — confirm before reuse.
'''%%time

geolocation = Parallel(n_jobs = 8, backend = "multiprocessing")(map(delayed(extract_info), ips_unique.ip))'''

- joblib.Parallel uses the selected backend (here `multiprocessing`) to start worker processes, which execute the tasks simultaneously on separate CPUs.
- Geolocating the 2921 unique IPs took less than 50 seconds this way, using 4 cores and 8 threads, which is much faster than running the requests one by one.

In [None]:
#%%time
localhost = "127.0.0.1"
url = "https://freegeoip.app/json/"

# One dict of geolocation fields per successfully resolved IP.
geolocation = []
# IPs whose lookup failed, kept so failures are visible instead of silent.
geolocation_failures = []

for ip in ips_unique.ip:
    # Missing IPs are NaN floats, not strings; there is nothing to look up.
    if not isinstance(ip, str):
        continue
    try:
        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.get(url + ip, timeout = 10)
        # Turn HTTP error pages (rate limit, not found, ...) into exceptions.
        response.raise_for_status()
        # Parse the body as JSON. The previous `eval(response.text)` executed
        # whatever the remote server sent back as Python code.
        geolocation.append(response.json())
    except (requests.RequestException, ValueError):
        # Network failure, HTTP error or a body that is not valid JSON.
        geolocation_failures.append(ip)

print(f'{len(geolocation)} IPs geolocated, {len(geolocation_failures)} failed')

In [None]:
# `geolocation` is a list with one dict per IP: each dict becomes a row and
# each key becomes a column.
geolocation_df = pd.DataFrame(geolocation)

- check the data obtained

In [None]:
geolocation_df

Export Geolocations File

In [None]:
# `replace` builds a new DataFrame, so geolocation_df itself is not modified.
# Empty strings are written out as the text 'null'.
geolocation_df_export = geolocation_df.replace('', 'null')

geolocation_df_export.to_csv('geolocation_df_export.csv', index = False)

In [None]:
# Reload the geolocation table from the CSV written by the export cell.
path = 'geolocation_df_export.csv'

geolocation_df = pd.read_csv(
    path,
    sep = ',',
    engine = 'python',
)

geolocation_df.head()

In [None]:
geolocation_df.info()

- Do a merge by the column of IPs

In [None]:
# Attach the geolocation fields to every log row through the shared 'ip'
# column. The outer join keeps rows that appear on either side only.
Logs_copy2 = pd.merge(Logs_copy, geolocation_df, how='outer', on='ip')

- Check the result

In [None]:
Logs_copy2.tail()

In [None]:
Logs_copy2.metro_code.unique()

###  Exercici 4
- Mostreu-me la teva creativitat, Sorprèn-me fes un pas més enllà amb el analysis anterior.

In progress ...

- Extract UserAgent Data

In [None]:
%%time

from device_detector import SoftwareDetector

Devices = Logs_copy2.UserAgent

device = Devices.apply(lambda x: SoftwareDetector(x).parse())

print(device)

- Assign the values

In [None]:
%%time

Logs_copy2['Client_Name'] = device.apply(lambda x: x.client_name())
Logs_copy2['Client_Type'] = device.apply(lambda x: x.client_type())
Logs_copy2['Client_Version'] = device.apply(lambda x: x.client_version())
Logs_copy2['Os_Name'] = device.apply(lambda x: x.os_name())
Logs_copy2['Os_Version'] = device.apply(lambda x: x.os_version())
Logs_copy2['Engine'] = device.apply(lambda x: x.engine())
Logs_copy2['Device_Brand_Name'] = device.apply(lambda x: x.device_brand_name())
Logs_copy2['Device_Model'] = device.apply(lambda x: x.device_model())
Logs_copy2['Device_Type'] = device.apply(lambda x: x.device_type())



- Rename & Reorder columns

In [None]:
# Upper-case every column label through the Index string accessor.
Logs_copy2.columns = Logs_copy2.columns.str.upper()
Logs_copy2.columns

- Check the result

In [None]:
Logs_copy2.tail()

In [None]:
Logs_copy2.USERAGENT[0]

- The column from which we have extracted the data is deleted

In [None]:
# The raw User-Agent text has been expanded into separate columns, so remove it in place.
Logs_copy2.drop(columns='USERAGENT', inplace=True)

Logs_copy2.tail()

- Clean Client Data

In [None]:
%%time

Logs_copy2.CLIENT_NAME = Logs_copy2.CLIENT_NAME.apply(lambda x: (x[0:6] if re.search('Apache', x) else x))

In [None]:
Logs_copy2

- Visualize the data in progress

In [None]:
# Creates a new 14x7 inch matplotlib figure; nothing is drawn on it in this cell.
# NOTE(review): the plotting cells below pass their own figsize, so this
# figure is presumably unused — confirm and remove.
plt.figure(figsize = (14,7))

In [None]:

# Requests per country, computed once. Countries with 2000 requests or fewer
# are turned into NaN by `where` before the pie chart is drawn.
_country_counts = Logs_copy2.COUNTRY_NAME.value_counts(normalize=False)
_country_counts.where(_country_counts > 2000).plot(kind = 'pie', figsize = (14,7))

In [None]:
# Number of rows with a known CITY for each DNS name, largest first.
_city_rows_per_dns = Logs_copy2[['DNS', 'CITY']].groupby(['DNS']).count()
graph = (
    _city_rows_per_dns
    .sort_values(by = 'CITY', ascending = False)
    .rename(columns = {'CITY' : 'Frequency'})
)
graph.plot.bar(y = 'Frequency', color = 'b', ylabel = 'Frequency', legend = None, figsize = (14,7))
plt.show()

In [None]:
sns.set(rc={"figure.figsize": (20, 10)})
# `displot` is a figure-level function: it builds its own figure and takes no
# `ax=` argument (the `ax` passed before was not defined at this point either).
# Its size is set with height/aspect, here 10 in tall and 20 in wide.
sns.displot(data = Logs_copy2, x = "DEVICE_TYPE", hue = "OS_NAME", multiple = "stack", height = 10, aspect = 2)

In [None]:
# Rebuild one timestamp per log row from DATE (datetime64) and the leading
# HH:MM:SS of TIME (text such as "HH:MM:SS +zzzz"). The columns were
# upper-cased earlier, so the lower-case 'time' column does not exist. Rows
# with a missing date or time become NaT.
_log_timestamp = pd.to_datetime(
    Logs_copy2['DATE'].dt.strftime('%Y-%m-%d') + ' ' + Logs_copy2['TIME'].str[:8],
    format = '%Y-%m-%d %H:%M:%S',
    errors = 'coerce',
)

# `times` is created here; it was previously used without being defined.
# NOTE(review): 'browser' is assumed to be the parsed client name, because a
# later cell groups `times` by a 'browser' column — confirm.
times = pd.DataFrame({'browser': Logs_copy2['CLIENT_NAME']})
# day_name() avoids a hand-written map; pandas numbers weekdays from Monday = 0,
# so the previous map (0 -> 'Sunday') was shifted by one day.
times['weekday'] = _log_timestamp.dt.day_name()
times['month'] = _log_timestamp.dt.month
times['monthday'] = _log_timestamp.dt.day
times['yearday'] = _log_timestamp.dt.dayofyear
times['hour'] = _log_timestamp.dt.hour

times

In [None]:
# Rows per (browser, hour); the non-null 'month' values are what gets counted.
# Only the 130 largest combinations are kept for plotting.
data = (times[['browser', 'hour', 'month']]
     .groupby(['browser', 'hour'])
     .count()
     .reset_index()
     .rename(columns = {'month':'logs'})
     .sort_values('logs', ascending = False)
     .head(130)
)

# plt.subplots returns (figure, axes); unpack both and draw on these axes
# explicitly instead of binding the whole tuple to `ax`.
fig, ax = plt.subplots(figsize = (15,8))
sns.lineplot(data = data, x = 'hour', y = 'logs', hue = 'browser', ax = ax)

In [None]:
# Number of rows per HTTP status (counted through the DNS column), largest first.
_rows_per_status = Logs_copy2[['STATUS', 'DNS']].groupby(['STATUS']).count()
graph = (
    _rows_per_status
    .sort_values(by = 'DNS', ascending = False)
    .rename(columns = {'DNS' : 'Frequency'})
)
graph.plot.bar(y = 'Frequency', color = 'r', ylabel = 'Frequency', legend = None, figsize = (14,7))
plt.show()

In [None]:
# Number of rows per client type (counted through the DNS column), largest first.
_rows_per_client_type = Logs_copy2[['CLIENT_TYPE', 'DNS']].groupby(['CLIENT_TYPE']).count()
graph = (
    _rows_per_client_type
    .sort_values(by = 'DNS', ascending = False)
    .rename(columns = {'DNS' : 'Frequency'})
)
graph.plot.bar(y = 'Frequency', color = 'g', ylabel = 'Frequency', legend = None, figsize = (14,7))
plt.show()

In [None]:
sns.set(rc={"figure.figsize": (20, 10)})
# `displot` is figure-level: it creates its own figure and takes no `ax=`
# argument, so the size is given with height/aspect (10 in tall, 20 in wide).
sns.displot(data = Logs_copy2, x = "DEVICE_TYPE", hue = "DNS", multiple = "stack", height = 10, aspect = 2)

In [None]:
sns.set(rc={"figure.figsize": (20, 10)})
# `displot` is figure-level: it creates its own figure and takes no `ax=`
# argument, so the size is given with height/aspect (10 in tall, 20 in wide).
sns.displot(data = Logs_copy2, x = "OS_NAME", hue = "DEVICE_MODEL", multiple = "stack", height = 10, aspect = 2)

In [None]:
# Coordinates per log row. They are coerced to numbers so blanks or stray text
# become NaN, and rows without a usable position are dropped. The previous
# `!= ' '` comparison filtered nothing once the values are numeric.
_coords = Logs_copy2[['IP', 'LONGITUDE', 'LATITUDE']].copy()
_coords[['LONGITUDE', 'LATITUDE']] = _coords[['LONGITUDE', 'LATITUDE']].apply(pd.to_numeric, errors = 'coerce')
_coords = _coords.dropna(subset = ['LONGITUDE', 'LATITUDE'])

# One row per IP: mean position plus the number of log rows, stored in the
# 'IP' column as a float (used as the marker size by the map cells). String
# aggregation names replace the np.mean callables.
GEOLOCATIONS = (_coords.groupby('IP')
              .agg({'LONGITUDE' : 'mean', 'LATITUDE': 'mean', 'IP' : 'count'})
              .astype({'IP': float})
            )
# Inspect the frame just built (`logs_byIP` was never defined).
GEOLOCATIONS.info()

In [None]:
fig, ax = plt.subplots(figsize = (15, 8))

# One translucent marker per IP, sized by its number of log rows.
ax.scatter(
    GEOLOCATIONS['LONGITUDE'],
    GEOLOCATIONS['LATITUDE'],
    s = GEOLOCATIONS['IP']*2/10,
    alpha = 0.16,
    c = 'darkgreen',
)

# Country outlines drawn on the same axes.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases —
# confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.boundary.plot(ax = ax, figsize=(20,5), linewidth=0.25, edgecolor='black', color='black')

# Fixed latitude/longitude window for the world view.
ax.set_ylim((-60,70))
ax.set_xlim((-130,150))

ax.set_title('Geolocations of web visits')

plt.show()

In [None]:
fig, ax = plt.subplots(figsize = (15, 8))

# Same markers as the world view: one per IP, sized by its number of log rows.
ax.scatter(
    GEOLOCATIONS['LONGITUDE'],
    GEOLOCATIONS['LATITUDE'],
    s = GEOLOCATIONS['IP']*2/10,
    alpha = 0.16,
    c = 'darkgreen',
)

# Country outlines drawn on the same axes.
# NOTE(review): `gpd.datasets` is deprecated in recent geopandas releases —
# confirm the installed version still ships 'naturalearth_lowres'.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.boundary.plot(ax = ax, figsize=(20,5), linewidth=0.25, edgecolor='black', color='black')

# Narrower latitude/longitude window than the world view.
ax.set_ylim((25,60))
ax.set_xlim((-20,30))

ax.set_title('Zoom zone more density of visits to the web')

plt.show()

- Save the data obtained for later reuse

In [None]:
# `replace` returns a new DataFrame, so Logs_copy2 itself is left untouched.
# Empty strings are written out as the text 'null'.
Logs_export = Logs_copy2.replace('', 'null')

Logs_export.to_csv('../Data/Logs_export.csv', index = False)