# IT Academy - Data Science amb Python
## Tasca 7: Estructura de control

###  [Github Registre de Logs](https://github.com/jesussantana/Registre_de_logs)

###  Exercici 1
- Normalitza, identifica i enumera cada un dels atributs / variables de l'estructura de l'arxiu "Web_access_log-akumenius.com" que trobaràs al repositori de GitHub "Data-sources".

In [None]:
import numpy as np
import pandas as pd
import requests
import pickle
import json
import time
import re
import io 

from datetime import datetime
from datetime import timedelta
from joblib import Parallel, delayed
from pandas import json_normalize

pd.set_option('display.max_columns', None)

- Load the data and check how the fields have been split into columns
  - The column names we will use:
    - 'DNS','ip','Date','Time','Request','Status','Size','Referer','UserAgent'

In [None]:
# Location of the raw access log, relative to the working directory.
path = '../Data/Web_access_log-akumenius.com.txt'

# The separator is a regex with three alternatives: "whitespace + space",
# " - " and ' "'. It is written as a raw string so that \s and \- reach the
# regex engine verbatim instead of depending on Python passing unknown string
# escapes through (which emits an invalid-escape warning). The pattern itself
# is unchanged.
Logs_raw = pd.read_csv(
    path,
    sep=r'\s | \- | "',
    names=['DNS', 'ip', 'Date', 'Time', 'Request', 'Status', 'Size', 'Referer', 'UserAgent'],
    engine='python',
)

# All later cleaning is done on a copy; Logs_raw itself is left untouched.
Logs_copy = Logs_raw.copy()

Logs_copy.head()

###  Exercici 2
- Neteja, preprocesa, estructura i transforma (dataframe) les dades del registre d'Accés a la web.

- Check rows and columns

In [None]:
Logs_copy.tail()

In [None]:
Logs_copy.shape

- Check for null data

In [None]:
Logs_copy.isnull().sum()

In [None]:
Logs_copy.head()

In [None]:
Logs_copy.tail()

- Reorder columns of data

In [None]:
# Copy each field into the column it belongs to. The order matters: every
# source column is read before a later step overwrites it.
# NOTE(review): presumably needed because the separator regex left the parsed
# fields under the wrong headers - confirm against Logs_copy.head() above.
column_moves = {
    'UserAgent': 'Request',
    'Request': 'Date',
    'Referer': 'Time',
    'Date': 'ip',
}
for target_column, source_column in column_moves.items():
    Logs_copy[target_column] = Logs_copy[source_column]

# Time and ip are blanked here; later cells fill them again.
for blanked_column in ('Time', 'ip'):
    Logs_copy[blanked_column] = np.nan

Logs_copy.head()

- DNS & IP data split

In [None]:
# The DNS column is split on whitespace and its first two pieces become the
# DNS and ip columns. Raw string: '\s' is a regex class, not a Python escape.
Logs_copy[['DNS','ip']] = Logs_copy.DNS.str.split(r'\s', expand = True).get([0, 1])

In [None]:
Logs_copy.head()

- Check how many different IPs exist

In [None]:
Logs_copy.DNS.unique()

In [None]:
len(Logs_copy.ip.unique())

- Time data extraction

In [None]:
# Capture, from the text in Date, the HH:MM:SS part that follows a ':' plus
# whatever comes after it up to the last ']' (presumably the UTC offset -
# confirm). Raw string keeps the \d escapes intact; expand=False returns a
# Series, which is what a single column assignment expects.
Logs_copy.Time = Logs_copy.Date.str.extract(r':(\d{2}:\d{2}:\d{2}.*)]', expand=False)

Logs_copy.head()

- Date data extraction

In [None]:
# Keep only the "digits/word/digits" part of Date (raw string for the regex,
# expand=False so a Series is assigned rather than a one-column frame) ...
Logs_copy.Date = Logs_copy.Date.str.extract(r'(\d+/\w+/\d+)', expand=False)

# ... and parse it as day / abbreviated month name / four-digit year.
Logs_copy.Date = pd.to_datetime(Logs_copy.Date, format = '%d/%b/%Y')

In [None]:
Logs_copy.head()

- Request & Status data split

In [None]:
# Split Request on the double quote; the first two pieces become the
# Request and Status columns.
request_pieces = Logs_copy['Request'].str.split('"', expand=True)
Logs_copy[['Request', 'Status']] = request_pieces.get([0, 1])

In [None]:
Logs_copy.tail()

In [None]:
Logs_copy.head()

- Size data extraction

In [None]:
# Size is the run of digits at the very end of the Status text (NaN when the
# text does not end in digits). Raw string for the regex; expand=False yields
# a Series for the column assignment.
Logs_copy.Size = Logs_copy.Status.str.extract(r'(\d+$)', expand=False)
Logs_copy

- Clean Size data

In [None]:
%%time

# Blank out every Size that is exactly the string '200'.
# NOTE(review): presumably these are status codes captured as the size when
# no byte count follows them - confirm.
size_is_200 = Logs_copy['Size'] == '200'
Logs_copy['Size'] = Logs_copy['Size'].mask(size_is_200)

In [None]:
Logs_copy

- Status data extraction

In [None]:
# Reduce Status to its first run of three digits (raw string for the regex,
# expand=False so a Series is assigned).
Logs_copy.Status = Logs_copy.Status.str.extract(r'(\d{3})', expand=False)
Logs_copy.tail()

- Clean Referer data

In [None]:
%%time

def _clean_referer(value):
    """Return one Referer cell cleaned up, or NaN when there is no referer.

    A cell containing ``-"`` is mapped to NaN. Any other string is returned
    with its last character stripped from the right end (every trailing
    repetition of that character is removed, as ``str.rstrip`` does).
    Cells that are not strings (e.g. NaN) or are empty are mapped to NaN
    as well; previously they raised TypeError / IndexError.
    """
    # Missing or empty cells can be neither searched nor indexed.
    if not isinstance(value, str) or not value:
        return np.nan
    if '-"' in value:
        return np.nan
    return value.rstrip(value[-1])


Logs_copy.Referer = Logs_copy.Referer.apply(_clean_referer)

In [None]:
Logs_copy

###  Exercici 3
- Geolocalitza les IP's. Aqui tens una pagina de interes:
  - [freegeoip](https://freegeoip.app/)

- Export the IPs to a file as a safeguard

In [None]:
# Take the ip column with empty strings written as the literal 'null'
# (replace returns a new Series, so Logs_copy is not modified) and save it
# without the index.
Ips_export = Logs_copy['ip'].replace('', 'null')
Ips_export.to_csv('../Data/Ips_export.csv', index=False)

- Ips file recovery

In [None]:
# Re-load the exported IP file; note that this rebinds the `path` variable
# used earlier for the raw log.
path = '../Data/Ips_export.csv'

# NOTE(review): 'delimiter' is passed as the separator *text*, not as a keyword.
# Presumably it was chosen because it never occurs in the file, so each line is
# read as a single column - confirm. Multi-character separators require
# engine='python' according to the pandas documentation.
Ips_raw = pd.read_csv(path, sep= 'delimiter', engine='python')

Ips_raw.head()

In [None]:
Ips_raw.tail()

- Make a copy of the data to be used and check it

In [None]:
# Work on a copy of the re-loaded frame; ips_unique holds each distinct value
# of its ip column once.
Ips_copy = Ips_raw.copy()
ips_unique = Ips_copy.ip.unique()

In [None]:
len(Ips_copy.ip.unique())

In [None]:
ips_unique

- Function to extract information from freegeoip

In [None]:

# NOTE(review): neither name below is referenced by any other visible cell -
# confirm before removing.
localhost = "127.0.0.1"

Info_list = []

def extract_info(ip):
    """Look up one IP address on freegeoip.app and return the decoded JSON.

    Parameters
    ----------
    ip : str
        Text appended to ``https://freegeoip.app/json/`` to form the request URL.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the response, or ``np.nan`` when
        ``ip`` is not a non-empty string or the request / decoding fails.
        Failures are deliberately swallowed (best effort) so one bad address
        does not abort a batch of lookups.
    """
    # Imported locally: the notebook's import cell never brings urlopen into
    # scope, so every call used to hit a NameError that the old bare
    # ``except`` silently converted into NaN.
    from http.client import HTTPException
    from urllib.request import urlopen

    # NaN / None / blank entries cannot form a valid URL.
    if not isinstance(ip, str) or not ip:
        return np.nan

    try:
        # The timeout keeps a stalled connection from blocking a worker
        # forever; the context manager closes the response.
        with urlopen("https://freegeoip.app/json/" + ip, timeout=10) as response:
            return json.load(response)
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError/timeouts, ValueError covers invalid
        # JSON and bad URLs, HTTPException covers protocol-level failures.
        return np.nan

- Ips Information Extraction

In [None]:
extract_info(ips_unique[1000])

In [None]:
%%time

# Run one delayed extract_info call per entry of ips_unique with n_jobs=8 on
# the multiprocessing backend. The result is bound to geolocation_list because
# that is the name the following cells read; `geolocation` is kept as an alias
# for the cells that still use it.
geolocation_list = Parallel(n_jobs = 8, backend = "multiprocessing")(
    delayed(extract_info)(ip) for ip in ips_unique
)
geolocation = geolocation_list

- joblib.Parallel uses the backend module to start worker processes, executing tasks simultaneously on separate CPUs.
- Extracting the information for the 2921 IPs takes less than 50 seconds, which is much faster, when using 4 cores and 8 threads

In [None]:
type(geolocation_list)

- check the data obtained

In [None]:
geolocation_list[1000]

In [None]:
len(geolocation_list)

- Save the results to a JSON file and load them back

In [None]:
# Serialise the lookup results and write them to geolocation.json in the
# working directory.
serialised = json.dumps(geolocation_list)
with open('geolocation.json', mode='w') as json_out:
    json_out.write(serialised)

In [None]:
# Read the saved lookups back. The context manager closes the file, which the
# previous open(...).read() one-liner never did.
with open('geolocation.json') as json_in:
    geolocation_list = json.load(json_in)
  

In [None]:
geolocation_list[1]

In [None]:
# One row per successful lookup, one column per JSON key. Entries that are not
# dicts (the NaN placeholders of failed lookups) are skipped. Wrapping the list
# in another list, as before, produced a single row whose cells were whole dicts.
geolocation_df = pd.DataFrame(
    [record for record in geolocation_list if isinstance(record, dict)]
)

In [None]:
geolocation_df.head()

In [None]:
# geolocation_df =   # unfinished draft: an assignment without a value is a SyntaxError, so it is left commented out

In [None]:
# pandas has no DataFrame.from_list; the plain constructor accepts a list of
# dicts directly. Non-dict entries (NaN placeholders of failed lookups) are
# skipped so every row comes from a decoded response.
geolocation_df = pd.DataFrame(
    [record for record in geolocation_list if isinstance(record, dict)]
)

In [None]:

# Plain-list copy of every lookup result. The earlier index loop ran over
# range(0, len(...) - 1) and therefore silently dropped the last entry.
geolocation_df = list(geolocation_list)

geolocation_df[1]['country_name']

In [None]:
type(geolocation_list)

In [None]:
geolocation_list[1]['ip']

- Create the variable used to join the data obtained with the data we already had

In [None]:
geolocation_list[1000]

In [None]:
# Lists have no .map(); build the table directly from the decoded responses,
# skipping non-dict entries (NaN placeholders of failed lookups).
geolocation_df = pd.DataFrame(
    [record for record in geolocation_list if isinstance(record, dict)]
)

# The following cells call .head() on, and merge with, `geolocation`, so that
# name must refer to the DataFrame as well.
geolocation = geolocation_df

In [None]:
geolocation.head()

In [None]:
len(geolocation)

- Merge both tables on the `ip` column

In [None]:
# Inner join on the shared 'ip' column: only log rows whose ip also appears in
# geolocation are kept.
Logs_copy = pd.merge(Logs_copy, geolocation, how='inner', on='ip')

- Check the result

In [None]:
Logs_copy

###  Exercici 4
- Mostreu-me la teva creativitat, Sorprèn-me fes un pas més enllà amb el analysis anterior.

- Extract data from location information

In progress ...

- Extract UserAgent Data

In [None]:
%%time

from device_detector import SoftwareDetector


def _parse_user_agent(user_agent):
    """Run SoftwareDetector on one UserAgent value and return the parse result."""
    return SoftwareDetector(user_agent).parse()


# Devices is the UserAgent column; device holds one parse result per row.
Devices = Logs_copy.UserAgent

device = Devices.apply(_parse_user_agent)

print(device)

- Assign the values

In [None]:
%%time

# New column name -> accessor method called on each parsed user agent.
# The columns are created in this order.
parsed_accessors = {
    'Client_Name': 'client_name',
    'Client_Type': 'client_type',
    'Client_Version': 'client_version',
    'Os_Name': 'os_name',
    'Os_Version': 'os_version',
    'Device_Type': 'device_type',
}
for new_column, accessor_name in parsed_accessors.items():
    # The default argument freezes the accessor name for this iteration.
    Logs_copy[new_column] = device.apply(
        lambda parsed, accessor_name=accessor_name: getattr(parsed, accessor_name)()
    )

- Check the result

In [None]:
Logs_copy

In [None]:
Logs_copy.UserAgent[0]

- The column from which we have extracted the data is deleted

In [None]:
# Remove the UserAgent column in place now that its details live in the
# Client_*/Os_*/Device_Type columns.
Logs_copy.drop(columns='UserAgent', inplace=True)

Logs_copy.tail()

- Clean Client Data

In [None]:
%%time

def _shorten_apache(client_name):
    """Return the first six characters of names containing 'Apache', else the name unchanged."""
    if re.search('Apache', client_name) is None:
        return client_name
    return client_name[:6]


Logs_copy.Client_Name = Logs_copy.Client_Name.apply(_shorten_apache)

In [None]:
Logs_copy

- Visualize the data in progress

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:

# Pie chart of Location frequencies; counts of 2000 or fewer are turned into
# NaN by where() rather than being removed from the index.
# NOTE(review): no visible cell creates a Location column - confirm where it
# is supposed to come from.
location_counts = Logs_copy.Location.value_counts(normalize=False)
above_threshold = Logs_copy.Location.value_counts() > 2000
location_counts.where(above_threshold).plot(kind='pie', figsize=(10, 6))

In [None]:
# Rows per DNS, counted through the non-null Location values, most frequent first.
# NOTE(review): no visible cell creates a Location column - confirm.
dns_counts = Logs_copy.groupby(['DNS'])[['Location']].count()
dns_counts = dns_counts.sort_values(by='Location', ascending=False)
graph = dns_counts.rename(columns={'Location': 'Frequency'})
graph.plot.bar(y='Frequency', color='g', ylabel='Frequency', legend=None, figsize=(10, 6))
plt.show()

In [None]:
# Rows per Status, counted through the non-null Time values, most frequent first.
status_counts = Logs_copy.groupby(['Status'])[['Time']].count()
status_counts = status_counts.sort_values(by='Time', ascending=False)
graph = status_counts.rename(columns={'Time': 'Frequency'})
graph.plot.bar(y='Frequency', color='r', ylabel='Frequency', legend=None, figsize=(10, 6))
plt.show()

In [None]:
# Rows per Client_Type, counted through the non-null Time values, most frequent first.
client_type_counts = Logs_copy.groupby(['Client_Type'])[['Time']].count()
client_type_counts = client_type_counts.sort_values(by='Time', ascending=False)
graph = client_type_counts.rename(columns={'Time': 'Frequency'})
graph.plot.bar(y='Frequency', color='y', ylabel='Frequency', legend=None, figsize=(10, 6))
plt.show()

In [None]:
sns.displot(data = Logs_copy, x = "Os_Name", hue = "Date", multiple = "stack")

In [None]:
sns.displot(Logs_copy.Os_Name)

In [None]:
sns.displot(data = Logs_copy, x = "Device_Type", hue = "Date", multiple = "stack")

- Save the data obtained for later reuse

In [None]:
# Build the export frame with empty strings written as the literal 'null'
# (replace returns a new frame, so Logs_copy is not modified) and save it
# without the index.
Logs_export = Logs_copy.replace('', 'null')
Logs_export.to_csv('../Data/Logs_export.csv', index=False)