In [None]:
import pandas as pd
import numpy as np

# Read and explore data

In [None]:
# Load the raw click log; every later cell works on this frame.
df = pd.read_csv(filepath_or_buffer='data/clickdata.csv')

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# For each (agent class, recognition type) pair, count the non-null values
# in every remaining column.
df.groupby(by=['ua_agent_class', 'visitor_recognition_type']).count()

Each row in `df` represents a single page request within a session.

CSV column definitions:
* epoch_ms: epoch in milliseconds
* session_id: session identifier
* country_by_ip_address: estimated country based on GeoIP lookup
* region_by_ip_address: estimated region based on GeoIP lookup
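* url_without_parameters: URL of the requested page, with query parameters removed
* referrer_without_parameters: URL of the referring page, with query parameters removed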
* visitor_recognition_type: ANONYMOUS, RECOGNIZED (by cookie) or LOGGEDIN
* ua_agent_class: user-agent class (e.g. Browser, Robot); this is the class label the model predicts

In [None]:
# Clean up the different encodings of "missing": real NaN cells and the
# literal text 'Unknown' both end up as the empty string.
# NaN is matched by value, so no regex flag is passed for it (pandas documents
# regex=True as requiring a string pattern).
df = df.replace(np.nan, '')
# regex=True makes this a substring replacement: 'Unknown' is removed wherever
# it occurs inside a cell, not only when it is the entire cell value.
df = df.replace('Unknown', '', regex=True)

In [None]:
# Let's look at some of the columns, starting with the distinct
# visitor recognition types.
df.loc[:, 'visitor_recognition_type'].unique()

In [None]:
# Distinct country values produced by the GeoIP lookup.
df.loc[:, 'country_by_ip_address'].unique()

In [None]:
# The classes of interest are 'Robot' and 'Browser' (i.e. not a robot).
print(df.loc[:, 'ua_agent_class'].unique())

In [None]:
# Reduce the amount of detail in the class labels.
# regex=False is passed explicitly so both patterns are treated as literal
# text on every pandas version (the default for `regex` changed over time).
# Merge all the different human types
df['ua_agent_class'] = df['ua_agent_class'].str.replace('Browser Webview', 'Browser', regex=False)
# Merge all the different non-human types
df['ua_agent_class'] = df['ua_agent_class'].str.replace('Robot Mobile', 'Robot', regex=False)
# Show the remaining labels to confirm the merge.
print(df['ua_agent_class'].unique())

# Train a model

In [None]:
# Select a few categorical columns and one-hot encode them into features.
# (pandas is already imported as `pd` in the first cell, so it is not
# re-imported here.)
feature_columns = ['country_by_ip_address', 'region_by_ip_address', 'visitor_recognition_type']
# drop_first=True drops the first level of each column, leaving k-1
# indicator columns for a column with k distinct values.
X = pd.get_dummies(data=df[feature_columns], drop_first=True)
# Target: the user-agent class label.
y = df['ua_agent_class']

In [None]:
# Naive hold-out split: a plain random split using scikit-learn's default
# test size, seeded so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Pick an off-the-shelf classifier: k-nearest neighbours with its default
# hyper-parameters; n_jobs=-1 asks scikit-learn to use all available
# processors for the neighbour search.
from sklearn.neighbors import KNeighborsClassifier
my_classifier = KNeighborsClassifier(
    n_jobs=-1,
)

In [None]:
# Fit the classifier on the training portion of the data.
my_classifier.fit(X=X_train, y=y_train)

# Evaluate the model

In [None]:
# Model score: mean accuracy on the held-out test set.
my_classifier.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix on the test set: rows are the true classes, columns the
# predicted classes, both in sorted label order.
from sklearn.metrics import confusion_matrix
y_pred = my_classifier.predict(X=X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

# Predict a single element

In [None]:
# Predict an individual data record (row position 42 of the test set).
# Indexing with a list, iloc[[42]], keeps the row as a one-row DataFrame, so
# the classifier receives the same column names it was fitted with. Wrapping
# the row Series in a plain list drops those names, which makes recent
# scikit-learn versions warn about missing feature names.
# NOTE: y_pred is rebound here from the array of test-set predictions to a
# single predicted label.
y_pred = my_classifier.predict(X_test.iloc[[42]])[0]
# True label of the same record, for comparison.
y_real = y_test.iloc[42]
print(y_pred)
print(y_real)