In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import env

from datetime import date

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Acquire

##### function that accesses db server and querries logs and cohorts tables from the curriculm_logs db.

In [2]:
#db access
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

#curriculum_logs db
log_sql = "SELECT *\
              FROM logs\
              LEFT JOIN cohorts ON logs.cohort_id = cohorts.id;"

##### grabs dataset from get_connection function above and formats it into a pandas dataframe.

In [3]:
#acquires curriculum_logs dataset
def get_log_data():
    return pd.read_sql(log_sql,get_connection('curriculum_logs'))

##### sets the queried datframe as 'df'.

In [4]:
df = get_log_data()
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0


##### create a dictionary with the class types to append to the df by 'program id'.

In [5]:
class_type_dict = {'id':[1, 2, 3, 4],
        'Name':['PHP Full Stack Web Development', 
                'Java Full Stack Web Development', 
                'Data Science', 
                'Front End Web Development'],
        'subdomain':['php','java','ds','fe']}
class_type = pd.DataFrame(class_type_dict)
print (class_type)

   id                             Name subdomain
0   1   PHP Full Stack Web Development       php
1   2  Java Full Stack Web Development      java
2   3                     Data Science        ds
3   4        Front End Web Development        fe


##### merges df and class_type datasets

In [6]:
df = df.merge(class_type, how='left', left_on='program_id', right_on='id')

##### caches dataset for accessibility.

In [7]:
df.to_csv('/Users/hector/codeup-data-science/anomaly-detection-project/log_data.csv')

# Prepare

In [8]:
df.head()

Unnamed: 0,date,time,path,user_id,cohort_id,ip,id_x,name,slack,start_date,end_date,created_at,updated_at,deleted_at,program_id,id_y,Name,subdomain
0,2018-01-26,09:55:03,/,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,1.0,PHP Full Stack Web Development,php
1,2018-01-26,09:56:02,java-ii,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,1.0,PHP Full Stack Web Development,php
2,2018-01-26,09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,1.0,PHP Full Stack Web Development,php
3,2018-01-26,09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8.0,Hampton,#hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,,1.0,1.0,PHP Full Stack Web Development,php
4,2018-01-26,09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22.0,Teddy,#teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,,2.0,2.0,Java Full Stack Web Development,java


In [9]:
df.shape

(900223, 18)

In [10]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,900223.0,458.825707,249.296767,1.0,269.0,475.0,660.0,981.0
cohort_id,847330.0,48.501049,32.795482,1.0,28.0,33.0,57.0,139.0
id_x,847330.0,48.501049,32.795482,1.0,28.0,33.0,57.0,139.0
program_id,847330.0,2.086004,0.388231,1.0,2.0,2.0,2.0,4.0
id_y,847330.0,2.086004,0.388231,1.0,2.0,2.0,2.0,4.0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900223 entries, 0 to 900222
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   cohort_id   847330 non-null  float64
 5   ip          900223 non-null  object 
 6   id_x        847330 non-null  float64
 7   name        847330 non-null  object 
 8   slack       847330 non-null  object 
 9   start_date  847330 non-null  object 
 10  end_date    847330 non-null  object 
 11  created_at  847330 non-null  object 
 12  updated_at  847330 non-null  object 
 13  deleted_at  0 non-null       object 
 14  program_id  847330 non-null  float64
 15  id_y        847330 non-null  float64
 16  Name        847330 non-null  object 
 17  subdomain   847330 non-null  object 
dtypes: float64(4), int64(1), object(13)
memory u

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 900223 entries, 0 to 900222
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   date        900223 non-null  object 
 1   time        900223 non-null  object 
 2   path        900222 non-null  object 
 3   user_id     900223 non-null  int64  
 4   cohort_id   847330 non-null  float64
 5   ip          900223 non-null  object 
 6   id_x        847330 non-null  float64
 7   name        847330 non-null  object 
 8   slack       847330 non-null  object 
 9   start_date  847330 non-null  object 
 10  end_date    847330 non-null  object 
 11  created_at  847330 non-null  object 
 12  updated_at  847330 non-null  object 
 13  deleted_at  0 non-null       object 
 14  program_id  847330 non-null  float64
 15  id_y        847330 non-null  float64
 16  Name        847330 non-null  object 
 17  subdomain   847330 non-null  object 
dtypes: float64(4), int64(1), object(13)
memory u

In [13]:
#merge date and time columns
df['date'] = df['date'] +' '+ df['time']#concat time and date

In [14]:
#datetime conversion and set index   
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

In [15]:
#drop columns. these columns have no use.
df = df.drop(columns=['time', 'id_x', 'slack', 'id_y', 'Name', 'deleted_at'])

In [16]:
#rename columns
df = df.rename(columns = {'name':'cohort', 'start_date':'class_start_date', 'end_date':'class_end_date'})    

In [18]:
df.isnull().sum()

path                    1
user_id                 0
cohort_id           52893
ip                      0
cohort              52893
class_start_date    52893
class_end_date      52893
created_at          52893
updated_at          52893
program_id          52893
subdomain           52893
dtype: int64

In [19]:
df = df.dropna()

In [20]:
#converts dtypes for listed features
convert_dict_int = {'cohort_id':int, 'program_id':int, 'class_start_date':'datetime64[ns]', 'class_end_date':'datetime64[ns]', 'created_at':'datetime64[ns]', 'updated_at':'datetime64[ns]'}
df = df.astype(convert_dict_int)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 847329 entries, 2018-01-26 09:55:03 to 2021-04-21 16:44:39
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   path              847329 non-null  object        
 1   user_id           847329 non-null  int64         
 2   cohort_id         847329 non-null  int64         
 3   ip                847329 non-null  object        
 4   cohort            847329 non-null  object        
 5   class_start_date  847329 non-null  datetime64[ns]
 6   class_end_date    847329 non-null  datetime64[ns]
 7   created_at        847329 non-null  datetime64[ns]
 8   updated_at        847329 non-null  datetime64[ns]
 9   program_id        847329 non-null  int64         
 10  subdomain         847329 non-null  object        
dtypes: datetime64[ns](4), int64(3), object(4)
memory usage: 77.6+ MB


In [22]:
df.head()

Unnamed: 0_level_0,path,user_id,cohort_id,ip,cohort,class_start_date,class_end_date,created_at,updated_at,program_id,subdomain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2018-01-26 09:55:03,/,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1,php
2018-01-26 09:56:02,java-ii,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1,php
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1,php
2018-01-26 09:56:06,slides/object_oriented_programming,1,8,97.105.19.61,Hampton,2015-09-22,2016-02-06,2016-06-14 19:52:26,2016-06-14 19:52:26,1,php
2018-01-26 09:56:24,javascript-i/conditionals,2,22,97.105.19.61,Teddy,2018-01-08,2018-05-17,2018-01-08 13:59:10,2018-01-08 13:59:10,2,java


In [24]:
df.path.value_counts()

/                                    45854
javascript-i                         18203
toc                                  17591
search/search_index.json             17534
java-iii                             13166
                                     ...  
html                                     1
Exploration_Lecture                      1
content/javascript-with-html.html        1
annotations                              1
users.json                               1
Name: path, Length: 2224, dtype: int64

In [25]:
path_list = df.path.values.tolist()

In [26]:
path_list

['/',
 'java-ii',
 'java-ii/object-oriented-programming',
 'slides/object_oriented_programming',
 'javascript-i/conditionals',
 'javascript-i/loops',
 'javascript-i/conditionals',
 'javascript-i/functions',
 'javascript-i/loops',
 'javascript-i/functions',
 'mkdocs/search_index.json',
 'javascript-i/introduction/working-with-data-types-operators-and-variables',
 '/',
 'javascript-i',
 'javascript-i/introduction/working-with-data-types-operators-and-variables',
 'javascript-i/introduction/variables',
 'javascript-i/introduction/operators',
 'javascript-i/introduction/working-with-data-types-operators-and-variables',
 'javascript-i/functions',
 'javascript-i/functions',
 'javascript-i/functions',
 'javascript-i/conditionals',
 'javascript-i/functions',
 'mkdocs/search_index.json',
 'git/merge-conflict-demo',
 'mkdocs/search_index.json',
 'git/working-in-teams',
 'javascript-i/introduction/primitive-types',
 'javascript-i/introduction/operators',
 'javascript-i/functions',
 '/',
 'javascr