In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

In [2]:
import env

In [3]:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'

In [4]:
sql = '''
select *
from logs
join cohorts on logs.cohort_id = cohorts.id;
'''

In [5]:
df = pd.read_sql(sql, get_connection('curriculum_logs'))

In [6]:
df.index = pd.to_datetime(df.date + " " + df.time)

In [7]:
df = df.drop(columns=['date','time', 'created_at', 'updated_at', 'deleted_at'], axis=1)

In [8]:
# df.to_csv('curriculum_logs.csv')

In [9]:
# df = pd.read_csv('curriculum_logs.csv')

In [10]:
df.head()

Unnamed: 0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,program_id
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2


In [11]:
df.head(10)

Unnamed: 0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,program_id
2018-01-26 09:55:03,/,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:02,java-ii,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:05,java-ii/object-oriented-programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:06,slides/object_oriented_programming,1,8.0,97.105.19.61,8,Hampton,#hampton,2015-09-22,2016-02-06,1
2018-01-26 09:56:24,javascript-i/conditionals,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2
2018-01-26 09:56:41,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2
2018-01-26 09:56:46,javascript-i/conditionals,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2
2018-01-26 09:56:48,javascript-i/functions,3,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2
2018-01-26 09:56:59,javascript-i/loops,2,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2
2018-01-26 09:58:26,javascript-i/functions,4,22.0,97.105.19.61,22,Teddy,#teddy,2018-01-08,2018-05-17,2


In [12]:
df.shape

(847330, 10)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 847330 entries, 2018-01-26 09:55:03 to 2021-04-21 16:44:39
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   path        847329 non-null  object 
 1   user_id     847330 non-null  int64  
 2   cohort_id   847330 non-null  float64
 3   ip          847330 non-null  object 
 4   id          847330 non-null  int64  
 5   name        847330 non-null  object 
 6   slack       847330 non-null  object 
 7   start_date  847330 non-null  object 
 8   end_date    847330 non-null  object 
 9   program_id  847330 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 71.1+ MB


In [14]:
df.describe()

Unnamed: 0,user_id,cohort_id,id,program_id
count,847330.0,847330.0,847330.0,847330.0
mean,456.707344,48.501049,48.501049,2.086004
std,250.734201,32.795482,32.795482,0.388231
min,1.0,1.0,1.0,1.0
25%,263.0,28.0,28.0,2.0
50%,476.0,33.0,33.0,2.0
75%,648.0,57.0,57.0,2.0
max,981.0,139.0,139.0,4.0


In [15]:
df.isnull().sum()

path          1
user_id       0
cohort_id     0
ip            0
id            0
name          0
slack         0
start_date    0
end_date      0
program_id    0
dtype: int64

In [16]:
df['cohort_id'] = df.cohort_id.fillna(0)


In [17]:
df.isnull().sum()

path          1
user_id       0
cohort_id     0
ip            0
id            0
name          0
slack         0
start_date    0
end_date      0
program_id    0
dtype: int64

In [18]:
for col in df.columns:
    print(col)
    print(df[col].value_counts(),'\n')

path
/                                      45854
javascript-i                           18203
toc                                    17591
search/search_index.json               17534
java-iii                               13166
                                       ...  
html-css/grid/css-grid-fundamentals        1
2.03.03_Power                              1
slidesexceptions_and_errorhandlings        1
jacascript-i/introdection                  1
capstone/53                                1
Name: path, Length: 2224, dtype: int64 

user_id
11     17913
64     16322
53     12329
314     7783
1       7404
       ...  
952        1
649        1
940        1
163        1
652        1
Name: user_id, Length: 911, dtype: int64 

cohort_id
28.0     84031
33.0     40730
29.0     38096
62.0     37109
53.0     36902
24.0     35636
57.0     33844
56.0     33568
51.0     32888
59.0     32015
22.0     30926
58.0     29855
32.0     29356
23.0     28534
52.0     28033
26.0     27749
34.0     26538


In [31]:
john = df[(df.ip == '99.43.137.186')]

In [32]:
john.head()

Unnamed: 0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,program_id
2020-12-08 10:49:39,/,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 10:58:00,/,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 11:01:11,fundamentals/intro-to-data-science,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 11:01:11,fundamentals/AI-ML-DL-timeline.jpg,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 11:01:11,fundamentals/modern-data-scientist.jpg,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3


In [33]:
john.shape

(795, 10)

In [34]:
easley = df[df['name'] == 'Easley']

In [35]:
easley.head()

Unnamed: 0,path,user_id,cohort_id,ip,id,name,slack,start_date,end_date,program_id
2020-12-08 10:49:37,/,835,133.0,173.173.109.5,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 10:49:39,/,836,133.0,99.43.137.186,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 10:49:40,/,837,133.0,66.69.79.82,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 10:49:52,/,838,133.0,99.158.249.67,133,Easley,#easley,2020-12-07,2021-06-08,3
2020-12-08 10:50:16,/,839,133.0,130.45.49.89,133,Easley,#easley,2020-12-07,2021-06-08,3


In [36]:
easley.shape

(14715, 10)

In [46]:
easley.user_id.value_counts()

845    1891
841    1414
844    1152
837    1131
840    1018
835     974
847     909
143     808
838     798
836     795
851     778
849     744
848     653
842     608
843     550
846     400
839      92
Name: user_id, dtype: int64

In [44]:
easley.user_id.value_counts().resample('D').plot()

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Int64Index'