In [45]:
import wrangle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Put pandas' 'display.max_rows' option back to its default value so long
# outputs below are truncated rather than printed in full.
pd.reset_option('display.max_rows')

In [2]:
# Load the curriculum access logs and discard every row that has a missing
# value; chaining keeps the load + clean step in a single, re-runnable line.
df = wrangle.get_curriculum_logs_data().dropna()
df.head()

The df has 847330 rows and 10 columns.


Unnamed: 0,cohortid,start_date,end_date,cohortname,program_id,date,time,path,user_id,ip,program_name,is_staff
0,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:47,/,11,97.105.19.61,web_dev,False
1,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:51,javascript-i,11,97.105.19.61,web_dev,False
2,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:15:02,javascript-i/functions,11,97.105.19.61,web_dev,False
3,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:29,appendix/further-reading/javascript/hoisting.html,11,97.105.19.61,web_dev,False
4,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:39,appendix,11,97.105.19.61,web_dev,False


### Questions to Explore...
- Q1: Which lesson appears to attract the most traffic consistently across cohorts (per program)?
- Q2: Is there a cohort that referred to a lesson significantly more than the other cohorts, which seemed to gloss over it?
- Q7: Which lessons are least accessed?
<hr>
- Q5: At some point in 2019, the ability for students and alumni to access both curriculums (web dev to ds, ds to web dev) should have been shut off. Do you see any evidence of that happening? Did it happen before?
- Q8: Anything else I should be aware of?

## Q1: Which lesson appears to attract the most traffic consistently across cohorts (per program)?
In order to answer this question I will start by exploring the values in the `path` column.

In [13]:
# How many log rows exist for each distinct path, most frequent first.
df['path'].value_counts()

/                                                 45854
javascript-i                                      18203
toc                                               17591
search/search_index.json                          17534
java-iii                                          13166
                                                  ...  
5-stats                                               1
joins                                                 1
10.00_Intro.html                                      1
content/examples/constructors-destructors.html        1
7-clustering/dbscan                                   1
Name: path, Length: 2224, dtype: int64

This gives me a pretty good idea of the lessons from each log.
#### Now I want to be able to look at `path` values across `program_name` . . .

In [28]:
# Quick two-row peek to recall which columns are available.
df.head(n=2)

Unnamed: 0,cohortid,start_date,end_date,cohortname,program_id,date,time,path,user_id,ip,program_name,is_staff
0,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:47,/,11,97.105.19.61,web_dev,False
1,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:51,javascript-i,11,97.105.19.61,web_dev,False


In [43]:
# Most-requested paths within the web_dev program (grouped by program, not by
# cohort); the bare root path '/' is excluded because it is not a lesson page.
is_web_dev_non_root = (df['path'] != '/') & (df['program_name'] == 'web_dev')
df.loc[is_web_dev_non_root, 'path'].value_counts().head(15)

javascript-i                                                                 18193
toc                                                                          17580
search/search_index.json                                                     15331
java-iii                                                                     13162
html-css                                                                     13111
java-ii                                                                      12173
spring                                                                       11877
jquery                                                                       11037
mysql                                                                        10602
java-i                                                                       10460
javascript-ii                                                                10290
appendix                                                                      8057
java

#### I am going to create a DataFrame of the top 10 `path` values and their counts for the web_dev program, and then concatenate it with the top 10 `path` value counts for data_science

In [63]:
# Top 10 web_dev paths (root '/' excluded) as a two-column frame:
#   wd_path       - the requested path
#   wd_path_count - number of log rows for that path
# rename_axis + reset_index(name=...) labels both columns explicitly, so the
# result does not depend on how value_counts() names its index and values.
# (Those defaults changed in pandas 2.0; renaming {'index': ..., 'path': ...}
# after a plain reset_index() mislabels the columns there.)
web_dev_paths = (
    df.loc[(df.path != '/') & (df.program_name == 'web_dev'), 'path']
    .value_counts()
    .head(10)
    .rename_axis('wd_path')
    .reset_index(name='wd_path_count')
)
web_dev_paths

Unnamed: 0,wd_path,wd_path_count
0,javascript-i,18193
1,toc,17580
2,search/search_index.json,15331
3,java-iii,13162
4,html-css,13111
5,java-ii,12173
6,spring,11877
7,jquery,11037
8,mysql,10602
9,java-i,10460


In [57]:
# Ten most-requested data_science paths and their counts, kept as a Series
# (path -> count); the bare root path '/' is left out.
is_data_science_non_root = (df['path'] != '/') & (df['program_name'] == 'data_science')
ds_paths = df.loc[is_data_science_non_root, 'path'].value_counts().head(10)
ds_paths

search/search_index.json                    2203
classification/overview                     1785
1-fundamentals/modern-data-scientist.jpg    1655
1-fundamentals/AI-ML-DL-timeline.jpg        1651
1-fundamentals/1.1-intro-to-data-science    1633
classification/scale_features_or_not.svg    1590
fundamentals/AI-ML-DL-timeline.jpg          1443
fundamentals/modern-data-scientist.jpg      1438
sql/mysql-overview                          1424
fundamentals/intro-to-data-science          1413
Name: path, dtype: int64

In [64]:
# Side-by-side ranking table: web_dev columns on the left, data_science on the
# right, aligned on the 0..9 rank index both frames share.
# The data_science Series is turned into a frame with explicitly named columns
# (ds_path, ds_path_count) via rename_axis + reset_index(name=...), which works
# regardless of the index/value names value_counts() assigns in the installed
# pandas version.
pd.concat(
    [
        web_dev_paths,
        ds_paths.rename_axis('ds_path').reset_index(name='ds_path_count'),
    ],
    axis=1,
)

Unnamed: 0,wd_path,wd_path_count,ds_path,ds_path_count
0,javascript-i,18193,search/search_index.json,2203
1,toc,17580,classification/overview,1785
2,search/search_index.json,15331,1-fundamentals/modern-data-scientist.jpg,1655
3,java-iii,13162,1-fundamentals/AI-ML-DL-timeline.jpg,1651
4,html-css,13111,1-fundamentals/1.1-intro-to-data-science,1633
5,java-ii,12173,classification/scale_features_or_not.svg,1590
6,spring,11877,fundamentals/AI-ML-DL-timeline.jpg,1443
7,jquery,11037,fundamentals/modern-data-scientist.jpg,1438
8,mysql,10602,sql/mysql-overview,1424
9,java-i,10460,fundamentals/intro-to-data-science,1413


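>## A1: `json` and `SQL` paths appear in the top 10 for both programs.
- `json` (`search/search_index.json`): #3 for web_dev, #1 for data_science. This path looks like a search-index request rather than a lesson, so confirm that before counting it as lesson traffic.
- `SQL`: #9 in both programs (`mysql` for `web_dev`, `sql/mysql-overview` for `data_science`)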
<br>

Need to think of a way to clean up the path data and visualize this...

## Q2: Is there a cohort that referred to a lesson significantly more than the other cohorts, which seemed to gloss over it?
To answer this question I will continue looking at the `path` values but will now look at them across `cohortname`

In [65]:
# Re-display the first five rows before switching the analysis to cohorts.
df.head(n=5)

Unnamed: 0,cohortid,start_date,end_date,cohortname,program_id,date,time,path,user_id,ip,program_name,is_staff
0,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:47,/,11,97.105.19.61,web_dev,False
1,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:51,javascript-i,11,97.105.19.61,web_dev,False
2,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:15:02,javascript-i/functions,11,97.105.19.61,web_dev,False
3,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:29,appendix/further-reading/javascript/hoisting.html,11,97.105.19.61,web_dev,False
4,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:39,appendix,11,97.105.19.61,web_dev,False


In [96]:
# Append one indicator column per cohort name to the right of the log columns,
# so per-cohort access can later be summed per path.
cohort_indicator_cols = pd.get_dummies(df['cohortname'])
prog_name = pd.concat([df, cohort_indicator_cols], axis=1)
prog_name.head()

Unnamed: 0,cohortid,start_date,end_date,cohortname,program_id,date,time,path,user_id,ip,program_name,is_staff,Andromeda,Apex,Apollo,Arches,Badlands,Bash,Bayes,Betelgeuse,Ceres,Curie,Darden,Deimos,Denali,Easley,Europa,Everglades,Florence,Fortuna,Franklin,Ganymede,Glacier,Hampton,Hyperion,Ike,Joshua,Jupiter,Kalypso,Kings,Lassen,Luna,Mammoth,Marco,Neptune,Niagara,Oberon,Olympic,Pinnacles,Quincy,Sequoia,Staff,Teddy,Ulysses,Voyageurs,Wrangell,Xanadu,Yosemite,Zion
0,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:47,/,11,97.105.19.61,web_dev,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:14:51,javascript-i,11,97.105.19.61,web_dev,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,10:15:02,javascript-i/functions,11,97.105.19.61,web_dev,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:29,appendix/further-reading/javascript/hoisting.html,11,97.105.19.61,web_dev,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,2014-02-04,2014-04-22,Arches,1,2018-01-26,11:46:39,appendix,11,97.105.19.61,web_dev,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [85]:
# Request counts per path over the whole data set, largest first.
df['path'].value_counts().sort_values(ascending=False)

/                                   45854
javascript-i                        18203
toc                                 17591
search/search_index.json            17534
java-iii                            13166
                                    ...  
spring/services                         1
student/202                             1
python/introduction-to-python.md        1
understand                              1
7-clustering/dbscan                     1
Name: path, Length: 2224, dtype: int64

In [97]:
# Number of log rows per path (counting non-null cohortname entries),
# ordered from the most- to the least-requested path.
(
    prog_name
    .groupby('path')[['cohortname']]
    .count()
    .sort_values('cohortname', ascending=False)
)

Unnamed: 0_level_0,cohortname
path,Unnamed: 1_level_1
/,45854
javascript-i,18203
toc,17591
search/search_index.json,17534
java-iii,13166
...,...
end_to_end_clustering,1
essential-methods/traversing,1
evaluate,1
examples/bootstrap/pizza.pn,1


In [122]:
# Path-by-cohort request matrix: build one indicator column per cohort, then
# sum those indicators within each path to get per-cohort hit counts.
cohort_indicators_per_row = pd.get_dummies(df['cohortname'])
pd.concat([df['path'], cohort_indicators_per_row], axis=1).groupby('path').sum()

Unnamed: 0_level_0,Andromeda,Apex,Apollo,Arches,Badlands,Bash,Bayes,Betelgeuse,Ceres,Curie,Darden,Deimos,Denali,Easley,Europa,Everglades,Florence,Fortuna,Franklin,Ganymede,Glacier,Hampton,Hyperion,Ike,Joshua,Jupiter,Kalypso,Kings,Lassen,Luna,Mammoth,Marco,Neptune,Niagara,Oberon,Olympic,Pinnacles,Quincy,Sequoia,Staff,Teddy,Ulysses,Voyageurs,Wrangell,Xanadu,Yosemite,Zion
path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
/,1174.0,1346.0,1.0,626.0,17.0,772.0,1967.0,955.0,1653.0,1712.0,2980.0,1467.0,1.0,1115.0,1295.0,1.0,584.0,2038.0,4.0,1727.0,51.0,210.0,1365.0,19.0,8.0,1696.0,939.0,219.0,125.0,491.0,33.0,699.0,472.0,37.0,131.0,249.0,149.0,151.0,630.0,6340.0,1828.0,1641.0,2101.0,1132.0,924.0,981.0,1798.0
content/php_ii/command-line,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,12.0,1.0,2.0,0.0,0.0,8.0,2.0,0.0,1.0,0.0,0.0,0.0
content/php_ii/control-structures-i,4.0,0.0,0.0,1.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,15.0,2.0,5.0,1.0,1.0,4.0,3.0,1.0,2.0,0.0,0.0,0.0
content/php_ii/command-line/io-streams.html,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,4.0,0.0,0.0,0.0,0.0,7.0,1.0,1.0,0.0,0.0,5.0,2.0,0.0,1.0,0.0,0.0,0.0
content/php_i,13.0,0.0,0.0,2.0,5.0,0.0,0.0,28.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,2.0,6.0,0.0,0.0,37.0,19.0,0.0,9.0,0.0,0.0,0.0,0.0,32.0,3.0,11.0,3.0,11.0,17.0,29.0,21.0,9.0,7.0,9.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RESTful-api,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Probability.md,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PreWork/PreWork/cli,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Pipeline_Demo,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,43.0,5.0,0.0,6.0,3.0,0.0,0.0,8.0


In [84]:
# Request counts per path over the whole data set, smallest first, to surface
# the least-accessed paths.
df['path'].value_counts().sort_values(ascending=True)

7-clustering/dbscan                     1
understand                              1
python/introduction-to-python.md        1
student/202                             1
spring/services                         1
                                    ...  
java-iii                            13166
search/search_index.json            17534
toc                                 17591
javascript-i                        18203
/                                   45854
Name: path, Length: 2224, dtype: int64