In [1]:
pip install jenkspy

Collecting jenkspy
  Downloading jenkspy-0.2.0-cp37-cp37m-win_amd64.whl (43 kB)
Installing collected packages: jenkspy
Successfully installed jenkspy-0.2.0
Note: you may need to restart the kernel to use updated packages.


## Using Jenks Natural Breaks to segment the `Time Spent` feature
### Jenks Natural Breaks minimizes the variance within each class and maximizes the variance between classes

In [174]:
import jenkspy
import pandas as pd

In [175]:
# Load the raw per-user article timings (path is relative to the notebook's folder).
userRatingsTable = pd.read_csv(filepath_or_buffer='../datasets/UserRatingData.csv')

In [176]:
# Peek at the first five rows of the freshly loaded table.
userRatingsTable.head(n=5)

Unnamed: 0,Article,Time Spent,User
0,https://www.google.com/search?q=toronto+raptor...,-14,cac374ab-932a-43db-9b8e-d4eb930adff4
1,https://www.google.com/search?q=larry+o%27bri...,-11,cac374ab-932a-43db-9b8e-d4eb930adff4
2,https://en.wikipedia.org/wiki/Larry_O%27Brien...,-6,cac374ab-932a-43db-9b8e-d4eb930adff4
3,https://en.wikipedia.org/wiki/Larry_O'Brien_C...,-11,cac374ab-932a-43db-9b8e-d4eb930adff4
4,chrome://new...,-34,cac374ab-932a-43db-9b8e-d4eb930adff4


In [179]:
# Cast "Time Spent" to float, then store its negation in a new column.
# The raw values previewed above are negative, so flipping the sign gives
# positive durations. (The conversion to int happens in the next cell, after
# missing values have been dropped.)
userRatingsTable["Time Spent"] = userRatingsTable["Time Spent"].astype("float64")
userRatingsTable["TimeSpent_positive"] = -userRatingsTable["Time Spent"]


In [180]:
# Drop every row that has a missing value in any column, then store the positive
# durations as integers (NaN cannot be cast to int, so the drop has to come first).
# Building the result with .assign() returns a new frame rather than writing a
# column into the output of dropna(), which is the pattern that can trigger
# pandas' SettingWithCopyWarning.
userRatingsTable = userRatingsTable.dropna().assign(
    TimeSpent_positive=lambda frame: frame["TimeSpent_positive"].astype(int)
)
# Show the resulting column types to confirm the cast.
userRatingsTable.dtypes

Article                object
Time Spent            float64
User                   object
TimeSpent_positive      int32
dtype: object

In [182]:
# Sanity check: a True result would mean the positive-duration column still
# contains missing values.
userRatingsTable["TimeSpent_positive"].isna().to_numpy().any()

False

In [183]:
# First five rows after cleaning, now including the TimeSpent_positive column.
userRatingsTable.head(n=5)

Unnamed: 0,Article,Time Spent,User,TimeSpent_positive
0,https://www.google.com/search?q=toronto+raptor...,-14.0,cac374ab-932a-43db-9b8e-d4eb930adff4,14
1,https://www.google.com/search?q=larry+o%27bri...,-11.0,cac374ab-932a-43db-9b8e-d4eb930adff4,11
2,https://en.wikipedia.org/wiki/Larry_O%27Brien...,-6.0,cac374ab-932a-43db-9b8e-d4eb930adff4,6
3,https://en.wikipedia.org/wiki/Larry_O'Brien_C...,-11.0,cac374ab-932a-43db-9b8e-d4eb930adff4,11
4,chrome://new...,-34.0,cac374ab-932a-43db-9b8e-d4eb930adff4,34


In [184]:
# Order the rows from the shortest to the longest time spent.
userRatingsTable = userRatingsTable.sort_values("TimeSpent_positive", ascending=True)

In [185]:
# First five rows after sorting, i.e. the smallest time-spent values.
userRatingsTable.head(n=5)

Unnamed: 0,Article,Time Spent,User,TimeSpent_positive
126,https://en.wikipedia.org/wiki/Joe_B...,0.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,0
136,https://en.wikipedia.org/wiki/Joe_B...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1
133,https://en.wikipedia.org/wiki/Donald_T...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1
129,https://en.wikipedia.org/wiki/Barack_O...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1
87,https://en.wikipedia.org/wiki/A...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1


### Implement Jenks Natural Breaks

In [186]:
# Ask jenkspy for the natural break points that split the positive durations
# into three classes; the result is a list of four boundary values
# (lower bound, two inner cut points, upper bound).
breaks = jenkspy.jenks_breaks(
    userRatingsTable["TimeSpent_positive"],
    nb_class=3,
)
print(breaks)

[0.0, 104.0, 296.0, 546.0]


In [187]:
# Bucket every duration into one of the three Jenks classes and label it.
# include_lowest=True closes the first interval on the left, so rows equal to
# the smallest break value are labelled 'low' instead of becoming NaN.
userRatingsTable["TimeSpent_Ratings"] = pd.cut(
    userRatingsTable["TimeSpent_positive"],
    bins=breaks,
    include_lowest=True,
    labels=['low', 'med', 'high'],
)

In [171]:
# Let pandas render up to 600 rows so the labelled table can be inspected in full.
pd.options.display.max_rows = 600

In [189]:
# Read the display limit back to confirm which value is in effect.
pd.get_option("display.max_rows")

600

In [190]:
# Display up to the first 600 rows (in sorted order) to eyeball the assigned ratings.
userRatingsTable.iloc[:600]

Unnamed: 0,Article,Time Spent,User,TimeSpent_positive,TimeSpent_Ratings
126,https://en.wikipedia.org/wiki/Joe_B...,0.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,0,low
136,https://en.wikipedia.org/wiki/Joe_B...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
133,https://en.wikipedia.org/wiki/Donald_T...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
129,https://en.wikipedia.org/wiki/Barack_O...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
87,https://en.wikipedia.org/wiki/A...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
123,https://en.wikipedia.org/wiki/Donald_T...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
121,https://en.wikipedia.org/wiki/Barack_O...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
119,https://en.wikipedia.org/wiki/Pucca_hou...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
115,https://en.wikipedia.org/wiki/2018_Pan_Americ...,-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low
109,"https://en.wikipedia.org/wiki/Wygnanka,_Radzy...",-1.0,10d64c24-18fa-49a7-bd0b-ced0e5995660,1,low


### next considerations
#### Instead of the low/med/high labels, assign the ratings 1, 2, 3 so that they can be converted to int and used as ratings for matrix factorization
#### The dataset is heavily skewed right now because we haven't actually been reading and scrolling through articles