# Statistics for Entity Project Page Views

In [1]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
# Load the per-page view table from a headerless, tab-separated file.
# With header = FALSE the columns get R's default V1..Vn names.
entity_project_page <- read.table(
  file = "../results/entity_project_page_views.tsv",
  header = FALSE,
  sep = "\t"
)

In [3]:
# Load the per-entity view table (headerless, tab-separated).
entity_views <- read.table(
  file = "../results/entity_views.tsv",
  header = FALSE,
  sep = "\t"
)

In [4]:
# Bind the loaded table to the name used by the rest of the analysis.
entity_project_page_views <- entity_project_page

## Correlation between Usages and Views

In [5]:
# Give the four columns descriptive names.
names(entity_project_page_views) <- c(
  "entity_id",
  "project",
  "page_id",
  "page_views"
)

In [6]:
# Per-column overview of the page-level table.
summary(object = entity_project_page_views)

     entity_id                project            page_id        
 Q54919   :  2451812   commonswiki:33551149   Min.   :       0  
 Q423048  :  1968766   ruwiki     :24499918   1st Qu.:  662198  
 Q2597810 :  1937505   zhwiki     : 8302855   Median : 2578039  
 Q131454  :  1924192   enwiki     : 7988037   Mean   : 8250467  
 Q13219454:  1924125   svwiki     : 5961984   3rd Qu.: 6896287  
 Q36578   :  1917354   frwiki     : 5685580   Max.   :58444341  
 (Other)  :148679703   (Other)    :74813934                     
   page_views       
 Min.   :0.000e+00  
 1st Qu.:5.000e+00  
 Median :5.700e+01  
 Mean   :4.160e+03  
 3rd Qu.:5.160e+02  
 Max.   :8.888e+09  
                    

In [7]:
# Number of rows (page usages) per entity_id.
page_frequency_by_entity <- dplyr::count(
  entity_project_page_views,
  entity_id
)

In [8]:
# Rename the count column to page_usages.
names(page_frequency_by_entity) <- c("entity_id", "page_usages")

In [9]:
# Distribution of usage counts across entities.
summary(object = page_frequency_by_entity)

   entity_id         page_usages       
 P1     :       1   Min.   :      1.0  
 P10    :       1   1st Qu.:      1.0  
 P100   :       1   Median :      1.0  
 P1000  :       1   Mean   :      7.2  
 P10000 :       1   3rd Qu.:      3.0  
 P1001  :       1   Max.   :2451812.0  
 (Other):22250015                      

In [10]:
# Name the two columns of the per-entity view table.
names(entity_views) <- c("entity_id", "page_views")

In [11]:
# Join per-entity views with per-entity usage counts on entity_id.
# merge() keeps only ids present in both inputs by default.
entity_views_usage_frequency <- merge(
  x = entity_views,
  y = page_frequency_by_entity,
  by = "entity_id"
)

In [12]:
# Overview of the joined views/usages table.
summary(object = entity_views_usage_frequency)

   entity_id          page_views         page_usages       
 P1     :       1   Min.   :0.000e+00   Min.   :      1.0  
 P10    :       1   1st Qu.:1.300e+01   1st Qu.:      1.0  
 P100   :       1   Median :1.360e+02   Median :      1.0  
 P1000  :       1   Mean   :3.006e+04   Mean   :      7.2  
 P10000 :       1   3rd Qu.:9.970e+02   3rd Qu.:      3.0  
 P1001  :       1   Max.   :1.253e+10   Max.   :2451812.0  
 (Other):22250015                                          

In [13]:
# The merged frame already carries these names; re-assigning them only
# makes the expected column layout explicit.
names(entity_views_usage_frequency) <- c(
  "entity_id",
  "page_views",
  "page_usages"
)

In [14]:
# Show the first rows of the joined table.
head(x = entity_views_usage_frequency)

entity_id,page_views,page_usages
P1,1345,11
P10,26612,424
P100,1727,9
P1000,45259,287
P10000,331,2
P1001,6911712,1006


In [15]:
# Spearman rank correlation between entity-level page_views and page_usages.
with(
  entity_views_usage_frequency,
  cor(page_views, page_usages, method = "spearman")
)

In [16]:
# Simple linear regression of page_views on page_usages.
# The `$` references are kept so the term labels in the fitted model stay
# the same.
page_view_and_usage_model <- lm(
  formula = entity_views_usage_frequency$page_views ~
    entity_views_usage_frequency$page_usages
)

In [17]:
# Coefficients, residuals and fit statistics of the regression.
summary(object = page_view_and_usage_model)


Call:
lm(formula = entity_views_usage_frequency$page_views ~ entity_views_usage_frequency$page_usages)

Residuals:
       Min         1Q     Median         3Q        Max 
-3.983e+09 -1.900e+04 -1.688e+04 -1.656e+04  1.253e+10 

Coefficients:
                                          Estimate Std. Error t value Pr(>|t|)
(Intercept)                              14768.502   1189.421   12.42   <2e-16
entity_views_usage_frequency$page_usages  2116.537      0.803 2635.88   <2e-16
                                            
(Intercept)                              ***
entity_views_usage_frequency$page_usages ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5610000 on 22250019 degrees of freedom
Multiple R-squared:  0.238,	Adjusted R-squared:  0.238 
F-statistic: 6.948e+06 on 1 and 22250019 DF,  p-value: < 2.2e-16


In [18]:
# Mean page_views per entity_id over all of its rows.
# The result columns are named Group.1 (the id) and x (the mean).
entity_view_means <- aggregate(
  x = entity_project_page_views$page_views,
  by = list(entity_project_page_views$entity_id),
  FUN = mean
)

In [19]:
# Distribution of the per-entity view means.
summary(object = entity_view_means)

    Group.1               x           
 P1     :       1   Min.   :       0  
 P10    :       1   1st Qu.:       9  
 P100   :       1   Median :      83  
 P1000  :       1   Mean   :    1527  
 P10000 :       1   3rd Qu.:     560  
 P1001  :       1   Max.   :20441060  
 (Other):22250015                     

In [20]:
# Replace aggregate()'s default Group.1 / x names.
names(entity_view_means) <- c("entity_id", "page_view_mean")

In [21]:
# Join usage counts with per-entity view means on entity_id.
entity_view_means_and_usages <- merge(
  x = page_frequency_by_entity,
  y = entity_view_means,
  by = "entity_id"
)

In [22]:
# Drop the usage-count table; its contents now live in the merged frame.
rm(list = "page_frequency_by_entity")

In [23]:
# Drop the view-mean table; its contents now live in the merged frame.
rm(list = "entity_view_means")

In [24]:
# Show the first rows of the usages/view-mean table.
head(x = entity_view_means_and_usages)

entity_id,page_usages,page_view_mean
P1,11,122.27273
P10,424,62.76415
P100,9,191.88889
P1000,287,157.69686
P10000,2,165.5
P1001,1006,6870.48907


In [25]:
# Spearman rank correlation between page_usages and page_view_mean.
with(
  entity_view_means_and_usages,
  cor(page_usages, page_view_mean, method = "spearman")
)

In [26]:
# Simple linear regression of page_usages on page_view_mean.
# The `$` references are kept so the term labels in the fitted model stay
# the same.
entity_view_means_and_usages_model <- lm(
  formula = entity_view_means_and_usages$page_usages ~
    entity_view_means_and_usages$page_view_mean
)

In [27]:
# Coefficients, residuals and fit statistics of the regression.
summary(object = entity_view_means_and_usages_model)


Call:
lm(formula = entity_view_means_and_usages$page_usages ~ entity_view_means_and_usages$page_view_mean)

Residuals:
    Min      1Q  Median      3Q     Max 
  -2585      -6      -6      -4 2451805 

Coefficients:
                                             Estimate Std. Error t value
(Intercept)                                 6.989e+00  3.170e-01  22.045
entity_view_means_and_usages$page_view_mean 1.561e-04  2.844e-05   5.488
                                            Pr(>|t|)    
(Intercept)                                  < 2e-16 ***
entity_view_means_and_usages$page_view_mean 4.06e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1481 on 22250019 degrees of freedom
Multiple R-squared:  1.354e-06,	Adjusted R-squared:  1.309e-06 
F-statistic: 30.12 on 1 and 22250019 DF,  p-value: 4.06e-08


### Sorted by highest view means

In [28]:
# Order entities from highest to lowest page_view_mean.
sorted_by_highest_view_means <- dplyr::arrange(
  entity_view_means_and_usages,
  dplyr::desc(page_view_mean)
)

In [29]:
# Ten entities with the highest view means.
head(x = sorted_by_highest_view_means, n = 10L)

entity_id,page_usages,page_view_mean
Q5296,613,20441060
Q4652632,5,8667144
Q8042151,5,8018726
Q22713189,1,4983678
Q26769076,2,4659126
Q25136249,3,4328533
Q5589097,1,4306323
Q25999140,2,3983256
Q3546230,1,3974205
Q26832572,1,3847138


### Entities with a view mean of zero, sorted by page usages

In [30]:
# Keep only entities whose mean page views is exactly zero.
view_means_of_zero <- dplyr::filter(
  entity_view_means_and_usages,
  page_view_mean == 0
)

In [31]:
# Among the zero-view entities, put the most-used ones first.
view_means_of_zero_sorted_by_page_usages <- dplyr::arrange(
  view_means_of_zero,
  dplyr::desc(page_usages)
)

In [32]:
# Ten most-used entities that have a view mean of zero.
head(x = view_means_of_zero_sorted_by_page_usages, n = 10L)

entity_id,page_usages,page_view_mean
Q14334317,6,0
Q14960107,6,0
Q14963100,6,0
Q14964406,6,0
Q14967290,6,0
Q14967382,6,0
Q14969075,6,0
Q14970322,6,0
Q14970334,6,0
Q14970538,6,0


## Entities used once versus more than once 

In [33]:
# Entities that are used on exactly one page.
used_once <- dplyr::filter(
  entity_view_means_and_usages,
  page_usages == 1
)

In [34]:
# Entities that are used on two or more pages.
# The threshold must be strictly greater than 1: every entity has
# page_usages >= 1, so the previous `> 0` test selected the whole table
# (including the used-once entities) instead of the complementary group.
used_more_than_once <- dplyr::filter(
  entity_view_means_and_usages,
  page_usages > 1
)

In [35]:
# Average of page_view_mean over the `used_once` subset.
with(used_once, mean(page_view_mean))

In [36]:
# Average of page_view_mean over the `used_more_than_once` subset.
with(used_more_than_once, mean(page_view_mean))