# Statistics for Entity Project Page Views

In [1]:
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
# Per-(entity, project, page) view counts; the TSV has no header row.
entity_project_page <- read.table(
  file = "../results/entity_project_page_views.tsv",
  header = FALSE,
  sep = "\t"
)

In [3]:
# Per-entity view counts; also a headerless TSV.
entity_views <- read.table(
  file = "../results/entity_views.tsv",
  header = FALSE,
  sep = "\t"
)

In [4]:
# Continue under a more descriptive name; the original binding stays in place.
entity_project_page_views <- entity_project_page

## Correlation between Usages and Views

In [5]:
# Label the four unnamed columns read from the headerless TSV.
names(entity_project_page_views) <- c(
  "entity_id",
  "project",
  "page_id",
  "page_views"
)

In [6]:
# Per-column overview of the labelled table.
summary(entity_project_page_views)

     entity_id                project            page_id        
 Q54919   :  2451812   commonswiki:33551149   Min.   :       0  
 Q423048  :  1968766   ruwiki     :24499918   1st Qu.:  662198  
 Q2597810 :  1937505   zhwiki     : 8302855   Median : 2578039  
 Q131454  :  1924192   enwiki     : 7988037   Mean   : 8250467  
 Q13219454:  1924125   svwiki     : 5961984   3rd Qu.: 6896287  
 Q36578   :  1917354   frwiki     : 5685580   Max.   :58444341  
 (Other)  :148679703   (Other)    :74813934                     
   page_views       
 Min.   :0.000e+00  
 1st Qu.:5.000e+00  
 Median :5.700e+01  
 Mean   :4.160e+03  
 3rd Qu.:5.160e+02  
 Max.   :8.888e+09  
                    

In [7]:
# One row per entity_id with the number of rows it has in
# entity_project_page_views.
page_frequency_by_entity <- dplyr::count(entity_project_page_views, entity_id)

In [8]:
# Give the count column a descriptive name.
names(page_frequency_by_entity) <- c("entity_id", "page_usages")

In [9]:
summary(page_frequency_by_entity)

   entity_id         page_usages       
 P1     :       1   Min.   :      1.0  
 P10    :       1   1st Qu.:      1.0  
 P100   :       1   Median :      1.0  
 P1000  :       1   Mean   :      7.2  
 P10000 :       1   3rd Qu.:      3.0  
 P1001  :       1   Max.   :2451812.0  
 (Other):22250015                      

In [10]:
# Label the two unnamed columns of the per-entity views table.
names(entity_views) <- c("entity_id", "page_views")

In [11]:
# Join views and usage counts on entity_id (merge's default keeps only
# ids present in both tables).
entity_views_usage_frequency <- merge(
  x = entity_views,
  y = page_frequency_by_entity,
  by = "entity_id"
)

In [12]:
summary(entity_views_usage_frequency)

   entity_id          page_views         page_usages       
 P1     :       1   Min.   :0.000e+00   Min.   :      1.0  
 P10    :       1   1st Qu.:1.300e+01   1st Qu.:      1.0  
 P100   :       1   Median :1.360e+02   Median :      1.0  
 P1000  :       1   Mean   :3.006e+04   Mean   :      7.2  
 P10000 :       1   3rd Qu.:9.970e+02   3rd Qu.:      3.0  
 P1001  :       1   Max.   :1.253e+10   Max.   :2451812.0  
 (Other):22250015                                          

In [13]:
# The merged table already carries these names (see the summary output
# above); this assignment only makes them explicit.
names(entity_views_usage_frequency) <- c(
  "entity_id",
  "page_views",
  "page_usages"
)

In [14]:
# Peek at the first rows of the merged table.
head(entity_views_usage_frequency)

entity_id,page_views,page_usages
P1,1345,11
P10,26612,424
P100,1727,9
P1000,45259,287
P10000,331,2
P1001,6911712,1006


In [15]:
# Spearman (rank-based) correlation between per-entity page views and
# page usages.
cor(entity_views_usage_frequency$page_views,entity_views_usage_frequency$page_usages, method="spearman")

In [16]:
# Linear model with page_views as the response and page_usages as the
# single predictor. The columns are referenced with `$` inside the formula,
# which is why the coefficient label in the summary output is the full
# `entity_views_usage_frequency$page_usages` expression.
page_view_and_usage_model <- lm(entity_views_usage_frequency$page_views ~ entity_views_usage_frequency$page_usages)

In [17]:
# Print coefficients, residual quantiles and R-squared of the fit.
summary(page_view_and_usage_model)


Call:
lm(formula = entity_views_usage_frequency$page_views ~ entity_views_usage_frequency$page_usages)

Residuals:
       Min         1Q     Median         3Q        Max 
-3.983e+09 -1.900e+04 -1.688e+04 -1.656e+04  1.253e+10 

Coefficients:
                                          Estimate Std. Error t value Pr(>|t|)
(Intercept)                              14768.502   1189.421   12.42   <2e-16
entity_views_usage_frequency$page_usages  2116.537      0.803 2635.88   <2e-16
                                            
(Intercept)                              ***
entity_views_usage_frequency$page_usages ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5610000 on 22250019 degrees of freedom
Multiple R-squared:  0.238,	Adjusted R-squared:  0.238 
F-statistic: 6.948e+06 on 1 and 22250019 DF,  p-value: < 2.2e-16


In [None]:
# Mean page_views per entity_id over all rows of entity_project_page_views.
entity_view_means <- aggregate(
  x = entity_project_page_views$page_views,
  by = list(entity_project_page_views$entity_id),
  FUN = mean
)

In [None]:
# Summarised before renaming, so the columns still carry aggregate()'s
# default names here.
summary(entity_view_means)

In [None]:
names(entity_view_means) <- c("entity_id", "page_view_mean")

In [None]:
# Combine usage counts and mean views per entity.
entity_view_means_and_usages <- merge(
  x = page_frequency_by_entity,
  y = entity_view_means,
  by = "entity_id"
)

In [None]:
# Both inputs are now contained in the merged table; drop them
# (presumably to free memory).
rm(page_frequency_by_entity)

In [None]:
rm(entity_view_means)

In [None]:
head(entity_view_means_and_usages)

In [None]:
# Spearman (rank-based) correlation between usage count and mean page views
# per entity.
cor(
  entity_view_means_and_usages$page_usages,
  entity_view_means_and_usages$page_view_mean,
  method = "spearman"
)

In [None]:
# Regress page_usages (response) on page_view_mean (predictor).
# The data frame is passed via `data =` instead of writing `df$col` inside the
# formula: the fit is the same, but coefficient labels stay short and
# predict() can be given `newdata`.
entity_view_means_and_usages_model <- lm(
  page_usages ~ page_view_mean,
  data = entity_view_means_and_usages
)

In [None]:
# Print coefficients, residual quantiles and R-squared of the fit.
summary(entity_view_means_and_usages_model)

### Sorted by highest view means

In [None]:
# Order entities from the highest mean page views downwards.
sorted_by_highest_view_means <- dplyr::arrange(
  entity_view_means_and_usages,
  desc(page_view_mean)
)

In [None]:
# Top ten entities by mean page views.
head(sorted_by_highest_view_means, n = 10)

### Lowest view means: entities with a view mean of zero, sorted by usages

In [None]:
# Keep only entities whose mean page views is exactly zero.
view_means_of_zero <- dplyr::filter(
  entity_view_means_and_usages,
  page_view_mean == 0
)

In [None]:
# Most-used entities first.
view_means_of_zero_sorted_by_page_usages <- dplyr::arrange(
  view_means_of_zero,
  desc(page_usages)
)

In [None]:
# Ten most-used entities among those with zero mean views.
head(view_means_of_zero_sorted_by_page_usages, n = 10)

## Entities used once versus more than once 

In [None]:
# Entities that are used on exactly one page.
used_once <- dplyr::filter(entity_view_means_and_usages, page_usages == 1)

In [None]:
# Entities that are used on two or more pages. The threshold has to be `> 1`:
# `> 0` matches every entity (page_usages has a minimum of 1), so the group
# would include the used-once entities and the comparison below would be
# meaningless.
used_more_than_once <- dplyr::filter(entity_view_means_and_usages, page_usages > 1)

In [None]:
# Average of the per-entity mean page views for entities used once.
mean(used_once$page_view_mean)

In [None]:
# Same average for entities used more than once.
mean(used_more_than_once$page_view_mean)