In [None]:
import urllib.request
import gzip

In [1]:
# Local path of the decompressed pageviews dump; written by the download
# cell and read back by Spark.
localfile = "pageviews-20190331-120000"

In [None]:
url = 'https://dumps.wikimedia.org/other/pageviews/2019/2019-03/pageviews-20190331-120000.gz'

# Download the gzip-compressed dump and write it decompressed to `localfile`.
# The HTTP response is context-managed so the connection is always closed, and
# the payload is decompressed in 1 MiB chunks so neither the compressed nor the
# decompressed data has to fit in memory at once.
# NOTE: if the transfer fails midway, a partial `localfile` is left on disk;
# re-run this cell to overwrite it.
with urllib.request.urlopen(url) as r:
    with gzip.GzipFile(fileobj=r, mode='rb') as gz, open(localfile, 'wb') as f:
        for chunk in iter(lambda: gz.read(1024 * 1024), b''):
            f.write(chunk)

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField, IntegerType

In [3]:
# Obtain the Spark session: reuse the active one if it exists, otherwise
# create a new one with default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Explicit schema for the space-separated dump: one row per (code, title) pair.
# 'nbrequest' holds the view count used for ranking below.
# NOTE(review): 'total' presumably maps to the dump's response-size field,
# which is 0 in every row shown in this notebook; confirm against the
# Wikimedia format description.
schema = StructType([
    StructField('code', StringType(), True),
    StructField('title', StringType(), True),
    StructField('nbrequest', IntegerType(), True),
    StructField('total', IntegerType(), True),
])

# The dump has no header row and no quoting convention. Spark's CSV reader
# treats '"' as a quote character by default, which can mis-parse titles that
# contain double quotes, so quoting is disabled with an empty `quote` string.
df = spark.read.csv(
    localfile,
    sep=' ',
    header=False,
    schema=schema,
    quote='',
)


In [5]:
# Expose the DataFrame to Spark SQL under the table name "wikipedia".
df.createOrReplaceTempView(name="wikipedia")

In [14]:
# Show the 10 most viewed entries across all codes.
by_views_desc = df.orderBy(df['nbrequest'].desc())
by_views_desc.show(n=10)

+------+--------------------+---------+-----+
|  code|               title|nbrequest|total|
+------+--------------------+---------+-----+
|    en|           Main_Page|   643391|    0|
|  en.m|           Main_Page|    94291|    0|
|www.wd|                   -|    50556|    0|
|    en|      Special:Search|    45840|    0|
|    ru|Borderlands:_The_...|    33236|    0|
|    de|Wikipedia:Hauptseite|    29461|    0|
|    en|                   -|    26240|    0|
|  en.m|      Special:Search|    24198|    0|
|www.wd|   Special:BlankPage|    21642|    0|
|  de.m|Wikipedia:Hauptseite|    21173|    0|
+------+--------------------+---------+-----+
only showing top 10 rows



In [12]:
# Show the 10 most viewed French entries.
# The code must be exactly "fr" or "fr." followed by a suffix (e.g. "fr.m").
# A bare '^fr' prefix match would also select any other code that merely
# starts with "fr" (e.g. "frp", "frr").
spark.sql(
    "SELECT * FROM WIKIPEDIA "
    "WHERE CODE RLIKE '^fr([.]|$)' "
    "ORDER BY NBREQUEST DESC"
).show(10)

+----+--------------------+---------+-----+
|code|               title|nbrequest|total|
+----+--------------------+---------+-----+
|  fr|Wikipédia:Accueil...|    11597|    0|
|fr.m|Wikipédia:Accueil...|     9733|    0|
|fr.m|      Jacques_Chirac|     6057|    0|
|  fr|   Spécial:Recherche|     4460|    0|
|fr.m|   Bernadette_Chirac|     3576|    0|
|fr.m|                   -|     3090|    0|
|fr.m|       Claude_Chirac|     3065|    0|
|fr.m|   Spécial:Recherche|     2409|    0|
|fr.m| Plateau_des_Glières|     1761|    0|
|fr.m|         Edgar_Morin|     1381|    0|
+----+--------------------+---------+-----+
only showing top 10 rows



In [17]:
# Show the 10 most viewed French entries, excluding titles that contain the
# special character ':'.
# The code must be exactly "fr" or "fr." followed by a suffix (e.g. "fr.m").
# A bare '^fr' prefix match would also select any other code that merely
# starts with "fr" (e.g. "frp", "frr").
spark.sql(
    "SELECT * FROM WIKIPEDIA "
    "WHERE CODE RLIKE '^fr([.]|$)' "
    "AND NOT TITLE LIKE '%:%' "
    "ORDER BY NBREQUEST DESC"
).show(10)

+----+--------------------+---------+-----+
|code|               title|nbrequest|total|
+----+--------------------+---------+-----+
|fr.m|      Jacques_Chirac|     6057|    0|
|fr.m|   Bernadette_Chirac|     3576|    0|
|fr.m|                   -|     3090|    0|
|fr.m|       Claude_Chirac|     3065|    0|
|fr.m| Plateau_des_Glières|     1761|    0|
|fr.m|         Edgar_Morin|     1381|    0|
|fr.m|         Agnès_Varda|     1277|    0|
|fr.m|Charles_Leclerc_(...|     1166|    0|
|fr.m|         Heure_d'été|     1121|    0|
|fr.m|     Bonnie_et_Clyde|     1092|    0|
+----+--------------------+---------+-----+
only showing top 10 rows

