In [1]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.15.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.15.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

True

# Connexion à la base de données Mongo 

In [3]:
from pyspark.sql import SparkSession

spark = (SparkSession
    .builder
    .appName("myApp2") 
    .config("spark.mongodb.input.uri", os.getenv('MONGO_URL')) 
    .config("spark.mongodb.input.collection", 'data')  
    .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate())

In [4]:
df = spark.read.format("mongo").load()

## Nombre total de derniers articles

In [5]:
df.count()

630

## Les 10 derniers articles

In [6]:
latest_articles = df.take(10)
latest_articles

[Row(_id=Row(oid='601c95daff704249dce5ff76'), author='Catherine Garcia', category=['general'], description='The House voted on Thursday to strip Rep. Marjorie Taylor Greene (R-Ga.) of her committee posts, over inflammatory and violent remarks she has made and supported.The vote was 230-199, with 11 Republicans joining Democrats to kick Greene off the Education and Labor and Budget committees. Greene has ...', id='b1e1657e-30a4-43a1-b425-c6f24b9332d6', image='None', language='en', origin='theweek.com', published=datetime.datetime(2021, 2, 5, 0, 18, 26), title='House votes to eject Marjorie Taylor Greene from committees', url='https://theweek.com/speedreads/965147/house-votes-eject-marjorie-taylor-greene-from-committees'),
 Row(_id=Row(oid='601c95daff704249dce5ff77'), author='Kenneth C. Crowe II', category=['regional', 'albany', 'new-york'], description='COHOES  Delivering his second state of the city address virtually Thursday, Mayor Bill...', id='cc17ec6c-3af9-4ba0-84b7-48361ddbc0fb', 

In [27]:
df.select('image').take(10)[1].image

'https://s.hdnux.com/photos/01/12/31/54/19503916/3/rawImage.jpg'

## Les auteurs avec leurs catégories

In [7]:
spark.catalog.dropTempView("article_table")
df.createTempView("article_table")
spark.sql("SELECT DISTINCT(author), category FROM article_table").collect()

[Row(author='@hotairblog', category=['politics']),
 Row(author='Valarie Honeycutt Spears', category=['kentucky']),
 Row(author='Leah Romero', category=['general']),
 Row(author='Alexander Popov', category=['programming']),
 Row(author='<a href="http://arxiv.org/find/stat/1/au:+Wang_T/0/1/0/all/0/1">Tianyu Wang</a>, <a href="http://arxiv.org/find/stat/1/au:+Morucci_M/0/1/0/all/0/1">Marco Morucci</a>, <a href="http://arxiv.org/find/stat/1/au:+Awan_M/0/1/0/all/0/1">M. Usaid Awan</a>, <a href="http://arxiv.org/find/stat/1/au:+Liu_Y/0/', category=['academic', 'CS', 'DB']),
 Row(author='Adam Miller', category=['regional']),
 Row(author='[{"@type":"Person","name":"Kit Stone","url":"https://www.buzzfeed.com/bykitstone","jobTitle":"BuzzFeed Staff"}]', category=['lifestyle']),
 Row(author='Tyler Sonnemaker', category=['business']),
 Row(author='Dustin Jones', category=['health', 'lifestyle']),
 Row(author='Jessica Yun', category=['regional']),
 Row(author='HAVEN DALEY and JOHN ROGERS', category=

## Les articles par auteur

In [8]:
spark.catalog.dropTempView("article_table")
df.createTempView("article_table")
spark.sql("SELECT author, title, description FROM article_table GROUP BY author, title, description").collect()

[Row(author='Globe Newswire', title='JOFF Fintech Acquisition Corp. Announces Pricing of upsized $360,000,000 Initial Public Offering', description='New York, NY, Feb. 04, 2021 (GLOBE NEWSWIRE) -- JOFF Fintech Acquisition Corp. (the "Company") today announced the pricing of its upsized initial public offering of 36,000,000 units at$10.00per unit. The units will be listed on the Nasdaq Stock Market and trade under the ticker symbol "JOFFU" begi...'),
 Row(author='ASCD', title='3 Strategies for Promoting Deep Learning Virtually', description='Matthew Perini, Harvey Silver, and Jay McTighe Learning is learning, whether it occurs in a classroom, at a library, or within a virtual environment. But regardless of the venue, learning can vary from superficial to substantive. Many teachers across the country are looking for ways to make online a...'),
 Row(author='Mark Freund', title='Drake’s Murphy Twins In Different Roles, But Still Inseparable', description='DES MOINES  The undefeated Drake B

In [9]:
spark.catalog.dropTempView("article_table")
df.createTempView("article_table")
spark.sql("SELECT author,COUNT(*) as articles FROM article_table GROUP BY author").show()

+--------------------+--------+
|              author|articles|
+--------------------+--------+
|          Josh Moore|       1|
|              sfgate|       1|
|            Zou Shuo|       1|
|               P J P|       2|
|      Business staff|       2|
|        Robert Lemos|       1|
|YURI KAGEYAMA, AP...|       1|
|Anchorage Daily News|       1|
|                 bbc|       4|
|     @womensweeklysg|       3|
|       Stephen Losey|       1|
|  catholicnewsagency|       4|
|   Joshua Clipperton|       1|
|          Parth M.N.|       1|
|      Beth Shilliday|       1|
|          @malaymail|       1|
|           @BrianCoz|       2|
|         Henry, Marc|       1|
|     Stephen Kruiser|       1|
|     Ashleigh Tullis|       1|
+--------------------+--------+
only showing top 20 rows



## Les articles publiés le jour d’un événement particulier

In [10]:
df.filter(df.published == '2021-02-03 21:57:22 +0000').collect()

[]