In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import time
from datetime import datetime
import pandas as pd
import numpy as np

In [2]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [3]:
raw_html = simple_get("https://forums-enseignants-du-primaire.com/forum/2-les-domaines-d39activit%C3%A9-%C3%A0-l39%C3%A9cole-%C3%A9l%C3%A9mentaire/")
html = BeautifulSoup(raw_html, 'html.parser')

forums = []
for h4 in html.select('h4'):
    #identifying forums
    if h4['class'] == ['ipsDataItem_title','ipsType_large','ipsType_break']:
        for a in h4.select("a"):
            forums.append((a['href'],a.text))

In [4]:
startTime = datetime.now()
print(startTime)

#All pages with topics per forum
pages_with_topics = set()
for (forum_link, forum_name) in forums:
    forum_html_raw = simple_get(forum_link)
    forum_html = BeautifulSoup(forum_html_raw)
    for div in forum_html.findAll('div', class_ = ['ipsButtonBar', 
                                                   'ipsPad_half', 'ipsClearfix_ipsClear']):
        for li in div.findAll('li', class_ = "ipsPagination_last"):
            for a in li.select("a"):
                #identifying last page
                last_page = a['data-page']
                url_last_page = a["href"]
                url_topic = url_last_page.split("page=")[0]
                last_page_nb = int(url_last_page.split("page=")[1])
                for x in map(str,list(range(1,last_page_nb))):
                    pages_with_topics.add((forum_name,url_topic + "page=" + x))

2019-01-16 17:21:36.676428


In [5]:
len(pages_with_topics)

1515

In [6]:
startTime = datetime.now()
print(startTime)

topics = []
for (forum_name, forum_page) in pages_with_topics:
    time.sleep(0.2)
    forum_html_raw = simple_get(forum_page)
    forum_html = BeautifulSoup(forum_html_raw)
    for li in forum_html.findAll('li', class_ = ["ipsDataItem","ipsDataItem_responsivePhoto"]):
        a = li.findAll("a",class_="")[0]
        title = a["title"]
        href = a["href"]
        pages = li.findAll('li', class_ = ["ipsPagination_page"])
        if(len(pages) == 0):
            nb_pages = '0'
        else:
            last_page = li.findAll('li', class_ = ["ipsPagination_last"])
            if(len(last_page) == 0):
                nb_pages = pages[len(pages)-1].text
            else:
                nb_pages = last_page[0].text
        for div in li.findAll("div", class_ = ["ipsDataItem_meta","ipsType_light","ipsType_blendLinks"]):
            for span in div.select("span"):
                posted_by = span.text.replace('\n', '').replace('\t', '')
            for time_date in div.select("time"):
                date_posted = time_date["datetime"]
        for ul in li.findAll("ul", class_ = ["ipsDataItem_stats"]):
            stats_topic = ul.text.replace("\n","")
        topics.append([forum_name,title,href,nb_pages,posted_by,date_posted,stats_topic])
        
print(datetime.now() - startTime)

2019-01-16 17:21:44.064684
0:23:34.368897


In [8]:
len(topics)

60600

In [10]:
df = pd.DataFrame(topics,columns = ["forum_name","topic_name","url_topic","nb_pages","posted_by","date_posted","stats_topic"])

In [11]:
df['date_posted'] = pd.to_datetime(df['date_posted'])
df['year'] = df.date_posted.apply(lambda x: x.year)

In [12]:
df['year'].value_counts().sort_index()

1970       2
2003    2092
2004    4400
2005    5952
2006    6086
2007    6507
2008    7071
2009    6769
2010    5609
2011    4486
2012    3177
2013    2359
2014    1924
2015    1358
2016    1208
2017     945
2018     647
2019       8
Name: year, dtype: int64

In [13]:
df[["replies","vues"]] = df.stats_topic.apply(lambda x: x.replace("vues","").replace("s","")).str.split("répone", expand = True)

In [14]:
df["replies"] = pd.to_numeric(df.replies.apply(lambda x: x.replace(" ","")))

In [15]:
df["vues"] = pd.to_numeric(df.vues.apply(lambda x: x.replace(" ","")))

In [23]:
sum(df.replies)

616141

In [17]:
sum(df.vues)

138001632

In [18]:
df["forum_name"].value_counts()

Français                                                  18480
Organiser, préparer et gérer une classe en élémentaire    14680
Mathématiques                                              6880
Histoire et géographie                                     4040
Sciences et technologies                                   3280
Arts visuels                                               2760
Musique                                                    2760
Langues                                                    2560
Thèmes et projets pour l'élémentaire                       2520
E.P.S.                                                     1320
Enseignement moral et civique                               760
TICE à l'école élémentaire                                  440
L'élémentaire au jour le jour                               120
Name: forum_name, dtype: int64

In [19]:
df.groupby("forum_name").sum().sort_values("replies", ascending=False).drop("year",axis=1)

Unnamed: 0_level_0,replies,vues
forum_name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Organiser, préparer et gérer une classe en élémentaire",193191,31416243
Français,130550,37033115
L'élémentaire au jour le jour,98436,7245988
Thèmes et projets pour l'élémentaire,52644,2334371
Mathématiques,42482,15391805
Histoire et géographie,21760,9003946
Arts visuels,16174,9077479
Sciences et technologies,15978,6396817
Langues,15736,5531530
Musique,14665,7902852


In [22]:
df['posted_by'].nunique()

12567

In [20]:
df.to_csv("/media/igna/Data/CRI_IA/Teacher_to_teacher/forum.csv")

https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

https://stackoverflow.com/questions/13131139/lemmatize-french-text

http://www.nltk.org/api/nltk.stem.html#module-nltk.stem.regexp