In [2]:
import sqlite3
import uuid
import os
import pandas as pd

In [3]:
# Location of the SQLite database, resolved against the current working
# directory. os.path.join is used instead of string concatenation so the
# separator is right on every platform.
db_path = os.path.join(os.getcwd(), 'data', 'sens_critique_db.db')

In [4]:
# Open the SQLite database located at `db_path`; `con` is the connection handed
# to pandas for the queries, `cursor` is used for raw SQL statements.
con = sqlite3.connect(db_path)
cursor = con.cursor()

In [6]:
# Read the whole `users` table and keep a single row per
# (user_url, user_name) pair.
users = (
    pd.read_sql("select * from users", con=con)
    .drop_duplicates(subset=['user_url', 'user_name'])
)

In [7]:
# Distinct user names that already have rows in `users_movies`.
query = """
    select distinct(user_name) from users_movies
"""
exclude_users_movies_df = pd.read_sql(query, con=con)

# Keep only the users whose name does not appear in that list.
users = users.loc[~users['user_name'].isin(exclude_users_movies_df['user_name'])]

In [14]:
# How many users share each distinct value of `user_nb_notes`.
(
    users
    .sort_values(by='user_nb_notes', ascending=True)
    .groupby(['user_nb_notes'])['user_id']
    .count()
)

user_nb_notes
0        704
1        147
2         63
3         41
4         34
        ... 
27742      1
32083      1
32297      1
33372      1
33538      1
Name: user_id, Length: 4179, dtype: int64

In [20]:
# Schema of the table that stores one row per (user, movie) pair.
#
# * The primary key is (user_id, movie_id): keying on movie_id alone would let
#   a given movie be stored for a single user only.
# * The scraped columns (movie_url, movie_name, user_rating, global_rating,
#   url_critic, nb_raters) are nullable because the scraping helpers return
#   None when a field cannot be parsed, and user_rating is None for entries
#   recorded as an interest.
# * IF NOT EXISTS makes the cell safe to re-run; note that it leaves an
#   already existing table (and its schema) untouched.
query = """
CREATE TABLE IF NOT EXISTS users_movies(
    movie_id TEXT NOT NULL,
    user_id TEXT NOT NULL,
    user_url TEXT NOT NULL,
    user_name TEXT NOT NULL,
    movie_url TEXT,
    movie_name TEXT,
    user_rating INT,
    user_interest INT NOT NULL,
    global_rating FLOAT,
    url_critic TEXT,
    nb_raters INT,
    PRIMARY KEY (user_id, movie_id)
)

"""

In [21]:
# Run the statement currently held in `query` (the CREATE TABLE defined just
# above), then release the cursor. Only the cursor is closed here; `con` is
# still used by the cells that follow.
cursor.execute(query)
cursor.close()

In [5]:
# Reload the full `users` table. This rebinds `users`, so whatever filtering
# was applied to the previous value of the variable is discarded.
users = pd.read_sql("select * from users", con=con)

In [6]:
# Display the frame.
# NOTE(review): the rendered output contains real account names and profile
# URLs; consider showing `users.head()` and clearing outputs before sharing.
users

Unnamed: 0,user_id,user_url,user_name,user_nb_followers,user_nb_notes
0,10a1349a-706e-4072-94a5-779a385276f3,https://www.senscritique.com/Sergent_Pepper,Sergent_Pepper,9393,7423
1,02e2a48d-3725-4cf2-8bdf-cc346abfc813,https://www.senscritique.com/Karim_Debbache-Privé,Karim_Debbache-Privé,6915,1170
2,5ff2bb1b-5e24-4a96-afbf-caf17bb0f4c1,https://www.senscritique.com/Plug_In_Papa,Plug_In_Papa,6235,6307
3,ee3cc22b-cfc6-447f-a4a4-422aa999c4b4,https://www.senscritique.com/Inthepanda,Inthepanda,6061,2527
4,2501bfc4-e4fe-4d25-ac74-2e5483f750cd,https://www.senscritique.com/Hypérion,Hypérion,5610,3340
...,...,...,...,...,...
9995,dd59973c-6c01-4628-8948-e46be2df417b,https://www.senscritique.com/Benelie,Benelie,42,1126
9996,4032c329-264a-4b95-a6b6-6c1c0c0b3af7,https://www.senscritique.com/rgmathieu,rgmathieu,42,260
9997,4146156d-b744-4984-9ea7-81f6345b2063,https://www.senscritique.com/Robin_Gross,Robin_Gross,42,794
9998,ddd441d3-4584-416b-8a7c-a68293e64115,https://www.senscritique.com/Jahlostgang,Jahlostgang,42,1096


In [None]:
# NOTE(review): same query as the earlier load cells; looks like a leftover
# duplicate -- confirm which load is meant to be kept.
users = pd.read_sql("select * from users", con=con)

In [None]:
# Keep one row per (user_url, user_name) pair.
users = users.drop_duplicates(subset=['user_url', 'user_name'])

In [None]:
# Display the de-duplicated frame.
users

In [None]:
import time
import requests
from user_agent import generate_user_agent
from bs4 import BeautifulSoup as soup
import logging

# Set the root logger to INFO so the `logging.info` messages emitted by the
# scraping helpers below are displayed.
logging.basicConfig(level=logging.INFO)


def get_code(url, timeout=30):
    """Download a page and return its parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block forever on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The response body parsed by BeautifulSoup with the "lxml" parser.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    # Send a randomly generated desktop (mac/linux) user agent.
    headers = {'User-Agent': generate_user_agent(device_type="desktop",
                                                 os=('mac', 'linux'))}
    req = requests.get(url, headers=headers, timeout=timeout)
    # Pause one second after every request to throttle the crawl.
    time.sleep(1)
    return soup(req.text, "lxml")


In [None]:
# Name of the user stored in the first row of `users`.
user_name = users['user_name'].iloc[0]

In [None]:
# First page of this user's film collection (same URL pattern as the one
# built page by page in `generate_urls`).
url = f'https://www.senscritique.com/{user_name}/collection/all/films/all/all/all/all/all/all/all/page-1'

In [None]:
# Download and parse that page (`get_code` also sleeps one second after the request).
html_code = get_code(url=url)

In [None]:
def get_user_rating(movie_info):
    """Extract the rating the user gave to one collection entry.

    Args:
        movie_info: Parsed element of one collection item; calling it as
            ``movie_info('div', css_class)`` must return the matching nodes.

    Returns:
        The rating as an ``int``. ``1`` is returned when the rating node is
        present but empty (callers in this notebook rely on that value).
        ``None`` is returned when the node is missing or its text is not an
        integer.
    """
    try:
        # The lookup lives inside the try so that an item without a rating
        # node yields None, like every other parsing failure, instead of
        # aborting the whole page with an IndexError.
        text = movie_info('div', 'elco-collection-rating user')[0].get_text().strip()
        # An empty node is reported as 1 -- kept for backward compatibility.
        user_rating = 1 if text == '' else int(text)
    except Exception as e:
        logging.info(f'Movie - get user rating - Error: {e}')
        user_rating = None
    return user_rating


def get_user_movie_info(movie_info):
    """Read the link, title and identifier of one collection entry.

    Each field is looked up independently on the anchor of the first
    'elco-collection-content collection' div; a field that cannot be read is
    logged at INFO level and returned as ``None``.

    Args:
        movie_info: Parsed element of one collection item.

    Returns:
        Tuple ``(movie_url, movie_name, movie_id)``.
    """
    blocks = movie_info('div', 'elco-collection-content collection')

    # Every field falls back to None unless its own lookup succeeds.
    movie_url = movie_name = movie_id = None

    try:
        movie_url = 'https://www.senscritique.com' + blocks[0].a['href']
    except Exception as e:
        logging.info(f'Movie - get movie url - Error: {e}')

    try:
        movie_name = blocks[0].a.get_text()
    except Exception as e:
        logging.info(f'Movie - get movie name - Error: {e}')

    try:
        movie_id = blocks[0].a['id']
    except Exception as e:
        logging.info(f'Movie - get movie id - Error: {e}')

    return movie_url, movie_name, movie_id


def get_movie_global_rating(movie_info):
    """Extract the site-wide rating data attached to one collection entry.

    The values are read from the anchor of the first 'erra user' div: its
    text is the rating, its ``href`` the relative link to the reviews and
    its ``title`` ends with "<count> avis".

    Args:
        movie_info: Parsed element of one collection item.

    Returns:
        Tuple ``(global_rating, url_critic, nb_raters)``. Any value that
        cannot be parsed is ``None``; all three are ``None`` when the rating
        block is absent.
    """
    nodes = movie_info('div', 'erra user')
    if not nodes:
        # An item without a rating block must not abort the whole page with
        # an IndexError; report it like any other parsing failure.
        logging.info('Movie - get global rating - Error: rating block not found')
        return None, None, None
    content = nodes[0]

    try:
        global_rating = float(content.a.get_text().strip())
    except Exception as e:
        logging.info(f'Movie - get global rating - Error: {e}')
        global_rating = None

    try:
        url_critic = 'https://www.senscritique.com' + content.a['href']
    except Exception as e:
        logging.info(f'Movie - get critic url - Error: {e}')
        url_critic = None

    try:
        # Keep what follows the last ':' of the title and drop the "avis" suffix.
        nb_raters = int(content.a['title'].split(':')[-1].replace('avis', '').strip())
    except Exception as e:
        # This message used to say "get global rating" (copy-paste slip),
        # which made the two failures indistinguishable in the logs.
        logging.info(f'Movie - get nb raters - Error: {e}')
        nb_raters = None

    return global_rating, url_critic, nb_raters

In [None]:
def get_nb_pages(html_code, default_nb_pages=100):
    """Return the number of pages announced by the pager of a collection page.

    The count is read from the ``data-sc-pager-page`` attribute of the anchor
    inside the last 'eipa-page' list item.

    Args:
        html_code: Parsed page; calling it as ``html_code('li', css_class)``
            must return the matching nodes.
        default_nb_pages: Value used when the pager cannot be read (missing
            pager, missing attribute or non-numeric value).

    Returns:
        The page count as an ``int``.
    """
    try:
        # The conversion is part of the guarded block so that a non-numeric
        # attribute falls back to the default instead of raising.
        nb_page = int(html_code('li', 'eipa-page')[-1].a['data-sc-pager-page'])
    except Exception as e:
        nb_page = int(default_nb_pages)
        logging.info(f'Movie - get total nb of pages - Error: {e}. Setup {nb_page} pages by default')
    return nb_page

In [None]:
def generate_urls(user_name, html_code):
    """Build the URL of every page of a user's film collection.

    Args:
        user_name: Account name inserted into the collection URL.
        html_code: Parsed collection page, passed to `get_nb_pages` to learn
            how many pages exist.

    Returns:
        List of URLs, numbered from page-1 up to the page count.
    """
    urls = []
    for i in range(get_nb_pages(html_code=html_code)):
        urls.append(f'https://www.senscritique.com/{user_name}/collection/all/films/all/all/all/all/all/all/all/page-{i+1}')
    return urls
    
    

In [None]:
def _has_empty_rating(movie_info):
    """Tell whether the entry's rating node exists but contains no text."""
    nodes = movie_info('div', 'elco-collection-rating user')
    return bool(nodes) and nodes[0].get_text().strip() == ''


def get_all_users_movies_from_one_page(html_code):
    """Collect every movie listed on one collection page into a DataFrame.

    An entry whose rating node is empty is recorded as an interest
    (``user_interest`` = 1) with no rating; every other entry keeps the
    parsed rating and gets ``user_interest`` = 0.

    Args:
        html_code: Parsed collection page; calling it as
            ``html_code('li', css_class)`` must return the item nodes.

    Returns:
        DataFrame with one row per 'elco-collection-item' and the columns
        movie_url, movie_name, movie_id, user_rating, user_interest,
        global_rating, url_critic, nb_raters.
    """
    all_movie_info = html_code('li', 'elco-collection-item')

    users_movies_info = {'movie_url': [],
                         'movie_name': [],
                         'movie_id': [],
                         'user_rating': [],
                         'user_interest': [],
                         'global_rating': [],
                         'url_critic': [],
                         'nb_raters': [],
                         }

    for movie_info in all_movie_info:
        user_rating = get_user_rating(movie_info=movie_info)

        # The empty rating node is checked directly rather than testing
        # `user_rating == 1`: get_user_rating returns 1 both for an empty
        # node and for a genuine rating of 1, so the old test silently turned
        # every real rating of 1 into an interest with no rating.
        if _has_empty_rating(movie_info):
            users_movies_info["user_rating"].append(None)
            users_movies_info["user_interest"].append(1)
        else:
            users_movies_info["user_rating"].append(user_rating)
            users_movies_info["user_interest"].append(0)

        movie_url, movie_name, movie_id = get_user_movie_info(movie_info=movie_info)
        global_rating, url_critic, nb_raters = get_movie_global_rating(movie_info=movie_info)

        users_movies_info["movie_url"].append(movie_url)
        users_movies_info["movie_name"].append(movie_name)
        users_movies_info["movie_id"].append(movie_id)
        users_movies_info["global_rating"].append(global_rating)
        users_movies_info["url_critic"].append(url_critic)
        users_movies_info["nb_raters"].append(nb_raters)

    return pd.DataFrame(users_movies_info)
        
        

In [None]:
# NOTE(review): `df` is not defined in any cell shown here -- presumably a frame
# returned by get_all_users_movies_from_one_page; confirm, because this cell
# fails on a fresh kernel otherwise.
pd.DataFrame(df).columns