# In [None]:
'''
Scrape the best action movies from a Rotten Tomatoes guide page using the Beautiful Soup library.
'''

import requests
from bs4 import BeautifulSoup
import re
import numpy
import pandas


def parse_page():
    '''
    Download and parse the Rotten Tomatoes "140 essential action movies" guide.

    Returns:
        BeautifulSoup tree of the page, built with the lxml parser.

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    '''
    url = 'https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/'
    # requests has no default timeout: without one a stalled connection
    # would block this call forever
    response = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of parsing an error page as if
    # it were the guide
    response.raise_for_status()
    # choosing lxml parser
    soup = BeautifulSoup(response.content, 'lxml')
    return soup
    
    
def scraped_page():
    '''
    Scrape all desired elements from the parsed guide page.

    Returns:
        A list of five collections, in this order:
        [overall_rating, user_rating, year, all_titles, criticism_consensus].
        The first three hold the matched page elements (their ``.text`` is
        extracted later by the caller); the last two hold plain strings,
        one per movie item.

    Raises:
        AttributeError: if a movie item lacks the title link or the
            critics-consensus block.
    '''
    # parsed page
    page = parse_page()
    # one container element per movie
    all_movies = page.find_all('div', {'class': 'row countdown-item'})
    # NOTE(review): the next three lookups are page-wide, not per movie; they
    # line up with all_movies only if every movie carries exactly one of each
    # overall rating
    overall_rating = page.find_all('div', {'class': 'countdown-index'})
    # user rating
    user_rating = page.find_all('span', {'class': 'tMeterScore'})
    # year released
    year = page.find_all('span', {'class': 'subtle start-year'})
    # all movie names
    all_titles = [
        movie.find('div', {'class': 'article_movie_title'}).find('a').text
        for movie in all_movies
    ]
    # criticism consensus
    criticism_consensus = [
        movie.find('div', {'class': 'info critics-consensus'}).text
        for movie in all_movies
    ]

    return [overall_rating, user_rating, year, all_titles, criticism_consensus]


def text_processing(table):
    '''
    Clean the scraped columns of the movie table.

    The cells of 'year', 'overall_rating' and 'user_rating' are replaced by
    their ``.text`` with the unwanted punctuation removed; the
    "Critics Consensus:" label is removed from the string form of each cell
    in 'critics consensus description'.

    The table is modified in place and the same object is returned.
    '''
    # (column, regex character class to delete from the cell text)
    unwanted_characters = (
        ('year', "[/()]"),
        ('overall_rating', "[/()#]"),
        ('user_rating', "[/()]"),
    )
    for column, pattern in unwanted_characters:
        # bind pattern as a default so each lambda keeps its own regex
        table[column] = table[column].apply(
            lambda cell, pattern=pattern: re.sub(pattern, "", cell.text)
        )

    # drop the label that prefixes every critics consensus text
    consensus_column = 'critics consensus description'
    table[consensus_column] = table[consensus_column].apply(
        lambda cell: str(cell).replace("Critics Consensus:", "")
    )

    # note: aligning the text through DataFrame.style would turn the table
    # into a Styler object, so no styling is applied here
    return table
    

def table_of_best_movies():
    '''
    Main function: build a pandas DataFrame with all scraped movies.

    The columns are: movie_name, year, overall_rating, user_rating and
    'critics consensus description'. Rows are numbered from 1 and appear in
    the reverse of the scraped order; the cleaned table produced by
    text_processing() is returned.
    '''
    # all columns in one list, unpacked into single columns
    overall_rating, user_rating, year, all_titles, criticism_consensus = scraped_page()

    # column name -> scraped values, listed in the final column order
    scraped_columns = {
        'movie_name': all_titles,
        'year': year,
        'overall_rating': overall_rating,
        'user_rating': user_rating,
        'critics consensus description': criticism_consensus,
    }

    # creating our table
    # the module imports `pandas` / `numpy` without aliases, so the full
    # module names are used here
    best_140_movies = pandas.DataFrame(columns=list(scraped_columns))
    for name, values in scraped_columns.items():
        # reversed so that the table starts with the best rated movie
        # (assumes the page lists the movies as a countdown)
        best_140_movies[name] = values[::-1]

    # set up indices from 1
    best_140_movies.index = numpy.arange(1, len(best_140_movies) + 1)

    # processing and cleaning data - extracting element text and stripping
    # out some punctuation
    final_table = text_processing(best_140_movies)
    return final_table

    
if __name__ == '__main__':
    # when executed as a script, build the movie table and print it;
    # `result` stays bound at module level after the run
    result = table_of_best_movies()
    print(result)