# Scraping IMDb Ratings for Community (2009)

## Import necessary libraries

In [10]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd 

## Scrape data

In [11]:
# Initialize the list that the loop will fill with one row per episode:
# [season, episode_number, title, airdate, rating, total_votes, desc]
community_episodes = []

# For every season in the series (seasons 1 through 6)
for sn in range(1, 7):
    # Request the season's episode page and store the server's response.
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = get('https://www.imdb.com/title/tt1439629/episodes?season=' + str(sn), timeout=30)

    # Fail loudly on an HTTP error instead of silently parsing an error page
    # into zero episodes for this season.
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season page
    episode_containers = page_html.find_all('div', class_='info')

    # For each episode in the season
    for episodes in episode_containers:
        # Get the info of each episode on the page
        season = sn
        episode_number = episodes.meta['content']
        title = episodes.a['title']
        airdate = episodes.find('div', class_='airdate').text.strip()
        rating = episodes.find('span', class_='ipl-rating-star__rating').text
        total_votes = episodes.find('span', class_='ipl-rating-star__total-votes').text
        desc = episodes.find('div', class_='item_description').text.strip()

        # Compile the episode info in the same order as the DataFrame columns
        episode_data = [season, episode_number, title, airdate, rating, total_votes, desc]

        # Append the episode info to the complete dataset
        community_episodes.append(episode_data)

In [12]:
# Turn the scraped rows into a labelled DataFrame, one column per scraped field
community_episodes = pd.DataFrame(
    data=community_episodes,
    columns=[
        'season',
        'episode_number',
        'title',
        'airdate',
        'rating',
        'total_votes',
        'desc',
    ],
)

# Display the resulting frame
community_episodes

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,17 Sep. 2009,7.8,"(3,182)",An ex-lawyer is forced to return to community ...
1,1,2,Spanish 101,24 Sep. 2009,7.9,"(2,755)",Jeff takes steps to ensure that Brita will be ...
2,1,3,Introduction to Film,1 Oct. 2009,8.3,"(2,695)",Brita comes between Abed and his father when s...
3,1,4,Social Psychology,8 Oct. 2009,8.2,"(2,472)",Jeff and Shirley bond by making fun of Britta'...
4,1,5,Advanced Criminal Law,15 Oct. 2009,7.9,"(2,374)",Señor Chang is on the hunt for a cheater and t...
...,...,...,...,...,...,...,...
105,6,9,Grifting 101,5 May 2015,7.8,"(1,364)","When a clever clever con man, Professor DeSalv..."
106,6,10,Basic RV Repair and Palmistry,12 May 2015,7.6,"(1,360)",While on a road trip to get rid of a Greendale...
107,6,11,Modern Espionage,19 May 2015,9.0,"(1,833)",When a secret game of paintball occurs on camp...
108,6,12,Wedding Videography,26 May 2015,8.6,"(1,564)",A marriage proposal in Jeff's law class expose...


## Data Cleaning
### Converting the total votes count to numeric

First, we create a function that removes the `,`, `(`, and `)` characters from `total_votes` so that the column can be converted to a numeric type.

In [13]:
def remove_str(votes):
    """Return `votes` with every ',', '(' and ')' character removed.

    For example, '(3,182)' becomes '3182'. The result is still a string;
    converting it to a number is left to the caller.
    """
    without_commas = votes.replace(',', '')
    without_open_paren = without_commas.replace('(', '')
    return without_open_paren.replace(')', '')

Now we apply the function to strip those characters, then change the type to int using `.astype()`.

In [14]:
# Cast to str first so this cell can be re-run safely: after the first run the
# column already holds ints, and remove_str relies on the string .replace method.
community_episodes['total_votes'] = (
    community_episodes['total_votes']
    .astype(str)
    .apply(remove_str)
    .astype(int)
)

# Preview the first rows to confirm the conversion
community_episodes.head()

Unnamed: 0,season,episode_number,title,airdate,rating,total_votes,desc
0,1,1,Pilot,17 Sep. 2009,7.8,3182,An ex-lawyer is forced to return to community ...
1,1,2,Spanish 101,24 Sep. 2009,7.9,2755,Jeff takes steps to ensure that Brita will be ...
2,1,3,Introduction to Film,1 Oct. 2009,8.3,2695,Brita comes between Abed and his father when s...
3,1,4,Social Psychology,8 Oct. 2009,8.2,2472,Jeff and Shirley bond by making fun of Britta'...
4,1,5,Advanced Criminal Law,15 Oct. 2009,7.9,2374,Señor Chang is on the hunt for a cheater and t...


### Making `rating` numeric instead of a string

In [15]:
# Ratings were scraped as text; cast the column to float
community_episodes['rating'] = community_episodes['rating'].astype(float)

### Converting the `airdate` from string to datetime

In [16]:
# Parse the scraped air date strings into pandas datetime values
airdate_parsed = pd.to_datetime(community_episodes['airdate'])
community_episodes['airdate'] = airdate_parsed

# Print the column dtypes and non-null counts after the conversions
community_episodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   season          110 non-null    int64         
 1   episode_number  110 non-null    object        
 2   title           110 non-null    object        
 3   airdate         110 non-null    datetime64[ns]
 4   rating          110 non-null    float64       
 5   total_votes     110 non-null    int32         
 6   desc            110 non-null    object        
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(3)
memory usage: 5.7+ KB


Now the data is ready for analysis and visualization!

In [17]:
# Write the cleaned dataset to CSV, leaving out the DataFrame index column
community_episodes.to_csv(
    path_or_buf='Community_Episodes_IMDb_Ratings.csv',
    index=False,
)