In [1]:
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import re
from pycountry import countries

In [2]:
htmls = []
for i in range(1,18):
    resp = urllib.request.urlopen(f"https://en.wikipedia.org/wiki/NCIS_(season_{i})")
    htmls.append(resp.read().decode())

In [3]:
h = htmls[1]
s = BeautifulSoup(h)

In [4]:
t = s.find("table", class_="wikiepisodetable")
trs = t.tbody.findAll("tr")

In [5]:
tr = trs[1]
list(tr.children)

[<th id="ep24" rowspan="1" scope="row" style="text-align:center">24</th>,
 <td style="text-align:center">1</td>,
 <td class="summary" style="text-align:left">"See No Evil"</td>,
 <td style="text-align:center"><a href="/wiki/Thomas_J._Wright" title="Thomas J. Wright">Thomas J. Wright</a></td>,
 <td style="text-align:center">Chris Crowe</td>,
 <td style="text-align:center">September 28, 2004<span style="display:none"> (<span class="bday dtstart published updated">2004-09-28</span>)</span></td>,
 <td id="pc201" style="text-align:center">201</td>,
 <td style="text-align:center">14.33<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup></td>]

In [6]:
head = ["no", "season", "no_in_season", "title", "air_date", "us_viewers", "text"]
data = {h:[] for h in head}
for i, html in enumerate(htmls):
    soup = BeautifulSoup(html)
    season = i+1
    
    table = soup.find("table", class_="wikiepisodetable")
    trs = table.tbody.findAll("tr")
    
    it = iter([["".join(td.stripped_strings) for td in tr] for tr in trs[1:]])
    for tr in it:
        data["season"].append(season)
        if len(tr) == 7 and season==1:
            data["no"].append(int(tr[0]))
        elif len(tr) == 8 or season==17:
            data["no"].append(int(tr.pop(0)))            
        else:
            raise Exception(f"Not expected: len(tr) == {len(tr)} in season {season}")
        
        data["no_in_season"].append(int(tr[0]))
        data["title"].append(tr[1])
        try:
            date = re.match(r".*\((.*)\)", tr[4]).groups()[0]
        except AttributeError:
            print(f"weird date in season {season}, no. {tr[0]}: {tr[4]}")
            date = tr[4]
        data["air_date"].append(date)
        viewers = float(re.match(r"(.*)\[.*\]", tr[-1]).groups()[0])
        data["us_viewers"].append(viewers)
        data["text"].append(next(it)[0])

In [7]:
df = pd.DataFrame(data, columns=head)
df["air_date"] = [pd.Timestamp(d) for d in df["air_date"]]
df.head()

Unnamed: 0,no,season,no_in_season,title,air_date,us_viewers,text
0,1,1,1,"""Yankee White""",2003-09-23,13.04,"While onAir Force One, a Navy commander, Ray T..."
1,2,1,2,"""Hung Out to Dry""",2003-09-30,12.08,A Marine (Brian Patrick Wade) dies during a ni...
2,3,1,3,"""Seadog""",2003-10-07,11.26,"When a driver-less boat and several bodies, in..."
3,4,1,4,"""The Immortals""",2003-10-14,11.7,The discovery of a drowned sailor in dress whi...
4,5,1,5,"""The Curse""",2003-10-28,13.5,Gibbs and the team are called in when a mummif...


In [8]:
df.dtypes

no                       int64
season                   int64
no_in_season             int64
title                   object
air_date        datetime64[ns]
us_viewers             float64
text                    object
dtype: object

In [9]:
df.to_pickle("data.pkl")