In [1]:
import pandas as pd

In [2]:
# let's load and preview the data
df = pd.read_csv('data/penny_arcade.csv')
df.head()

Unnamed: 0,id,date,title,text,author,tags,comic
0,1,2003-01-01,Scott Kurtz,\nI got a lot of mail asking me to respond to ...,Johnathan Gabriel,,https://www.penny-arcade.com/comic/2003/01/01/
1,2,2003-01-03,Exhilarating,\nWe only know two people who don't live withi...,Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/03/
2,3,2003-01-04,WOO!,\nI talked to the lady from Entertainment Week...,Johnathan Gabriel,,https://www.penny-arcade.com/comic/2003/01/04/
3,4,2003-01-06,Penny Arcade is A Comedy Bistro,"\nOver at Gabe's yesterday, I mentioned that G...",Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/06/
4,5,2003-01-08,Minus The Pope And A Rabbi,"\nI thought that I had uploaded the comic, but...",Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/08/


In [3]:
# for the text column, some rows have a leading \n character
# let's remove that
df['text'] = df['text'].str.lstrip()

df.head()

Unnamed: 0,id,date,title,text,author,tags,comic
0,1,2003-01-01,Scott Kurtz,I got a lot of mail asking me to respond to Sc...,Johnathan Gabriel,,https://www.penny-arcade.com/comic/2003/01/01/
1,2,2003-01-03,Exhilarating,We only know two people who don't live within ...,Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/03/
2,3,2003-01-04,WOO!,I talked to the lady from Entertainment Weekly...,Johnathan Gabriel,,https://www.penny-arcade.com/comic/2003/01/04/
3,4,2003-01-06,Penny Arcade is A Comedy Bistro,"Over at Gabe's yesterday, I mentioned that Gui...",Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/06/
4,5,2003-01-08,Minus The Pope And A Rabbi,"I thought that I had uploaded the comic, but I...",Tycho Brahe,,https://www.penny-arcade.com/comic/2003/01/08/


In [4]:
# discard rows where author is not Tycho Brahe
df = df[df['author'] == 'Tycho Brahe']

# discard rows where text is empty
df = df[df['text'] != '']

# drop unnecessary columns: id, date, author, comic
df = df.drop(columns=['id', 'date', 'author', 'comic'])

# rebuild the index
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,title,text,tags
0,Exhilarating,We only know two people who don't live within ...,
1,Penny Arcade is A Comedy Bistro,"Over at Gabe's yesterday, I mentioned that Gui...",
2,Minus The Pope And A Rabbi,"I thought that I had uploaded the comic, but I...",
3,Jealousy,"It's fine and everything, but there's all thes...",
4,I Forgot,My man Pork mentioned this Star Wars fan-film ...,


In [5]:
# combine the title and tags columns, separated by a comma
# howerver, make sure tags != NaN
df['title'] = df['title'] + ', ' + df['tags'].fillna('')

# drop the tags column
df = df.drop(columns=['tags'])

# df.tail()
df.head()

Unnamed: 0,title,text
0,"Exhilarating,",We only know two people who don't live within ...
1,"Penny Arcade is A Comedy Bistro,","Over at Gabe's yesterday, I mentioned that Gui..."
2,"Minus The Pope And A Rabbi,","I thought that I had uploaded the comic, but I..."
3,"Jealousy,","It's fine and everything, but there's all thes..."
4,"I Forgot,",My man Pork mentioned this Star Wars fan-film ...


In [6]:
# now let's save the data
df.to_csv('data/penny_arcade_processed.csv', index=False)